In [4]:
# Load the dotenv IPython extension and run its %dotenv magic (python-dotenv:
# reads a .env file into the process environment) so the os.getenv() lookups
# in this notebook, e.g. SRC_DIR below, can pick up project settings.
%load_ext dotenv
%dotenv 
import os
import sys
# Put the project's source directory on the import path; falls back to the
# relative path '../../05_src' when SRC_DIR is not set.
sys.path.append(os.getenv('SRC_DIR', '../../05_src'))
# Project-local logger helper -- presumably resolved from the directory added
# to sys.path above, so this import must stay after the append.
from utils.logger import get_logger
_logs = get_logger(__name__)

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [5]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os
from glob import glob

In [6]:
# Directory holding the feature parquet files; FEATURES_DATA overrides the
# default relative path.
ft_dir = os.getenv("FEATURES_DATA", '../../05_src/data/feature')
# glob() returns paths in arbitrary, filesystem-dependent order. Sort them so
# the file list (and therefore the order files are handed to Dask) is the same
# on every machine and every run.
ft_glob = sorted(glob(os.path.join(ft_dir, '**/*.parquet'),
                      recursive = True))
if not ft_glob:
    # Fail early with an actionable message instead of an opaque error from
    # read_parquet when the directory is wrong or empty.
    raise FileNotFoundError(
        f"No parquet files found under {ft_dir!r}; check FEATURES_DATA."
    )
# Read with Dask, materialise the result as a pandas DataFrame, and move the
# index into a regular column.
df = dd.read_parquet(ft_glob).compute().reset_index()

# Sampling in Python

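+ Several Python packages provide sampling utilities (for example, the standard library's `random` module, NumPy, pandas, and Dask).
+ Because our data already lives in data frames, a practical approach is to use the sampling methods built into pandas and Dask.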

## Random Sampling

+ Sample n rows from a dataframe with [`df.sample()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html).

```
DataFrame.sample(
    n=None, frac=None, replace=False, weights=None, 
    random_state=None, axis=None, ignore_index=False
    )
```

In [7]:
df.sample(n = 5)

Unnamed: 0,ticker,Date,Open,High,Low,Close,Adj Close,Volume,source,Year,Close_lag_1
87629,DGX,2016-02-22,66.830002,67.739998,66.510002,66.699997,61.205196,819600.0,DGX.csv,2016,66.309998
289214,TNC,1975-10-07,0.0,3.375,3.125,3.125,0.37308,4000.0,TNC.csv,1975,3.0
32576,BRQS,2016-08-22,10.16,10.16,10.16,10.16,10.16,1000.0,BRQS.csv,2016,10.16
332977,WST,1992-09-16,5.46875,5.46875,5.46875,5.46875,1.265519,1200.0,WST.csv,1992,5.4375
297403,TNC,2008-03-19,39.700001,39.700001,37.060001,37.060001,30.684294,177400.0,TNC.csv,2008,39.310001


In [None]:
import random

# Keep the demo small: work with a reproducible subset of at most 30 tickers.
random.seed(42)
# Sort before sampling: unique() follows row order, which depends on the order
# in which the files were read, so the seed alone does not pin the draw.
# dropna() avoids comparing NaN against strings while sorting.
all_tickers = sorted(df['ticker'].dropna().unique().tolist())
# random.sample raises ValueError when asked for more items than exist, so cap
# the request at the number of available tickers.
sample_tickers = random.sample(all_tickers, min(30, len(all_tickers)))
# NOTE: `df` is narrowed to the selected tickers here; the following cells
# operate on this narrowed frame.
df = df[df['ticker'].isin(sample_tickers)]
# Simple random sample of 10% of the rows; random_state makes it repeatable.
simple_sample_dt = df.sample(frac = 0.1, random_state = 42)
simple_sample_dt.shape, df.shape

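Look at the distribution of tickers, first in the data and then in the simple random sample. A simple random sample only approximately preserves each ticker's share of the rows.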

In [None]:
# Number of rows per ticker in the working data frame. Title and axis labels
# let the figure stand alone when compared with the sample plots.
ax = df['ticker'].value_counts().plot(kind='bar', title='Rows per ticker: all rows')
# The trailing semicolon suppresses the text repr of the return value.
ax.set(xlabel='ticker', ylabel='rows');

In [None]:
# Number of rows per ticker in the 10% simple random sample. Title and axis
# labels let the figure stand alone when compared with the other plots.
ax = simple_sample_dt['ticker'].value_counts().plot(
    kind='bar', title='Rows per ticker: simple random sample (10%)'
)
# The trailing semicolon suppresses the text repr of the return value.
ax.set(xlabel='ticker', ylabel='rows');

## Stratified Sampling

+ Use `groupby()` and `.sample()` for stratified sampling.

In [None]:
# Stratified sample: draw 10% of the rows within each ticker, so every ticker
# keeps its share of the data. random_state makes the draw repeatable.
strat_sample_dt = df.groupby(['ticker']).sample(frac = 0.1, random_state = 42)
ax = strat_sample_dt['ticker'].value_counts().plot(
    kind='bar', title='Rows per ticker: stratified sample (10% per ticker)'
)
# The trailing semicolon suppresses the text repr of the return value.
ax.set(xlabel='ticker', ylabel='rows');

# Sampling in Dask

+ Stratified sampling in `dask` can be achieved with `groupby().apply()` and a lambda function.

In [None]:
# Lazily open the same parquet files as a Dask DataFrame.
dd_dt = dd.read_parquet(ft_glob)
# Stratified sample with Dask: apply a per-group pandas sample.
# - random_state makes each group's draw repeatable across runs. The same seed
#   is reused for every group, which is fine for independent within-group
#   sampling.
# - meta declares the output schema up front (same columns and dtypes as the
#   input, since each group returns a subset of its own rows), so Dask does not
#   have to infer it and does not emit the "meta is not specified" warning.
strat_sample_dd = (dd_dt
                      .groupby('ticker', group_keys=False)
                      .apply(lambda x: x.sample(frac = 0.1, random_state = 42),
                             meta = dd_dt.dtypes.to_dict())
                      .compute()
                      .reset_index())
# The Dask sample covers every ticker in the files; plot only the tickers
# selected earlier so the figure is comparable with the pandas plots.
in_subset = strat_sample_dd['ticker'].isin(sample_tickers)
ax = strat_sample_dd.loc[in_subset, 'ticker'].value_counts().plot(
    kind='bar', title='Rows per ticker: Dask stratified sample (10% per ticker)'
)
# The trailing semicolon suppresses the text repr of the return value.
ax.set(xlabel='ticker', ylabel='rows');