# Data resampling  

> [https://github.com/BMClab/covid19](https://github.com/BMClab/covid19)  
> [Laboratory of Biomechanics and Motor Control](http://pesquisa.ufabc.edu.br/bmclab/)  
> Federal University of ABC, Brazil

<h1>Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span><ul class="toc-item"><li><span><a href="#Environment" data-toc-modified-id="Environment-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Environment</a></span></li><li><span><a href="#Helping-functions" data-toc-modified-id="Helping-functions-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Helping functions</a></span></li></ul></li><li><span><a href="#Load-dataset" data-toc-modified-id="Load-dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load dataset</a></span><ul class="toc-item"><li><span><a href="#Checking-for-missing-values" data-toc-modified-id="Checking-for-missing-values-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Checking for missing values</a></span></li><li><span><a href="#Basic-information-about-the-dataset" data-toc-modified-id="Basic-information-about-the-dataset-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Basic information about the dataset</a></span></li></ul></li><li><span><a href="#Data-resampling" data-toc-modified-id="Data-resampling-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Data resampling</a></span><ul class="toc-item"><li><span><a href="#Verify-data" data-toc-modified-id="Verify-data-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Verify data</a></span></li><li><span><a href="#Export-data" data-toc-modified-id="Export-data-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Export data</a></span></li><li><span><a href="#Resample-by-year-and-at-different-periods" data-toc-modified-id="Resample-by-year-and-at-different-periods-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Resample by year and at different periods</a></span></li><li><span><a href="#Test-files" data-toc-modified-id="Test-files-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Test files</a></span></li></ul></li></ul></div>

## Setup

In [None]:
import sys, os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
%load_ext watermark  

%watermark
%watermark --iversions

### Environment

In [None]:
# Folder, relative to this notebook, where the data files are read and written.
path2 = r'./../data/'

# Display floating-point numbers with four significant digits.
pd.options.display.float_format = lambda value: '%.4g' % value

### Helping functions

In [None]:
def resample(y, cat1='athlete', freq='d', observed=True):
    """Resample numerical columns of `y` and repeat the categorical columns.

    The numerical columns are summed over periods of length `freq`, separately
    for each value of the categorical column `cat1`. The other categorical
    columns are not aggregated: the values found at the first occurrence of
    each `cat1` value are attached again to the resampled rows.

    Parameters
    ----------
    y : pandas.DataFrame
        Data with a datetime index, a categorical column named `cat1` and
        possibly other categorical columns. `y` is not modified and its index
        does not need to be sorted.
    cat1 : str, optional (default = 'athlete')
        Name of the categorical column used, together with the period, as
        grouping key.
    freq : str, optional (default = 'd')
        Resampling period. Periods are anchored at January 1st of the year of
        the earliest timestamp. The following values (case-insensitive)
        receive a specific treatment:

        - 'd'  : the sums are returned as they are.
        - '7d' : the `dayofyear % 7` trailing days before the latest date are
          merged into the preceding 7-day period, to avoid a last period
          shorter than 7 days; the sums of the last period are then
          multiplied by 7 / (7 + number of trailing days).
        - 'm'  : the sums are multiplied by 30 / (number of days in the month).
        - 'y'  : the sums are multiplied by 365 / (number of days in the year).

        For any other value only the sum is reported and a message is printed.
    observed : bool, optional (default = True)
        Passed to `DataFrame.groupby`.

    Returns
    -------
    y : pandas.DataFrame
        Resampled data indexed by period, with the column `cat1`, the
        numerical columns and the other categorical columns.
    nidx0 : pandas.Series
        Number of rows of the resampled data in each period.
    nidx1 : pandas.Series
        Number of rows of the resampled data for each value of `cat1`.

    Raises
    ------
    ValueError
        If `cat1` is not a categorical column of `y`.
    """
    period = freq.lower()
    # numerical columns
    cols_num = y.select_dtypes(include='number').columns.to_list()
    # categorical columns
    cols_cat = y.select_dtypes(include='category').columns.to_list()
    if cat1 not in cols_cat:
        raise ValueError('`{}` must be a categorical column of `y`.'.format(cat1))
    # lookup table (one row per `cat1` value) with the other categorical columns
    cats = None
    if len(cols_cat) > 1:
        cats = (y.drop_duplicates(subset=cat1)[cols_cat]
                 .sort_values(cat1)
                 .set_index(cat1))
    first, last = y.index.min(), y.index.max()
    # change the dates of the last days to avoid a last week with less than 7 days
    nlastdays = 7 + last.dayofyear % 7
    if period == '7d' and nlastdays > 7:
        ts = last.normalize() - pd.Timedelta(days=nlastdays - 7)
        # shallow copy: the index is replaced without touching the caller's
        # dataframe and without duplicating the data
        y = y.copy(deep=False)
        y.index = y.index.where(y.index <= ts, ts)
    t0 = pd.Timestamp(year=first.year, month=1, day=1)
    grouper = pd.Grouper(freq=freq, sort=True, origin=t0)
    # resample only the numerical columns; numeric_only=True is required
    # because recent pandas versions no longer drop the categorical columns
    y = (y.groupby([grouper, cat1], sort=True, observed=observed)
          .sum(numeric_only=True)
          .reset_index(level=1))
    # calculate accumulated runs for numerical columns based on freq period
    # just correct for differences in length of week, month or year
    if period == '7d':
        # correct the divisor if the last week doesn't have 7 days
        is_last = y.index == y.index[-1]
        y.loc[is_last, cols_num] = y.loc[is_last, cols_num] * (7 / nlastdays)
    elif period == 'm':  # faster than using apply if not too many months
        for year in y.index.year.unique().astype(str):
            for month in y.loc[year].index.month.unique().astype(str):
                date = '{}-{}'.format(year, month)
                ndays = pd.Period(date).daysinmonth
                y.loc[date, cols_num] = y.loc[date, cols_num] * (30 / ndays)
    elif period == 'y':
        for year in y.index.year.unique().astype(str):
            ndays = 366 if pd.Period(year).is_leap_year else 365
            y.loc[year, cols_num] = y.loc[year, cols_num] * (365 / ndays)
    elif period != 'd':
        print('No averages are calculated. Only the sum is reported.')
    # number of elements in each index level at each freq period
    nidx0 = y.groupby(level=0, observed=True).size()
    nidx1 = y.groupby(cat1, observed=True).size()
    # add back the categorical columns
    if cats is not None:
        y = y.join(cats, on=cat1)

    return y, nidx0, nidx1

## Load dataset

In [None]:
# Read the 2019-2020 dataset from the data folder.
dataset_file = os.path.join(path2, 'run_ww_2019_2020.parquet')
df = pd.read_parquet(dataset_file)
# Optional column subset, kept disabled as in the original notebook:
#df = df[['datetime', 'athlete', 'gender', 'age_group', 'distance', 'duration']]
# Cast the athlete column to category (the original note attributes the need
# for this cast to a parquet bug).
df['athlete'] = df['athlete'].astype('category')
df

### Checking for missing values

In [None]:
# `n` stays 0 unless at least one column contains a null entry.
n = 0
for col in df.columns:
    rows_with_null = df[col].isnull()
    # athletes that have a null entry in this column
    null = df.loc[rows_with_null, 'athlete'].unique().tolist()
    if not null:
        continue
    print('Athlete: {}, null value in {}'.format(null, col))
    n = 1
if not n:
    print('No missing values found.')

### Basic information about the dataset

In [None]:
# Column names, dtypes, non-null counts and the deep memory usage of the dataset.
df.info(memory_usage='deep')

In [None]:
# Distinct calendar days, distinct athletes and number of rows of the dataset.
nday = len(df['datetime'].dt.date.value_counts())
nathlete = len(df['athlete'].unique())
nactivity = len(df)
print('Number of days:', nday)
print('Number of athletes:', nathlete)
print('Number of running activities:', nactivity)

## Data resampling

Resample data using a custom function to speed up the process.  
This process consumes about 10 GB of RAM at peak. The process could be divided by year, but because we want to generate all possible categories for days and athletes, we would first have to fill each year with all athletes.

In [None]:
# Daily resampling over the whole dataset; observed=False is passed so that
# unobserved athlete categories are kept for each day.
df = df.set_index('datetime')
df, nathletes, nruns = resample(df, cat1='athlete', freq='d', observed=False)
# Bring the dates back as a regular `datetime` column.
df = df.reset_index()

### Verify data

We can see that the Cartesian product of days and athletes was performed and the distance and duration columns were filled with zeros when there was no record for an athlete on a given day:

In [None]:
# Display the daily resampled dataframe.
df

In [None]:
# Number of athletes times number of days, to compare with the number of rows above.
nathlete * nday

In [None]:
# First count series returned by `resample`: number of rows in each day.
nathletes

In [None]:
# Number of athletes per gender (rows) and age group (columns),
# counting each athlete only once.
(
    df.drop_duplicates(subset='athlete')
      .loc[:, ['athlete', 'age_group', 'gender']]
      .groupby(['age_group', 'gender'])
      .count()
      .unstack(level=0)
)

In [None]:
# Descriptive statistics of the data grouped by the year of `datetime`.
df.groupby([df['datetime'].dt.year]).describe()

In [None]:
# Descriptive statistics grouped by year of `datetime`, age group and gender.
df.groupby([df['datetime'].dt.year, 'age_group', 'gender']).describe()

### Export data

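The data are saved in the Parquet format. See the Apache Arrow docs on [Parquet](https://arrow.apache.org/docs/python/parquet.html) and on the related [Feather](https://arrow.apache.org/docs/python/feather.html) format, and a comparison on [formats to save Pandas data](https://towardsdatascience.com/the-best-format-to-save-pandas-data-414dca023e0d).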

In [None]:
# Write the daily resampled dataframe to a parquet file in the data folder.
df.to_parquet(os.path.join(path2, 'run_ww_2019_2020_d.parquet'))

### Resample by year and at different periods

Now that we have a DataFrame with all possible categories for athletes on all days (from the Cartesian product between days and athletes, setting the parameter `observed` to `False` in the `resample` function), we can resample the dataset separately by year and at different periods.

In [None]:
years = ['2019', '2020']
freqs = ['d', '7d', 'm']
# `resample` expects the dates in the index.
df = df.set_index('datetime')
for year in tqdm(years, desc='Year'):
    for freq in tqdm(freqs, desc='Freq'):
        # Take a fresh slice of the year for every period and keep only the
        # resampled dataframe (first returned element).
        dfi = resample(df.loc[year], cat1='athlete', freq=freq, observed=True)[0]
        dfi = dfi.reset_index()
        # Files with the 7-day period are identified by 'w' in the file name.
        tag = 'w' if freq == '7d' else freq
        dfi.to_parquet(os.path.join(path2, 'run_ww_{}_{}.parquet'.format(year, tag)))

### Test files

In [None]:
# Read back every exported file and show the same summaries used to verify
# the daily data.
for year in years:
    for freq in freqs:
        # Files with the 7-day period are identified by 'w' in the file name.
        tag = 'w' if freq == '7d' else freq
        df = pd.read_parquet(os.path.join(path2, 'run_ww_{}_{}.parquet'.format(year, tag)))
        df['athlete'] = df['athlete'].astype('category')  # bug in parquet
        print('\nFile: run_ww_{}_{}.parquet'.format(year, tag))
        # number of athletes per gender and age group
        athletes = df.drop_duplicates(subset='athlete')
        counts = (athletes.loc[:, ['athlete', 'age_group', 'gender']]
                          .groupby(['age_group', 'gender'])
                          .count()
                          .unstack(level=0))
        display(counts)
        # descriptive statistics per year, age group and gender
        grouped = df.groupby([df['datetime'].dt.year, 'age_group', 'gender'])
        display(grouped.describe())