In the following notebook, we are performing the statistical summarization on the original SWAN-SF dataset.

We are calculating 5 stats - Median, Standard Deviation, Skewness, Kurtosis, and the Last Value of every feature at every instance. 

The shape of the original data was (69189, 24, 60).
After the statistical summarization it is (69189, 120), since 24 features × 5 statistics = 120.

# Imports

In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy.stats import skew
from scipy.stats import kurtosis

# Data loading

Importing the original data

In [5]:
# Directory holding the original partition pickles; change it here to relocate the data.
DATA_DIR = '../og_data'
# Partition numbers used in the file names partition<k>_data.pkl / partition<k>_labels.pkl.
PARTITIONS = (1, 2, 3, 4, 5)

# NOTE(security): pd.read_pickle can execute arbitrary code embedded in the
# file, so only load pickles that come from a trusted source.
p1_data, p2_data, p3_data, p4_data, p5_data = (
    pd.read_pickle(f'{DATA_DIR}/partition{k}_data.pkl') for k in PARTITIONS
)
p1_labels, p2_labels, p3_labels, p4_labels, p5_labels = (
    pd.read_pickle(f'{DATA_DIR}/partition{k}_labels.pkl') for k in PARTITIONS
)

# Imp Functions

Setting the column name for our old data and new data

In [109]:
# Names of the 24 variates of every instance, in the order they appear in the data.
og_columns = ['TOTUSJH','TOTBSQ','TOTPOT','TOTUSJZ','ABSNJZH','SAVNCPP','USFLUX','TOTFZ','MEANPOT','EPSZ',
              'MEANSHR','SHRGT45','MEANGAM','MEANGBT','MEANGBZ','MEANGBH','MEANJZH','TOTFY','MEANJZD',
              'MEANALP','TOTFX','EPSY','EPSX','R_VALUE']

# Suffix of each summary statistic, in output order:
# median, standard deviation, skewness, kurtosis, last value.
STAT_SUFFIXES = ('med', 'sd', 'sk', 'kt', 'lv')

# Summary column names, variate-major: all five statistics of the first
# variate, then all five of the second, and so on (24 * 5 = 120 names).
# Deriving them from og_columns keeps the two lists from drifting apart.
new_columns = [f'{variate}_{suffix}' for variate in og_columns for suffix in STAT_SUFFIXES]

The calculate_descriptive_features function takes the DataFrame of a single instance as input and returns a flat list holding the five statistics of every column.

The function takes one instance at a time and calculates the descriptive features of each column.

In [110]:
def calculate_descriptive_features(data: pd.DataFrame, variates=None) -> list:
    """Summarize every variate of one instance with five descriptive statistics.

    For each requested column the following values are computed, in this
    order: median, standard deviation (``np.std`` with its default ddof=0),
    skewness (``scipy.stats.skew``), kurtosis (``scipy.stats.kurtosis`` with
    its defaults) and the last value of the column.

    Parameters
    ----------
    data : pd.DataFrame
        One instance, with one column per variate.
    variates : sequence of str, optional
        Names of the columns to summarize. When omitted, the module-level
        ``og_columns`` list is used.

    Returns
    -------
    list
        Flat list of ``5 * len(variates)`` values, grouped per variate in the
        order listed above. With the default variates this matches the order
        of ``new_columns``.

    Raises
    ------
    KeyError
        If a requested column is missing from ``data``.
    IndexError
        If ``data`` has no rows (there is no last value to read).
    """
    if variates is None:
        variates = og_columns

    features = []
    for name in variates:
        column = data[name]
        values = column.to_numpy()
        features.extend((
            np.median(values),
            np.std(values),
            skew(values),
            kurtosis(values),
            column.iat[-1],  # last observation of the variate
        ))
    return features

The feature_extract function then iterates over the original data and passes one instance at a time to the function described above.

Finally, the feature_extract function collects the results into a new dataframe, one row per instance.

In [111]:
def feature_extract(data):
    """Build the summary-statistics table for a whole partition.

    Each element of ``data`` is turned into a DataFrame, transposed, labelled
    with ``og_columns`` and passed to ``calculate_descriptive_features``; the
    returned lists become the rows of the result.

    Parameters
    ----------
    data : iterable
        Instances to summarize. Each instance is assumed to be laid out as
        (variates x observations) so that its transpose has one column per
        entry of ``og_columns`` -- TODO confirm against the pickled data.

    Returns
    -------
    pd.DataFrame
        One row per instance, with columns ``new_columns`` and a 0..n-1 index.
    """
    # Collect plain rows and build the frame once: growing a DataFrame with
    # ``df.loc[len(df)] = row`` reallocates on every insert and becomes very
    # slow for tens of thousands of instances.
    rows = []
    for instance in data:
        frame = pd.DataFrame(instance).T
        frame.columns = og_columns
        rows.append(calculate_descriptive_features(frame))

    return pd.DataFrame(rows, columns=new_columns)

# Feature Extraction

### (Median, Standard Deviation, Skewness, Kurtosis, Last Value)

Calling the function for conversion

In [114]:
# Summarize the five partitions in order; each result keeps its pN_data_new name.
p1_data_new, p2_data_new, p3_data_new, p4_data_new, p5_data_new = (
    feature_extract(partition)
    for partition in (p1_data, p2_data, p3_data, p4_data, p5_data)
)

The resultant data

In [116]:
# Show the summarized frame for partition 1: one row per instance and one
# column per entry of new_columns (the notebook truncates the rendering).
p1_data_new

Unnamed: 0,TOTUSJH_med,TOTUSJH_sd,TOTUSJH_sk,TOTUSJH_kt,TOTUSJH_lv,TOTBSQ_med,TOTBSQ_sd,TOTBSQ_sk,TOTBSQ_kt,TOTBSQ_lv,...,EPSX_med,EPSX_sd,EPSX_sk,EPSX_kt,EPSX_lv,R_VALUE_med,R_VALUE_sd,R_VALUE_sk,R_VALUE_kt,R_VALUE_lv
0,852.387481,38.705732,0.101228,-1.151326,797.305605,1.287150e+10,6.600197e+08,0.605050,-0.306922,1.195832e+10,...,-0.117116,0.004569,-0.505426,-1.004721,-0.112960,3.817053,0.098284,-0.644924,-0.360584,3.622207
1,304.475665,152.388240,0.570083,-0.734221,684.198533,2.706526e+09,1.424560e+09,0.286870,-1.218484,5.638643e+09,...,0.197573,0.023208,0.612296,-0.782383,0.168056,3.533591,0.420787,0.056153,-1.218021,4.313935
2,1236.752144,58.471058,-0.550508,-1.187910,1237.576266,1.655824e+10,3.105103e+08,0.354933,-1.190335,1.718907e+10,...,-0.061451,0.009153,0.134660,-1.252731,-0.045646,3.912274,0.216614,-1.023225,-0.294845,3.497055
3,2566.137148,55.138312,0.650397,0.099714,2480.956293,4.379429e+10,4.283589e+08,0.075063,0.227841,4.268318e+10,...,0.081326,0.002548,-0.090068,-1.040635,0.079864,5.066600,0.023924,0.465855,-0.209067,5.098143
4,3099.997336,78.171713,0.022459,-1.341485,3125.185889,3.897834e+10,8.259602e+08,-0.066008,-1.562532,3.942829e+10,...,0.045396,0.003987,-0.872734,-0.374312,0.048454,4.845227,0.034341,-0.580117,-0.149470,4.821198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69184,299.856515,18.332085,2.934358,17.227184,265.180988,2.640377e+09,9.625033e+07,0.546508,0.525637,2.534755e+09,...,-0.246439,0.007235,0.945660,-0.226413,-0.249594,0.000000,0.000000,0.000000,-3.000000,0.000000
69185,113.043312,4.763908,0.665179,0.554269,115.289071,1.003824e+09,3.968193e+07,-0.250499,-0.987483,9.463884e+08,...,0.189659,0.011450,0.446968,-0.458830,0.192108,0.000000,0.757184,2.157446,2.654901,0.000000
69186,122.127288,3.877124,0.097178,-0.873601,122.471094,1.044857e+09,7.103432e+07,0.707642,-1.022389,1.205324e+09,...,0.284261,0.004519,0.488905,-0.684254,0.284305,0.000000,0.260405,7.550957,55.016949,0.000000
69187,9.177392,2.393275,0.513643,-0.912093,6.498207,6.042786e+07,2.007428e+07,0.381102,-1.023197,3.544967e+07,...,0.234660,0.043653,0.217991,-0.773508,0.282376,0.000000,0.000000,0.000000,-3.000000,0.000000


Saving the data to csv file

In [118]:
# Write each summarized partition to its own CSV file in the working directory.
for summary_frame, csv_name in (
    (p1_data_new, 'p1_data_new.csv'),
    (p2_data_new, 'p2_data_new.csv'),
    (p3_data_new, 'p3_data_new.csv'),
    (p4_data_new, 'p4_data_new.csv'),
    (p5_data_new, 'p5_data_new.csv'),
):
    summary_frame.to_csv(csv_name)

Reference - https://github.com/Mroussell/swan_sf