In [None]:
# Mount Google Drive when running on Colab. Outside Colab the `google.colab`
# package is not importable, so the mount is skipped instead of aborting the
# notebook on its first cell.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print( 'google.colab not available; skipping Drive mount.' )
else:
    drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import enum
import os

import sys
# sys.path.append( '/content/drive/My Drive/thesis-workspace' ) # drive version
sys.path.append( '../src/' ) # local version

from ml.preprocessing import *

# Classification

## Loading Datasets

In [None]:
class DF( enum.Enum ):
    """Keys identifying the datasets handled by this notebook.

    Each sampling window has two members: one without suffix and one
    with the `_AVG` suffix. Values are consecutive integers from 0.
    """

    # 10-second window
    _10SEC, _10SEC_AVG = 0, 1
    # 30-second window
    _30SEC, _30SEC_AVG = 2, 3
    # 1-minute window
    _1MIN, _1MIN_AVG = 4, 5

# Folder holding the dataset CSVs, given as path components.
# path = [ '.', 'drive', 'My Drive', 'thesis-workspace', 'datasets' ] # drive version
path = [ '..', 'datasets' ] # local version

# CSV file backing each dataset key.
# The keys without the `_AVG` suffix must not point at the '-avg' files,
# otherwise they are exact duplicates of their `_AVG` counterparts.
# NOTE(review): the 30-second and 1-minute non-avg file names are assumed to
# follow the 10-second naming pattern — confirm these files exist.
dataset_files = {
    DF._10SEC: 'ds-residential-10sec.csv',
    DF._10SEC_AVG: 'ds-residential-10sec-avg.csv',
    DF._30SEC: 'ds-residential-30sec.csv',
    DF._30SEC_AVG: 'ds-residential-30sec-avg.csv',
    DF._1MIN: 'ds-residential-1min.csv',
    DF._1MIN_AVG: 'ds-residential-1min-avg.csv',
}

# One DataFrame per dataset key, indexed by the parsed 'date' column.
df = {
    key: pd.read_csv(
        os.path.join( *path, filename ),
        index_col = 'date',
        parse_dates = [ 'date' ] )
    for key, filename in dataset_files.items()
}

## Data splitting, standardization and balancing

In [None]:
from imblearn.over_sampling import ADASYN
from collections import Counter


# Number of neighbors to use for balancing each dataset
# (passed as the third argument of balance_df).
neighbors = { 
    DF._10SEC: 1,
    DF._10SEC_AVG: 1,
    DF._30SEC: 1,
    DF._30SEC_AVG: 1,
    DF._1MIN: 1,
    DF._1MIN_AVG: 1
}

# Replace every raw DataFrame in `df` with its processed split: a list that
# is indexed with X_TRAIN / X_TEST / Y_TRAIN / Y_TEST by the cells below.
# Iterating over a snapshot keeps the loop independent of the in-place
# replacement of the values of `df`.
for key, frame in list( df.items() ):
    # Entries that are no longer DataFrames were already replaced by a
    # previous run of this cell; skipping them makes re-running the cell safe
    # instead of feeding an already-split list back into split_data.
    if not isinstance( frame, pd.DataFrame ):
        print( 'Skipping', key, '- already split, standardized and balanced' )
        print()
        continue

    # Hold out 20% of the rows for testing.
    parts = list( split_data( frame, test_size = 0.20 ) )
    parts[ X_TRAIN ], parts[ X_TEST ] = standardize( parts[ X_TRAIN ], parts[ X_TEST ] )

    # Only the training portion is balanced; the test portion is left as split.
    # Class counts are printed before and after for comparison.
    print( 'Before balancing:', key, Counter( parts[ Y_TRAIN ] ) )
    parts[ X_TRAIN ], parts[ Y_TRAIN ] = balance_df(
        parts[ X_TRAIN ],
        parts[ Y_TRAIN ],
        neighbors[ key ] )
    print( 'After balancing:', key, Counter( parts[ Y_TRAIN ] ) )
    print()

    df[ key ] = parts

## Training

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from ml.classification import *

# Hyper-parameter grid handed to train_and_test together with the SVC estimator.
rbf_grid = {
    # regularization parameter
    'C': [ 1, 10, 100 ],
    # linear kernels were removed; only RBF is searched
    'kernel': [ 'rbf' ],
    # how curved the separation is
    'gamma': [ 1, 10 ],
}
params_svc = [ rbf_grid ]

# One slot per dataset key, initially empty; the training cell stores the
# result of train_and_test under the corresponding key.
grids = { key: None for key in DF }

### Regular

In [None]:
%%time
# Datasets left out of this training run. Of the six DF keys, only
# DF._1MIN_AVG is not listed here, so it is the only one trained.
skipped = ( DF._10SEC, DF._10SEC_AVG, DF._30SEC, DF._30SEC_AVG, DF._1MIN )

for key, split in df.items():
    if key in skipped:
        continue

    # NOTE(review): len( split ) is the length of the per-dataset container
    # stored in `df`, not necessarily the number of samples — confirm this is
    # the figure meant to be shown.
    print( 'Dataset ', key, len( split ) )

    # Train and test an SVC (with probability estimates enabled) over
    # params_svc, and keep the returned object under the dataset's key.
    grids[ key ] = train_and_test(
        SVC( probability = True ),
        params_svc,
        split[ X_TRAIN ],
        split[ Y_TRAIN ],
        split[ X_TEST ],
        split[ Y_TEST ],
        plot_cmatrix = True )

# Evaluation

In [None]:
from ml.evaluation import *

## Learning Curves

In [None]:
%%time
# Plot learning curves only for the datasets that are not excluded below;
# the exclusion list mirrors the one used by the training cell.
excluded = ( DF._10SEC, DF._10SEC_AVG, DF._30SEC, DF._30SEC_AVG, DF._1MIN )

# ( key, data ) pairs taken from `df`, in the order `df` yields them.
temp = [ ( key, data ) for key, data in df.items() if key not in excluded ]

plot_learning_curves( temp, grids, 'SVM' )