In [3]:
# external imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, f1_score, accuracy_score
from sklearn import svm
from imblearn.under_sampling import RandomUnderSampler
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter


# internal imports
import sys
sys.path.insert(0,'../')
from data_utils import add_deltas_and_time, scale_data, column_combinations, add_extreme, add_lifetime

In [5]:
data_850 = pd.read_csv('./NA850data_2000-2014.csv')
data_w_extreme = add_extreme(data_850)
data_w_time_deltas = add_deltas_and_time(data_w_extreme)
data_w_lifetime = add_lifetime(data_w_time_deltas)

In [6]:
# get scaled provided data with deltas and time added
X = data_w_lifetime.drop(columns=['LH', 'dLH', "Extreme", "dExtreme"])
X = scale_data(X)

y = data_w_lifetime['LH']
y_extreme = data_w_lifetime['Extreme']

X.head()

Unnamed: 0_level_0,AirTemp,QV,Omega,SeaLevPress,UWinds,VWinds,Lat,Lon,PtIndex,WaveTrajectory,dAirTemp,dQV,dOmega,dSeaLevPress,dUWinds,dVWinds,dLat,dLon,LifeTime
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2000-01-01 03:00:00,-6.088163,-0.910811,0.080339,2.180997,-0.71867,-1.008888,1.243533,0.004734,-0.954545,-1.168067,0.091148,0.195511,-0.799627,-0.251402,0.394266,1.285404,-1.218041,-1.279085,-1.5
2000-01-01 06:00:00,-6.107434,-0.943091,0.476626,1.96644,-0.527986,-0.698457,1.26411,-0.049866,-0.909091,-1.168067,-0.03544,-0.160021,0.530617,-0.388836,0.580967,0.711946,0.271371,-0.790392,-1.5
2000-01-01 09:00:00,-5.950378,-1.005994,0.800095,2.043946,-0.458774,-0.685136,1.199761,-0.09262,-0.863636,-1.168067,0.483702,-0.295208,0.433411,0.098812,0.214843,0.011722,-1.105129,-0.386217,-1.5
2000-01-01 12:00:00,-5.787819,-0.943555,0.508682,2.555672,-0.569586,-0.490811,1.229895,-0.155015,-0.818182,-1.168067,0.499905,0.258125,-0.387412,0.823818,-0.327761,0.438309,0.42629,-1.05638,-1.25
2000-01-01 15:00:00,-5.531023,-0.874945,0.388518,2.123726,-0.611146,-0.45227,1.246959,-0.213574,-0.772727,-1.168067,0.777359,0.28537,-0.158807,-0.751805,-0.119031,0.07116,0.214437,-0.925509,-1.25


In [7]:
undersampler = RandomUnderSampler(sampling_strategy=1, random_state=5)
X_under, y_extreme_under = undersampler.fit_resample(X, y_extreme)
y_under = y[undersampler.sample_indices_]

In [8]:
LQ = data_850.LH.quantile(0.25)
UQ = data_850.LH.quantile(0.75)
bound = 1.5 * (UQ - LQ)  # Whisker length * IQR
lower_bound = LQ - bound
upper_bound = UQ + bound

def classify_extremes(y_pred):
    return np.where((y_pred > upper_bound) | (y_pred < lower_bound), 1, 0)

In [9]:
preselected_features = ["AirTemp", "QV", "Omega", "VWinds"]
features_to_select = [feature for feature in X.columns if feature not in preselected_features]

features_to_select

['SeaLevPress',
 'UWinds',
 'Lat',
 'Lon',
 'PtIndex',
 'WaveTrajectory',
 'dAirTemp',
 'dQV',
 'dOmega',
 'dSeaLevPress',
 'dUWinds',
 'dVWinds',
 'dLat',
 'dLon',
 'LifeTime']

In [42]:
clf_rbf = svm.SVC(kernel = 'rbf', C = 10)

X_train, X_test, y_train, y_test, y_ex_train, y_ex_test = \
    train_test_split(X_under, y_under, y_extreme_under, test_size=0.2, random_state=2)

# initialize metric list
log_csv = []

# test over all combinations of a maximum length
for comb in tqdm(column_combinations(features_to_select, max_len=5), desc=f"Training and Testing..."):
    # train and test inputs
    features_selected = preselected_features + comb
    X_train_feat = X_train[features_selected]
    X_test_feat = X_test[features_selected]

    # fit and predict train and test
    clf_rbf.fit(X_train_feat, y_ex_train)
    y_pred_train = clf_rbf.predict(X_train_feat)
    y_pred_test = clf_rbf.predict(X_test_feat)

    # log training and testing metrics to list
    log_csv.append(
        [features_selected,
        mean_squared_error(y_train, y_pred_train),  # train MSE
        mean_squared_error(y_test, y_pred_test),  # test MSE
        r2_score(y_train, y_pred_train),  # train r^2
        r2_score(y_test, y_pred_test),  # test r^2
        accuracy_score(y_ex_train, y_pred_ex_train),  # train accuracy
        accuracy_score(y_ex_test, y_pred_ex_test),  # test accuracy
        f1_score(y_ex_train, y_pred_train),  # train f1
        f1_score(y_ex_test, y_pred_test)]  # test f1
    )

Training and Testing...:   0%|          | 0/4943 [00:00<?, ?it/s]

In [43]:
log_csv_sorted = sorted(log_csv, key=lambda l: l[-1], reverse=True)  # sort by max testing f1
# convert to dataframe and write to file
log_df = pd.DataFrame(log_csv_sorted, columns=["feature_combination", "train_MSE",
                                               "test_MSE", "train_r^2", "test_r^2", "train_acc",
                                               "test_acc", "train_F1", "test_F1"])
log_df.to_csv("SVM_results.csv", sep="|", index=False)

## Different Cost Values

In [58]:
log_cost_csv = []

for entry in tqdm(log_csv_sorted[:10], desc="Training and Testing..."):
    feats = entry[0]
    
    for cost in [x**2 for x in np.arange(1, 11)]:
        clf_rbf = svm.SVC(kernel = 'rbf', C = cost)
        
        # train and test inputs
        X_train_feat = X_train[feats]
        X_test_feat = X_test[feats]

        # fit and predict train and test
        clf_rbf.fit(X_train_feat, y_ex_train)
        y_pred_train = clf_rbf.predict(X_train_feat)
        y_pred_test = clf_rbf.predict(X_test_feat)

        # log training and testing metrics to list
        log_cost_csv.append(
            [cost, feats,
            mean_squared_error(y_train, y_pred_train),  # train MSE
            mean_squared_error(y_test, y_pred_test),  # test MSE
            r2_score(y_train, y_pred_train),  # train r^2
            r2_score(y_test, y_pred_test),  # test r^2
            accuracy_score(y_ex_train, y_pred_ex_train),  # train accuracy
            accuracy_score(y_ex_test, y_pred_ex_test),  # test accuracy
            f1_score(y_ex_train, y_pred_train),  # train f1
            f1_score(y_ex_test, y_pred_test)]  # test f1
        )

Training and Testing...:   0%|          | 0/10 [00:00<?, ?it/s]

In [59]:
log_cost_csv_sorted = sorted(log_cost_csv, key=lambda l: l[-1], reverse=True)  # sort by max testing f1
# convert to dataframe and write to file
log_cost_df = pd.DataFrame(log_cost_csv_sorted, columns=["cost", "feature_combination", "train_MSE",
                                               "test_MSE", "train_r^2", "test_r^2", "train_acc",
                                               "test_acc", "train_F1", "test_F1"])
log_cost_df.to_csv("SVM_cost_results.csv", sep="|", index=False)

In [60]:
log_cost_df[:10]

Unnamed: 0,cost,feature_combination,train_MSE,test_MSE,train_r^2,test_r^2,train_acc,test_acc,train_F1,test_F1
0,9,"[AirTemp, QV, Omega, VWinds, UWinds, Lon, dAir...",1095.601421,1246.658146,-0.467555,-0.472255,0.507843,0.468652,0.886648,0.914373
1,25,"[AirTemp, QV, Omega, VWinds, UWinds, Lat, Lon,...",1095.33555,1246.760833,-0.467199,-0.472376,0.507843,0.468652,0.901002,0.913505
2,9,"[AirTemp, QV, Omega, VWinds, UWinds, Lat, Lon,...",1095.499462,1246.505444,-0.467418,-0.472075,0.507843,0.468652,0.88764,0.913242
3,16,"[AirTemp, QV, Omega, VWinds, UWinds, Lat, Lon,...",1095.40928,1246.836508,-0.467297,-0.472466,0.507843,0.468652,0.899799,0.913242
4,36,"[AirTemp, QV, Omega, VWinds, UWinds, Lat, Lon,...",1095.344062,1246.637609,-0.46721,-0.472231,0.507843,0.468652,0.906325,0.912121
5,9,"[AirTemp, QV, Omega, VWinds, UWinds, Lon, dAir...",1095.551309,1246.482779,-0.467488,-0.472048,0.507843,0.468652,0.885233,0.91047
6,9,"[AirTemp, QV, Omega, VWinds, UWinds, Lat, Lon,...",1095.486648,1246.519852,-0.467401,-0.472092,0.507843,0.468652,0.887997,0.91047
7,9,"[AirTemp, QV, Omega, VWinds, UWinds, Lat, Lon,...",1095.685539,1246.876824,-0.467667,-0.472513,0.507843,0.468652,0.893393,0.909648
8,9,"[AirTemp, QV, Omega, VWinds, UWinds, Lon, PtIn...",1095.501243,1246.402874,-0.46742,-0.471954,0.507843,0.468652,0.885954,0.909366
9,16,"[AirTemp, QV, Omega, VWinds, UWinds, Lon, dAir...",1095.450704,1246.475058,-0.467353,-0.472039,0.507843,0.468652,0.889067,0.909091
