In [105]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from itertools import combinations
from tqdm.notebook import tqdm

In [106]:
# Datafile
df = pd.read_csv("./data_table_NA850_nonfiltered_updated.csv")
df.head()

Unnamed: 0,AirTemp,QV,Omega,SeaLevPress,UWinds,VWinds,LH,Time,Lat,Lon
0,7.060846,7.267001,0.000642,1021.565625,-11.894431,-5.276164,12.88808,2000010100,25.064459,-35.368896
1,7.100183,7.474899,-0.000296,1021.084141,-11.230261,-3.172798,-10.21712,2000010103,24.480576,-36.771637
2,7.068231,7.335827,0.000323,1020.302969,-10.246526,-1.993664,0.511768,2000010106,24.649042,-37.882874
3,7.328638,7.064822,0.000828,1020.585156,-9.889462,-1.943067,-16.420087,2000010109,24.122194,-38.753021
4,7.598169,7.333828,0.000373,1022.448281,-10.461138,-1.204948,-6.642615,2000010112,24.368916,-40.022919


In [107]:
desc_corr = abs(df.corr('kendall'))[['LH']].sort_values(by='LH', ascending=False)
desc_corr

Unnamed: 0,LH
LH,1.0
Omega,0.455173
QV,0.406391
Lon,0.170593
SeaLevPress,0.140413
Lat,0.131128
AirTemp,0.121293
Time,0.116621
UWinds,0.041764
VWinds,0.007098


In [108]:
LQ = df.LH.quantile(0.25)
UQ = df.LH.quantile(0.75)
bound = 1.5 * (UQ - LQ) #Whisker length * IQR
lower_bound = LQ - bound
upper_bound = UQ + bound
df["Extreme"] = np.where( (df["LH"] > upper_bound) | (df["LH"]  < lower_bound), 1, 0)

In [109]:
data_Y = df["Extreme"]
data_X = df.drop(["Extreme", "LH"], axis=1)

In [110]:
data_X

Unnamed: 0,AirTemp,QV,Omega,SeaLevPress,UWinds,VWinds,Time,Lat,Lon
0,7.060846,7.267001,0.000642,1021.565625,-11.894431,-5.276164,2000010100,25.064459,-35.368896
1,7.100183,7.474899,-0.000296,1021.084141,-11.230261,-3.172798,2000010103,24.480576,-36.771637
2,7.068231,7.335827,0.000323,1020.302969,-10.246526,-1.993664,2000010106,24.649042,-37.882874
3,7.328638,7.064822,0.000828,1020.585156,-9.889462,-1.943067,2000010109,24.122194,-38.753021
4,7.598169,7.333828,0.000373,1022.448281,-10.461138,-1.204948,2000010112,24.368916,-40.022919
...,...,...,...,...,...,...,...,...,...
1727,15.948999,11.653900,-0.000827,1013.982500,-11.293950,-0.528353,2000121718,16.255468,-65.481140
1728,16.130365,11.756897,-0.001596,1013.459063,-9.442187,-0.190940,2000121721,15.835272,-66.934265
1729,16.080194,12.042999,-0.001654,1015.073047,-9.237350,-0.138638,2000121800,15.924951,-67.460663
1730,15.767419,10.295868,-0.000208,1015.828281,-10.230459,-0.086515,2000121803,16.759212,-67.945740


In [111]:
res_combined = []
for index in range(2, len(desc_corr)):
    X_train, X_test, y_train, y_test = train_test_split(data_X[desc_corr.index[1:index]], data_Y, test_size=0.25, random_state=738)
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    res_combined.append(f1_score(y_test, y_pred))
res_combined

[0.4210526315789474,
 0.509090909090909,
 0.4695652173913044,
 0.5384615384615384,
 0.5384615384615384,
 0.5132743362831858,
 0.5641025641025641,
 0.5818181818181819]

In [112]:
combs = []
for x in range(2, len(desc_corr)):
    combs += [list(y) for y in combinations(desc_corr.index[1:], x)]

In [113]:
# Keys will be index from combs list, Values will be F1 score
res_comb = []
for index in range(len(combs)):
    X_train, X_test, y_train, y_test = train_test_split(data_X[combs[index]], data_Y, test_size=0.25, random_state=738)
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    res_comb.append(f1_score(y_test, y_pred))

In [114]:
best_index = res_comb.index(max(res_comb))
print("Best F1 score of %f using\n%s" % (res_comb[best_index], str(combs[best_index])))

Best F1 score of 0.609524 using
['Omega', 'Lon', 'Lat', 'AirTemp', 'Time']


In [115]:
# Keys will be index from combs list, Values will be F1 score
res_comb = []
for index in range(len(combs)):
    X_train, X_test, y_train, y_test = train_test_split(data_X[combs[index]], data_Y, test_size=0.25, random_state=738)
    dt = DecisionTreeClassifier(max_depth=10)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    res_comb.append(f1_score(y_test, y_pred))

In [116]:
best_index = res_comb.index(max(res_comb))
print("Best F1 score of %f using\n%s" % (res_comb[best_index], str(combs[best_index])))

Best F1 score of 0.631579 using
['Omega', 'Lon', 'SeaLevPress', 'AirTemp', 'Time', 'UWinds']


In [117]:
# Keys will be index from combs list, Values will be F1 score
res_comb = []
for index in range(len(combs)):
    X_train, X_test, y_train, y_test = train_test_split(data_X[combs[index]], data_Y, test_size=0.25, random_state=738)
    dt = DecisionTreeClassifier(criterion="entropy")
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    res_comb.append(f1_score(y_test, y_pred))

In [118]:
best_index = res_comb.index(max(res_comb))
print("Best F1 score of %f using\n%s" % (res_comb[best_index], str(combs[best_index])))

Best F1 score of 0.646465 using
['Omega', 'Lon', 'SeaLevPress', 'Lat', 'AirTemp']


In [123]:
# Keys will be index from combs list, Values will be F1 score
res_comb = []
for index in range(len(combs)):
    X_train, X_test, y_train, y_test = train_test_split(data_X[combs[index]], data_Y, test_size=0.25, random_state=738)
    dt = DecisionTreeClassifier(criterion="entropy", max_depth=9)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    res_comb.append(f1_score(y_test, y_pred))

In [124]:
best_index = res_comb.index(max(res_comb))
print("Best F1 score of %f using\n%s" % (res_comb[best_index], str(combs[best_index])))

Best F1 score of 0.645833 using
['Omega', 'QV', 'Lon', 'SeaLevPress', 'Lat', 'AirTemp', 'UWinds', 'VWinds']
