# Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import math
import pickle

import warnings
warnings.filterwarnings("ignore")

# Train-Test
from sklearn.model_selection import train_test_split

# Normalization
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer

# Feature selection
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE

# Classification models
from sklearn.dummy import DummyClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

from sklearn import metrics

## Load data from pickle file

In [2]:
df_train_processed = pd.read_pickle('./data/df_train_processed.pkl')

open_file = open('./data/param_dict.pkl', "rb")
param_dict = pickle.load(open_file)
open_file.close()

df_test_processed = pd.read_pickle('./data/df_test_processed.pkl')

In [3]:
# X_train = df_train_processed.drop(['RainTomorrow','RainfallTomorrow'],axis=1)
# y_train = df_train_processed.RainTomorrow

# X_test = df_test_processed.drop(['RainTomorrow','RainfallTomorrow'],axis=1)
# y_test = df_test_processed.RainTomorrow

# Split into train/cv
X_train, X_cv, y_train, y_cv = train_test_split(
    df_train_processed.drop(['RainTomorrow','RainfallTomorrow'], axis=1),
    df_train_processed['RainTomorrow'],
    test_size=0.1,
    random_state=0)

In [4]:
def print_results(y,pred):
    print("accuracy = "+str(metrics.accuracy_score(y, pred)))
    print("precision = "+str(metrics.precision_score(y, pred)))
    print("recall = "+str(metrics.recall_score(y, pred)))
    print("f1_score = "+str(metrics.f1_score(y, pred)))
    print("\nconfusion matrix:")
    print(metrics.confusion_matrix(y, pred))
    # print(metrics.classification_report(y_cv, predictions))

## Feature selection: PCA

In [5]:
explained_variance = .95
pca = PCA(n_components=explained_variance).fit(X_train)

X_train_pca = pca.transform(X_train)
X_cv_pca = pca.transform(X_cv)

# pca = PCA(n_components=explained_variance).fit(df_train_processed)
# df_train_pca = pca.transform(df_train_processed)

print("Number of components required to explain "+str(explained_variance)+"% of the variance = "+str(X_train_pca.shape[1]))

Number of components required to explain 0.95% of the variance = 15


## Feature selection: RFE (with simple Logistic Regression)

In [6]:
logisticRegr = LogisticRegression(class_weight='balanced')

rfe = RFE(estimator=logisticRegr, step=1, verbose=0, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train.values.ravel())

total_cols = np.array(X_train.columns.values.tolist())
selected_cols = total_cols[rfe.support_].tolist()
X_train_rfe = X_train[selected_cols]
X_cv_rfe =  X_cv[selected_cols]
print("Columns selected: "+str(selected_cols))

Columns selected: ['Rainfall', 'WindGustSpeed', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'RainToday', 'LocationType_0', 'LocationType_2', 'LocationType_3', 'LocationType_4', 'PressureMean', 'TempMaxDiff', 'imputed_mean', 'WindDir9am_sin', 'WindDir3pm_sin']


In [11]:
def testModel(clf_,grid_values_,X_train_,y_train_,X_cv_,y_cv_,scoring_='f1',verbose_=0):
    # model.get_params() # Return selected params
    # cv=None -> None, to use the default 5-fold cross validation (K-Fold, k=5)
    model_ = GridSearchCV(clf_, param_grid = grid_values_, cv=None, scoring=scoring_,verbose=verbose_)
    model_.fit(X_train_, y_train_)

    y_pred = model_.predict(X_cv_)
    metrics_={    
        "accuracy":         metrics.accuracy_score(y_cv_, y_pred),
        "precision":        metrics.precision_score(y_cv_, y_pred),
        "recall":           metrics.recall_score(y_cv_, y_pred),
        "f1_score":         metrics.f1_score(y_cv_, y_pred),
        "confusion_matrix": metrics.confusion_matrix(y_cv_, y_pred)
    }
    return model_, metrics_

def train_models(X_train_, y_train_, X_cv_, y_cv_, features_selection='',class_weight='balanced',add_dummy=False):
    model_list=[]

    # Dummy Classifier
    if add_dummy:
        clf = DummyClassifier(strategy= 'most_frequent')
        grid_values={}
        model, model_metrics = testModel(clf,grid_values,X_train_,y_train_,X_cv_,y_cv_)
        model_list.append(
            {'name': 'DummyClassifier',
            'model': model,
            'features_selection': features_selection,
            'class_weight': class_weight,
            'metrics': model_metrics
            }
        )

    # LogisticRegression
    clf = LogisticRegression(class_weight=class_weight)
    grid_values = {
        'penalty': ['elasticnet'],
        'C': [0.001,0.01,.09,1,5,10,25],
        'l1_ratio': [0,0.25,.5,.75,1]
        }

    model, model_metrics = testModel(clf,grid_values,X_train_,y_train_,X_cv_,y_cv_)
    model_list.append(
        {'name': 'LogisticRegression',
        'model': model,
        'features_selection': features_selection,
        'class_weight': class_weight,
        'metrics': model_metrics  
        }
    )

    # SGDClassifier
    loss_list = [
        'hinge',            # SVM
        'log',              # Logistic Regression
        'modified_huber',   # Probabilistic classifier (smooth loss)
        'squared_hinge',    # Like hinge but quadratically penalized
    ]

    for loss in loss_list:
        grid_values = {
            'l1_ratio': [0,0.25,0.5,0.751], # (0 <= l1_ratio <= 1): l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1
            'alpha': [0.0001,0.001,0.01,0.1,1,10], # The higher the value, the stronger the regularization. Also used to compute the learning rate when set to learning_rate is set to ‘optimal’.
            'eta0': [0.001,0.1,1] # Initial learning rate
            }

        clf = SGDClassifier(class_weight=class_weight, verbose=0, max_iter=1000, tol=1e-3, penalty='elasticnet',
                            learning_rate='optimal', early_stopping=True, validation_fraction=0.1, 
                            n_iter_no_change=5, loss=loss)

        model, model_metrics = testModel(clf,grid_values,X_train_,y_train_,X_cv_,y_cv_)
        model_list.append(
            {'name': 'SGDClassifier_'+str(loss),
            'model': model,
            'features_selection': features_selection,
            'class_weight': class_weight,
            'metrics': model_metrics    
            }
        )

    return model_list

In [12]:
model_list_pca_unbalanced = train_models(X_train_pca, y_train, X_cv_pca, y_cv, features_selection='PCA',class_weight=None,add_dummy=True)
model_list_rfe_unbalanced = train_models(X_train_rfe, y_train, X_cv_rfe, y_cv, features_selection='RFE',class_weight=None,add_dummy=True)
model_list_pca = train_models(X_train_pca, y_train, X_cv_pca, y_cv, features_selection='PCA')
model_list_rfe = train_models(X_train_rfe, y_train, X_cv_rfe, y_cv, features_selection='RFE')

In [13]:
model_list = model_list_pca+model_list_rfe+model_list_pca_unbalanced+model_list_rfe_unbalanced

In [14]:
for m in model_list:
    print(str(m['name'])+" | Features selection: "+str(m['features_selection'])+ \
    "\n\t- Accuracy = "+str(m['metrics']['accuracy'])+"\n\t- Precision = "+str(m['metrics']['precision'])+ \
    "\n\t- Recall = "+str(m['metrics']['recall'])+"\n\t- f1_score = "+str(m['metrics']['f1_score'])+"\n")

LogisticRegression | Features selection: PCA
	- Accuracy = 0.7514242309153057
	- Precision = 0.378925741121221
	- Recall = 0.7200223089793641
	- f1_score = 0.4965384615384616

SGDClassifier_hinge | Features selection: PCA
	- Accuracy = 0.7456323585263958
	- Precision = 0.3724812895797352
	- Recall = 0.7216954824316788
	- f1_score = 0.49136130624644014

SGDClassifier_log | Features selection: PCA
	- Accuracy = 0.7471515381693885
	- Precision = 0.37492811960897066
	- Recall = 0.7272727272727273
	- f1_score = 0.4947827736672358

SGDClassifier_modified_huber | Features selection: PCA
	- Accuracy = 0.17024306874287884
	- Precision = 0.17024306874287884
	- Recall = 1.0
	- f1_score = 0.29095334685598373

SGDClassifier_squared_hinge | Features selection: PCA
	- Accuracy = 0.7529434105582985
	- Precision = 0.379577255135457
	- Recall = 0.7110987172336866
	- f1_score = 0.49495341614906835

LogisticRegression | Features selection: RFE
	- Accuracy = 0.7542726927459172
	- Precision = 0.381872213967

In [38]:
models_df = None
column_names = ["name","features_selection","class_weight","accuracy","precision","recall","f1_score","confusion_matrix","model"]
models_df = pd.DataFrame(columns=column_names)
for m in model_list:
    m_append = []
    m_append.append(m["name"])
    m_append.append(m["features_selection"])
    m_append.append(m["class_weight"])
    m_append.append(m["metrics"]["accuracy"])
    m_append.append(m["metrics"]["precision"])
    m_append.append(m["metrics"]["recall"])
    m_append.append(m["metrics"]["f1_score"])
    m_append.append(m["metrics"]["confusion_matrix"])
    m_append.append(m["model"])

    m_series = pd.Series(m_append, index = models_df.columns)
    models_df = models_df.append(m_series, ignore_index=True)
models_df

Unnamed: 0,name,features_selection,class_weight,accuracy,precision,recall,f1_score,confusion_matrix,model
0,LogisticRegression,PCA,balanced,0.751424,0.378926,0.720022,0.496538,"[[6623, 2116], [502, 1291]]",GridSearchCV(estimator=LogisticRegression(clas...
1,SGDClassifier_hinge,PCA,balanced,0.745632,0.372481,0.721695,0.491361,"[[6559, 2180], [499, 1294]]",GridSearchCV(estimator=SGDClassifier(class_wei...
2,SGDClassifier_log,PCA,balanced,0.747152,0.374928,0.727273,0.494783,"[[6565, 2174], [489, 1304]]",GridSearchCV(estimator=SGDClassifier(class_wei...
3,SGDClassifier_modified_huber,PCA,balanced,0.170243,0.170243,1.0,0.290953,"[[0, 8739], [0, 1793]]",GridSearchCV(estimator=SGDClassifier(class_wei...
4,SGDClassifier_squared_hinge,PCA,balanced,0.752943,0.379577,0.711099,0.494953,"[[6655, 2084], [518, 1275]]",GridSearchCV(estimator=SGDClassifier(class_wei...
5,LogisticRegression,RFE,balanced,0.754273,0.381872,0.716676,0.498255,"[[6659, 2080], [508, 1285]]",GridSearchCV(estimator=LogisticRegression(clas...
6,SGDClassifier_hinge,RFE,balanced,0.767566,0.39698,0.703848,0.507643,"[[6822, 1917], [531, 1262]]",GridSearchCV(estimator=SGDClassifier(class_wei...
7,SGDClassifier_log,RFE,balanced,0.782947,0.41262,0.649191,0.504551,"[[7082, 1657], [629, 1164]]",GridSearchCV(estimator=SGDClassifier(class_wei...
8,SGDClassifier_modified_huber,RFE,balanced,0.76918,0.398601,0.699387,0.507795,"[[6847, 1892], [539, 1254]]",GridSearchCV(estimator=SGDClassifier(class_wei...
9,SGDClassifier_squared_hinge,RFE,balanced,0.170243,0.170243,1.0,0.290953,"[[0, 8739], [0, 1793]]",GridSearchCV(estimator=SGDClassifier(class_wei...


In [15]:
# Save
open_file = open('./data/model_list.pkl', "wb")
pickle.dump(model_list, open_file)
open_file.close()

models_df.to_pickle('./data/models_df.pkl')


# Load
# open_file = open('./data/model_list.pkl', "rb")
# model_list = pickle.load(open_file)
# open_file.close()

# models_df = pd.read_pickle('./data/models_df.pkl')

In [54]:
metrics_list=["accuracy","precision","recall","f1_score"]
for met in metrics_list:
    idx = models_df[met].argmax()
    name = models_df.iloc[idx]["name"] 
    features_selection = models_df.iloc[idx]["features_selection"] 
    class_weight = models_df.iloc[idx]["class_weight"]
    value = models_df.iloc[idx][met]
    print("best "+str(met)+" = "+"{:10.2f}".format(value)+":\t"+str(name)+" (features="+str(features_selection)+", class_weight="+str(class_weight)+")")


best accuracy =       0.86:	LogisticRegression (features=PCA, class_weight=None)
best precision =       0.71:	SGDClassifier_hinge (features=RFE, class_weight=None)
best recall =       1.00:	SGDClassifier_modified_huber (features=PCA, class_weight=balanced)
best f1_score =       0.51:	SGDClassifier_modified_huber (features=RFE, class_weight=balanced)


In [47]:
models_df.iloc[11]["name"]

'LogisticRegression'