# Import libraries

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import scipy.stats as stats
import math
from geopy.geocoders import Nominatim
import re
import pickle

import warnings
warnings.filterwarnings("ignore")

# Train-Test
from sklearn.model_selection import train_test_split

# Normalization
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer

# Feature selection
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE

# Classification models
from sklearn.dummy import DummyClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

from sklearn import metrics

## Load data from pickle file

In [3]:
df_train_processed = pd.read_pickle('./data/df_train_processed.pkl')

open_file = open('./data/param_dict.pkl', "rb")
param_dict = pickle.load(open_file)
open_file.close()

df_test_processed = pd.read_pickle('./data/df_test_processed.pkl')

In [5]:
# X_train = df_train_processed.drop(['RainTomorrow','RainfallTomorrow'],axis=1)
# y_train = df_train_processed.RainTomorrow

# X_test = df_test_processed.drop(['RainTomorrow','RainfallTomorrow'],axis=1)
# y_test = df_test_processed.RainTomorrow

# Split into train/cv
X_train, X_cv, y_train, y_cv = train_test_split(
    df_train_processed.drop(['RainTomorrow','RainfallTomorrow'], axis=1),
    df_train_processed['RainTomorrow'],
    test_size=0.1,
    random_state=0)

In [4]:
def print_results(y,pred):
    print("accuracy = "+str(metrics.accuracy_score(y, pred)))
    print("precision = "+str(metrics.precision_score(y, pred)))
    print("recall = "+str(metrics.recall_score(y, pred)))
    print("f1_score = "+str(metrics.f1_score(y, pred)))
    print("\nconfusion matrix:")
    print(metrics.confusion_matrix(y, pred))
    # print(metrics.classification_report(y_cv, predictions))

## Feature selection: PCA

In [6]:
explained_variance = .95
pca = PCA(n_components=explained_variance).fit(X_train)

X_train_pca = pca.transform(X_train)
X_cv_pca = pca.transform(X_cv)

# pca = PCA(n_components=explained_variance).fit(df_train_processed)
# df_train_pca = pca.transform(df_train_processed)

print("Number of components required to explain "+str(explained_variance)+"% of the variance = "+str(X_train_pca.shape[1]))

Number of components required to explain 0.95% of the variance = 15


## Feature selection: RFE (with simple Logistic Regression)

In [12]:
logisticRegr = LogisticRegression(class_weight='balanced')

rfe = RFE(estimator=logisticRegr, step=1, verbose=0, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train.values.ravel())

total_cols = np.array(X_train.columns.values.tolist())
selected_cols = total_cols[rfe.support_].tolist()
X_train_rfe = X_train[selected_cols]
X_cv_rfe =  X_cv[selected_cols]
print("Columns selected: "+str(selected_cols))

Columns selected: ['Rainfall', 'WindGustSpeed', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'RainToday', 'LocationType_0', 'LocationType_2', 'LocationType_3', 'LocationType_4', 'PressureMean', 'TempMaxDiff', 'imputed_mean', 'WindDir9am_sin', 'WindDir3pm_sin']


In [37]:
def testModel(clf_,grid_values_,X_train_,y_train_,X_cv_,y_cv_,scoring_='recall',verbose_=0):
    # model.get_params() # Return selected params
    # cv=None -> None, to use the default 5-fold cross validation (K-Fold, k=5)
    model_ = GridSearchCV(clf_, param_grid = grid_values_, cv=None, scoring=scoring_,verbose=verbose_)
    model_.fit(X_train_, y_train_)

    y_pred = model_.predict(X_cv_)
    metrics_={    
        "accuracy":         metrics.accuracy_score(y_cv_, y_pred),
        "precision":        metrics.precision_score(y_cv_, y_pred),
        "recall":           metrics.recall_score(y_cv_, y_pred),
        "f1_score":         metrics.f1_score(y_cv_, y_pred),
        "confusion_matrix": metrics.confusion_matrix(y_cv_, y_pred)
    }
    return model_, metrics_

def train_models(X_train_, y_train_, X_cv_, y_cv_, features_selection=''):
    model_list=[]

    # Dummy Classifier
    clf = DummyClassifier(strategy= 'most_frequent')
    grid_values={}
    model, model_metrics = testModel(clf,grid_values,X_train_,y_train_,X_cv_,y_cv_)
    model_list.append(
        {'name': 'DummyClassifier',
        'model': model,
        'features_selection': features_selection,
        'metrics': model_metrics
        }
    )

    # LogisticRegression
    clf = LogisticRegression(class_weight='balanced')
    grid_values = {
        'penalty': ['l1', 'l2', 'elasticnet'],
        'C': [0.001,0.01,.09,1,5,10,25],
        'l1_ratio': [.25,.5,.75]
        }

    model, model_metrics = testModel(clf,grid_values,X_train_,y_train_,X_cv_,y_cv_)
    model_list.append(
        {'name': 'LogisticRegression',
        'model': model,
        'features_selection': features_selection,
        'metrics': model_metrics  
        }
    )

    # SGDClassifier
    loss_list = [
        'hinge',            # SVM
        'log',              # Logistic Regression
        'modified_huber',   # Probabilistic classifier (smooth loss)
        'squared_hinge',    # Like hinge but quadratically penalized
    ]

    for loss in loss_list:
        grid_values = {
            'l1_ratio': [0,0.25,0.5,0.751], # (0 <= l1_ratio <= 1): l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1
            'alpha': [0.0001,0.001,0.01,0.1,1,10], # The higher the value, the stronger the regularization. Also used to compute the learning rate when set to learning_rate is set to ‘optimal’.
            'eta0': [0.001,0.1,1] # Initial learning rate
            }

        clf = SGDClassifier(class_weight='balanced', verbose=0, max_iter=1000, tol=1e-3, penalty='elasticnet',
                            learning_rate='optimal', early_stopping=True, validation_fraction=0.1, 
                            n_iter_no_change=5, loss=loss)

        model, model_metrics = testModel(clf,grid_values,X_train_,y_train_,X_cv_,y_cv_)
        model_list.append(
            {'name': 'SGDClassifier_'+str(loss),
            'model': model,
            'features_selection': features_selection,
            'metrics': model_metrics    
            }
        )

    return model_list

In [38]:
model_list_rfe = train_models(X_train_rfe, y_train, X_cv_rfe, y_cv, features_selection='RFE')

In [39]:
model_list_pca = train_models(X_train_pca, y_train, X_cv_pca, y_cv, features_selection='PCA')

In [40]:
model_list = model_list_pca+model_list_rfe

In [41]:
for m in model_list:
    print(str(m['name'])+" | Features selection: "+str(m['features_selection'])+ \
    "\n\t- Accuracy = "+str(m['metrics']['accuracy'])+"\n\t- Precision = "+str(m['metrics']['precision'])+ \
    "\n\t- Recall = "+str(m['metrics']['recall'])+"\n\t- f1_score = "+str(m['metrics']['f1_score'])+"\n")

DummyClassifier | Features selection: PCA
	- Accuracy = 0.8297569312571211
	- Precision = 0.0
	- Recall = 0.0
	- f1_score = 0.0

LogisticRegression | Features selection: PCA
	- Accuracy = 0.7513292821876187
	- Precision = 0.3788145539906103
	- Recall = 0.7200223089793641
	- f1_score = 0.49644299173235906

SGDClassifier_hinge | Features selection: PCA
	- Accuracy = 0.6119445499430307
	- Precision = 0.2735886300829056
	- Recall = 0.7730061349693251
	- f1_score = 0.4041405452689896

SGDClassifier_log | Features selection: PCA
	- Accuracy = 0.6419483478921383
	- Precision = 0.29708658186294623
	- Recall = 0.807585052983826
	- f1_score = 0.4343782810859457

SGDClassifier_modified_huber | Features selection: PCA
	- Accuracy = 0.7409798708697304
	- Precision = 0.36600745199197476
	- Recall = 0.7122141662018963
	- f1_score = 0.4835289663006437

SGDClassifier_squared_hinge | Features selection: PCA
	- Accuracy = 0.8297569312571211
	- Precision = 0.0
	- Recall = 0.0
	- f1_score = 0.0

DummyClass

In [42]:
# Save
# open_file = open('./data/model_list.pkl', "wb")
# pickle.dump(model_list, open_file)
# open_file.close()

# Load
# open_file = open('./data/model_list.pkl', "rb")
# model_list = pickle.load(open_file)
# open_file.close()