# Import libraries

In [78]:
# Import libraries
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import math
import pickle

import warnings
warnings.filterwarnings("ignore")

# Train-Test
from sklearn.model_selection import train_test_split

# Feature selection
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE

# Regression models
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV

from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

## Load data from pickle file

In [10]:
df_train_processed = pd.read_pickle('./data/df_train_processed.pkl')

open_file = open('./data/df_param_dict.pkl', "rb")
param_dict = pickle.load(open_file)
open_file.close()

df_test_processed = pd.read_pickle('./data/df_test_processed.pkl')

In [11]:
df_train_processed = df_train_processed[df_train_processed['RainTomorrow'].notna()]
df_test_processed = df_test_processed[df_test_processed['RainTomorrow'].notna()]

# Split into train/cv
X_train, X_cv, y_train, y_cv = train_test_split(
    df_train_processed.drop(['RainTomorrow','RainfallTomorrow'], axis=1),
    df_train_processed['RainfallTomorrow'],
    test_size=0.1,
    random_state=0)

In [12]:
def print_results(y_pred,y_true):
    print("MSE = "+"{:10.2f}".format(mean_squared_error(y_pred, y_true, squared=True)))
    print("RMSE = "+"{:10.2f}".format(mean_squared_error(y_pred, y_true, squared=False)))

## Feature selection: PCA

In [13]:
explained_variance = .95
pca = PCA(n_components=explained_variance).fit(X_train)

X_train_pca = pca.transform(X_train)
X_cv_pca = pca.transform(X_cv)

# pca = PCA(n_components=explained_variance).fit(df_train_processed)
# df_train_pca = pca.transform(df_train_processed)

print("Number of components required to explain "+str(explained_variance)+"% of the variance = "+str(X_train_pca.shape[1]))

Number of components required to explain 0.95% of the variance = 15


## Feature selection: RFE (with simple Logistic Regression)

In [14]:
logisticRegr = LinearRegression()

rfe = RFE(estimator=logisticRegr, step=1, verbose=0, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train.values.ravel())

total_cols = np.array(X_train.columns.values.tolist())
selected_cols = total_cols[rfe.support_].tolist()
X_train_rfe = X_train[selected_cols]
X_cv_rfe =  X_cv[selected_cols]
print("Columns selected: "+str(selected_cols))

Columns selected: ['Rainfall', 'WindGustSpeed', 'WindSpeed3pm', 'Humidity3pm', 'RainToday', 'LocationType_1', 'LocationType_2', 'LocationType_3', 'LocationType_4', 'PressureMean', 'TempMean', 'imputed_mean', 'WindDir9am_cos', 'WindDir9am_sin', 'WindDir3pm_sin']


In [91]:

import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC

def train_models(X_train_ ,y_train_ ,X_cv_ ,y_cv_,features_selection='',degrees=[2,3,4,5]):

    model_list=[]

    # DummyRegressor
    reg = DummyRegressor(strategy="mean")
    reg.fit(X_train_, y_train_)
    y_pred = reg.predict(X_cv_)

    model_list.append(
        {'name': 'DummyRegressor',
        'features_selection': features_selection,
        'score':  reg.score(X_cv_, y_cv_),
        'MSE': mean_squared_error(y_pred, y_cv_, squared=True),
        'RMSE': mean_squared_error(y_pred, y_cv_, squared=False),
        'r2_score': r2_score(y_pred,y_cv_),
        'model': reg
        }
    )

    # LinearRegression
    reg = LinearRegression()
    reg.fit(X_train_, y_train_)
    y_pred = reg.predict(X_cv_)

    model_list.append(
        {'name': 'LinearRegression',
        'features_selection': features_selection,
        'score': reg.score(X_cv_, y_cv_),
        'MSE': mean_squared_error(y_pred, y_cv_, squared=True),
        'RMSE': mean_squared_error(y_pred, y_cv_, squared=False),
        'r2_score': r2_score(y_pred,y_cv_),
        'model': reg
        }
    )

    # # PolynomialFeatures
    # for d in degrees:
    #     print("running ploly d="+str(d))
    #     poly_reg = PolynomialFeatures(degree=d,interaction_only=True)
    #     X_poly = poly_reg.fit_transform(X_train_)

    #     pol_reg = LinearRegression()
    #     pol_reg.fit(X_poly, y_train_)

    #     y_pred = pol_reg.predict(poly_reg.transform(X_cv_))
    #     model_list.append(
    #         {'name': 'PolynomialFeatures_'+str(d),
    #         'features_selection': features_selection,
    #         'score': pol_reg.score(poly_reg.transform(X_cv_), y_cv_),
    #         'MSE': mean_squared_error(y_pred, y_cv_, squared=True),
    #         'RMSE': mean_squared_error(y_pred, y_cv_, squared=False),
    #         'r2_score': r2_score(y_pred,y_cv_),
    #         'model': reg
    #     }
    # )
    
    # Ridge
    reg = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv=10)
    reg.fit(X_train_, y_train_)
    y_pred = reg.predict(X_cv_)

    model_list.append(
        {'name': 'Ridge_l2',
        'features_selection': features_selection,
        'score': reg.score(X_cv_, y_cv_),
        'MSE': mean_squared_error(y_pred, y_cv_, squared=True),
        'RMSE': mean_squared_error(y_pred, y_cv_, squared=False),
        'r2_score': r2_score(y_pred,y_cv_),
        'model': reg
        }
    )

    # Lasso
    reg = LassoCV(alphas=[1e-3, 1e-2, 1e-1, 1,2], cv=10)
    reg.fit(X_train_, y_train_)

    y_pred = reg.predict(X_cv_)

    model_list.append(
        {'name': 'Lasso_l1',
        'features_selection': features_selection,
        'score': reg.score(X_cv_, y_cv_),
        'MSE': mean_squared_error(y_pred, y_cv_, squared=True),
        'RMSE': mean_squared_error(y_pred, y_cv_, squared=False),
        'r2_score': r2_score(y_pred,y_cv_),
        'model': reg
        }
    )

    # ElasticNet
    reg = ElasticNetCV(alphas=[1e-3, 1e-2, 1e-1, 1], 
                        l1_ratio= [.1, .5, .7, .9, .95, .99, 1], cv=10)
    reg.fit(X_train_, y_train_)

    y_pred = reg.predict(X_cv_)

    model_list.append(
        {'name': 'ElasticNet',
        'features_selection': features_selection,
        'score': reg.score(X_cv_, y_cv_),
        'MSE': mean_squared_error(y_pred, y_cv_, squared=True),
        'RMSE': mean_squared_error(y_pred, y_cv_, squared=False),
        'r2_score': r2_score(y_pred,y_cv_),
        'model': reg
        }
    )

    return model_list

In [92]:
model_list=[]
        
model_list_pca = train_models(X_train_pca, y_train, X_cv_pca, y_cv, features_selection='PCA')
model_list_rfe = train_models(X_train_rfe, y_train, X_cv_rfe, y_cv, features_selection='RFE')

model_list = model_list_pca+model_list_rfe

In [93]:
models_df = None
column_names = ["name","features_selection","RMSE","MSE","score","r2_score","model"]
models_df = pd.DataFrame(columns=column_names)
for m in model_list:
    m_append = []
    for col in column_names:
        m_append.append(m[col])
    m_series = pd.Series(m_append, index = models_df.columns)
    models_df = models_df.append(m_series, ignore_index=True)
models_df

Unnamed: 0,name,features_selection,RMSE,MSE,score,r2_score,model
0,DummyRegressor,PCA,1.001423,1.002848,-2.4e-05,-8.531285e+37,DummyRegressor()
1,LinearRegression,PCA,0.898812,0.807863,0.194411,-2.969508,LinearRegression()
2,Ridge_l2,PCA,0.898812,0.807863,0.194411,-2.969576,"RidgeCV(alphas=array([0.001, 0.01 , 0.1 , 1. ..."
3,Lasso_l1,PCA,0.898768,0.807783,0.194491,-3.023082,"LassoCV(alphas=[0.001, 0.01, 0.1, 1, 2], cv=10)"
4,ElasticNet,PCA,0.8988,0.807842,0.194432,-2.981002,"ElasticNetCV(alphas=[0.001, 0.01, 0.1, 1], cv=..."
5,DummyRegressor,RFE,1.001423,1.002848,-2.4e-05,-8.531285e+37,DummyRegressor()
6,LinearRegression,RFE,0.895677,0.802238,0.200021,-2.821492,LinearRegression()
7,Ridge_l2,RFE,0.895677,0.802238,0.200021,-2.821583,"RidgeCV(alphas=array([0.001, 0.01 , 0.1 , 1. ..."
8,Lasso_l1,RFE,0.895775,0.802413,0.199847,-2.876537,"LassoCV(alphas=[0.001, 0.01, 0.1, 1, 2], cv=10)"
9,ElasticNet,RFE,0.895682,0.802246,0.200013,-2.83518,"ElasticNetCV(alphas=[0.001, 0.01, 0.1, 1], cv=..."


In [101]:
for m in model_list:
        if m['name']=='Ridge_l2' or m['name']=='Lasso_l1' or m['name']=='ElasticNet':
            if m['name']=='ElasticNet':
                print(str(m['name'])+" ("+str(m['features_selection'])+"): alpha = "+str(m['model'].alpha_)+" | l1_ratio = "+str(m['model'].l1_ratio_))
            else:
                print(str(m['name']+" ("+str(m['features_selection'])+"): alpha = "+str(m['model'].alpha_)))

Ridge_l2 (PCA): alpha = 1.0
Lasso_l1 (PCA): alpha = 0.001
ElasticNet (PCA): alpha = 0.001 | l1_ratio = 0.1
Ridge_l2 (RFE): alpha = 1.0
Lasso_l1 (RFE): alpha = 0.001
ElasticNet (RFE): alpha = 0.001 | l1_ratio = 0.1


In [6]:
# # Save
# open_file = open('./data/reg_list.pkl', "wb")
# pickle.dump(model_list, open_file)
# open_file.close()

# models_df.to_pickle('./data/reg_df.pkl')


# Load
open_file = open('./data/reg_list.pkl', "rb")
model_list = pickle.load(open_file)
open_file.close()

models_df = pd.read_pickle('./data/reg_df.pkl')

In [7]:
metrics_list=["RMSE","r2_score"]
for met in metrics_list:
    idx = models_df[met].argmin()
    name = models_df.iloc[idx]["name"] 
    features_selection = models_df.iloc[idx]["features_selection"] 
    value = models_df.iloc[idx][met]
    print("best "+str(met)+" = "+"{:10.2f}".format(value)+":\t"+str(name)+" ("+str(features_selection)+")")
metrics_list=["score"]
for met in metrics_list:
    idx = models_df[met].argmax()
    name = models_df.iloc[idx]["name"] 
    features_selection = models_df.iloc[idx]["features_selection"] 
    value = models_df.iloc[idx][met]
    print("best "+str(met)+" = "+"{:10.2f}".format(value)+":\t"+str(name)+" ("+str(features_selection)+")")

best RMSE =       0.86:	PolynomialFeatures_3 (PCA)


KeyError: 'r2_score'