# Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import math
import pickle

import warnings
warnings.filterwarnings("ignore")

# Train-Test
from sklearn.model_selection import train_test_split

# Feature selection
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE

# Regression models
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

## Load data from pickle file

In [2]:
df_train_processed = pd.read_pickle('./data/df_train_processed.pkl')

open_file = open('./data/param_dict.pkl', "rb")
param_dict = pickle.load(open_file)
open_file.close()

df_test_processed = pd.read_pickle('./data/df_test_processed.pkl')

In [3]:
# Split into train/cv
X_train, X_cv, y_train, y_cv = train_test_split(
    df_train_processed.drop(['RainTomorrow','RainfallTomorrow'], axis=1),
    df_train_processed['RainfallTomorrow'],
    test_size=0.1,
    random_state=0)

In [4]:


def print_results(y_pred,y_true):
    print("MSE = "+"{:10.2f}".format(mean_squared_error(y_pred, y_true, squared=True)))
    print("RMSE = "+"{:10.2f}".format(mean_squared_error(y_pred, y_true, squared=False)))

## Feature selection: PCA

In [5]:
explained_variance = .95
pca = PCA(n_components=explained_variance).fit(X_train)

X_train_pca = pca.transform(X_train)
X_cv_pca = pca.transform(X_cv)

# pca = PCA(n_components=explained_variance).fit(df_train_processed)
# df_train_pca = pca.transform(df_train_processed)

print("Number of components required to explain "+str(explained_variance)+"% of the variance = "+str(X_train_pca.shape[1]))

Number of components required to explain 0.95% of the variance = 15


## Feature selection: RFE (with simple Logistic Regression)

In [6]:
logisticRegr = LinearRegression()

rfe = RFE(estimator=logisticRegr, step=1, verbose=0, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train.values.ravel())

total_cols = np.array(X_train.columns.values.tolist())
selected_cols = total_cols[rfe.support_].tolist()
X_train_rfe = X_train[selected_cols]
X_cv_rfe =  X_cv[selected_cols]
print("Columns selected: "+str(selected_cols))

Columns selected: ['Rainfall', 'WindGustSpeed', 'WindSpeed3pm', 'Humidity3pm', 'RainToday', 'LocationType_0', 'LocationType_1', 'LocationType_2', 'LocationType_3', 'LocationType_4', 'PressureMean', 'imputed_mean', 'WindDir9am_cos', 'WindDir9am_sin', 'WindDir3pm_sin']


In [13]:
def train_models(X_train_ ,y_train_ ,X_cv_ ,y_cv_,features_selection='',degrees=[2,3,4,5]):

    model_list=[]

    # DummyRegressor
    reg = DummyRegressor(strategy="mean")
    reg.fit(X_train_, y_train_)
    y_pred = reg.predict(X_cv_)

    model_list.append(
        {'name': 'DummyRegressor',
        'features_selection': features_selection,
        'R2':  reg.score(X_cv_, y_cv_),
        'MSE': mean_squared_error(y_pred, y_cv_, squared=True),
        'RMSE': mean_squared_error(y_pred, y_cv_, squared=False),
        # 'r2_score:': r2_score(y_pred,y_cv_),
        'model': reg
        }
    )

    # LinearRegression
    reg = LinearRegression()
    reg.fit(X_train_, y_train_)
    y_pred = reg.predict(X_cv_)

    model_list.append(
        {'name': 'LinearRegression',
        'features_selection': features_selection,
        'R2': reg.score(X_cv_, y_cv_),
        'MSE': mean_squared_error(y_pred, y_cv_, squared=True),
        'RMSE': mean_squared_error(y_pred, y_cv_, squared=False),
        # 'r2_score:': r2_score(y_pred,y_cv_),
        'model': reg
        }
    )

    # PolynomialFeatures
    for d in degrees:
        poly_reg = PolynomialFeatures(degree=d,interaction_only=True)
        X_poly = poly_reg.fit_transform(X_train_)

        pol_reg = LinearRegression()
        pol_reg.fit(X_poly, y_train_)

        y_pred = pol_reg.predict(poly_reg.fit_transform(X_cv_))
        model_list.append(
            {'name': 'PolynomialFeatures_'+str(d),
            'features_selection': features_selection,
            'R2': None,
            'MSE': mean_squared_error(y_pred, y_cv_, squared=True),
            'RMSE': mean_squared_error(y_pred, y_cv_, squared=False),
             # 'r2_score:': r2_score(y_pred,y_cv_),
            'model': reg
        }
    )

    return model_list

In [14]:
model_list=[]
        
model_list_pca = train_models(X_train_pca, y_train, X_cv_pca, y_cv, features_selection='PCA')
model_list_rfe = train_models(X_train_rfe, y_train, X_cv_rfe, y_cv, features_selection='RFE')

model_list = model_list_pca+model_list_rfe

In [18]:
models_df = None
column_names = ["name","features_selection","RMSE","MSE","model"]
models_df = pd.DataFrame(columns=column_names)
for m in model_list:
    m_append = []
    for col in column_names:
        m_append.append(m[col])
    m_series = pd.Series(m_append, index = models_df.columns)
    models_df = models_df.append(m_series, ignore_index=True)
models_df

Unnamed: 0,name,features_selection,RMSE,MSE,model
0,DummyRegressor,pca,0.9958106,0.9916387,DummyRegressor()
1,LinearRegression,pca,0.8926976,0.7969089,LinearRegression()
2,PolynomialFeatures_2,pca,0.8655213,0.7491271,LinearRegression()
3,PolynomialFeatures_3,pca,0.8537782,0.7289372,LinearRegression()
4,PolynomialFeatures_4,pca,0.8586897,0.737348,LinearRegression()
5,PolynomialFeatures_5,pca,0.8988511,0.8079334,LinearRegression()
6,DummyRegressor,rfe,0.9958106,0.9916387,DummyRegressor()
7,LinearRegression,rfe,0.8896725,0.7915172,LinearRegression()
8,PolynomialFeatures_2,rfe,0.8609494,0.7412339,LinearRegression()
9,PolynomialFeatures_3,rfe,0.8583342,0.7367376,LinearRegression()


In [19]:
# Save
open_file = open('./data/reg_list.pkl', "wb")
pickle.dump(model_list, open_file)
open_file.close()

models_df.to_pickle('./data/reg_df.pkl')


# Load
# open_file = open('./data/reg_list.pkl', "rb")
# model_list = pickle.load(open_file)
# open_file.close()

# models_df = pd.read_pickle('./data/reg_df.pkl')

In [21]:
metrics_list=["RMSE"]
for met in metrics_list:
    idx = models_df[met].argmin()
    name = models_df.iloc[idx]["name"] 
    features_selection = models_df.iloc[idx]["features_selection"] 
    value = models_df.iloc[idx][met]
    print("best "+str(met)+" = "+"{:10.2f}".format(value)+":\t"+str(name)+" ("+str(features_selection)+")")


best RMSE =       0.85:	PolynomialFeatures_3 (pca)
