# Import libraries

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import math
import pickle

import warnings
warnings.filterwarnings("ignore")

# Train-Test
from sklearn.model_selection import train_test_split

# Normalization
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer

# Feature selection
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE

# Regression models
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

## Load data from pickle files

In [3]:
# Load the processed training data produced by an earlier preprocessing step.
df_train_processed = pd.read_pickle('./data/df_train_processed.pkl')

# Use a context manager so the file handle is closed even if unpickling raises.
# NOTE: pickle can execute arbitrary code on load -- only load trusted files.
with open('./data/param_dict.pkl', "rb") as open_file:
    param_dict = pickle.load(open_file)

# Load the processed test data.
df_test_processed = pd.read_pickle('./data/df_test_processed.pkl')

In [4]:
# Hold out 10% of the processed training rows as a cross-validation set.
# Both target columns are removed from the inputs; 'RainfallTomorrow' is the
# value being predicted.
model_inputs = df_train_processed.drop(columns=['RainTomorrow', 'RainfallTomorrow'])
rainfall_target = df_train_processed['RainfallTomorrow']

X_train, X_cv, y_train, y_cv = train_test_split(
    model_inputs, rainfall_target, test_size=0.1, random_state=0
)

In [5]:
from sklearn.metrics import mean_squared_error

def print_results(y_pred,y_true):
    """Print the MSE and RMSE of predictions against the true values.

    Parameters
    ----------
    y_pred : array-like
        Predicted target values.
    y_true : array-like
        Ground-truth target values (same length as ``y_pred``).

    Notes
    -----
    The ``squared=`` keyword of ``mean_squared_error`` was deprecated in
    scikit-learn 1.4 and later removed, so RMSE is derived here as the square
    root of the MSE instead. For a single-output target this is the same value
    that ``squared=False`` used to return.
    """
    # scikit-learn's documented argument order is (y_true, y_pred).
    mse = mean_squared_error(y_true, y_pred)
    rmse = math.sqrt(mse)
    print("MSE = "+"{:10.2f}".format(mse))
    print("RMSE = "+"{:10.2f}".format(rmse))

## Feature selection: PCA

In [6]:
# Fraction (0-1) of the total variance the retained components must explain;
# passing a float to n_components lets PCA choose the component count.
explained_variance = .95
# Fit on the training split only, then apply the same projection to both splits.
pca = PCA(n_components=explained_variance).fit(X_train)

X_train_pca = pca.transform(X_train)
X_cv_pca = pca.transform(X_cv)

# pca = PCA(n_components=explained_variance).fit(df_train_processed)
# df_train_pca = pca.transform(df_train_processed)

# explained_variance is a fraction, so scale it to a percentage before
# appending '%' (previously this printed "0.95%").
print("Number of components required to explain "+"{:g}".format(explained_variance * 100)+"% of the variance = "+str(X_train_pca.shape[1]))

Number of components required to explain 0.95% of the variance = 15


## Feature selection: RFE (with simple Logistic Regression)

In [7]:
# NOTE(review): this RFE experiment is disabled -- every line is commented out.
# LogisticRegression is not imported in the import cell visible in this
# notebook, and the following cells train on the PCA features, so re-enabling
# this would need an import and a decision on the estimator.
# TODO: confirm whether to keep this cell or remove it.
# logisticRegr = LogisticRegression(class_weight='balanced')

# rfe = RFE(estimator=logisticRegr, step=1, verbose=0, n_features_to_select=15)
# rfe = rfe.fit(X_train, y_train.values.ravel())

# total_cols = np.array(X_train.columns.values.tolist())
# selected_cols = total_cols[rfe.support_].tolist()
# X_train_rfe = X_train[selected_cols]
# X_cv_rfe =  X_cv[selected_cols]
# print("Columns selected: "+str(selected_cols))

In [8]:
# Select the feature set used by the models below (here: the PCA projection).
X_train_, y_train_ = X_train_pca, y_train
X_cv_, y_cv_ = X_cv_pca, y_cv

# Fit and score each baseline in turn: a mean-predicting DummyRegressor first,
# then ordinary least squares. After the loop `reg` and `y_pred` refer to the
# LinearRegression model and its CV predictions.
for reg in (DummyRegressor(strategy="mean"), LinearRegression()):
    reg.fit(X_train_, y_train_)
    y_pred = reg.predict(X_cv_)

    print("R2 Score = "+"{:10.2f}".format(reg.score(X_cv_, y_cv_)))
    print_results(y_pred,y_cv_)



R2 Score =      -0.00
MSE =       0.99
RMSE =       1.00
R2 Score =       0.20
MSE =       0.80
RMSE =       0.89


In [11]:
from sklearn.metrics import r2_score
# Fit one polynomial regression per degree and report its error on the CV split.
# NOTE(review): the number of generated polynomial terms grows quickly with the
# degree; the higher degrees may be very slow or memory-hungry -- confirm that
# the full sweep actually completes.
degrees=[2,3,4,5,6,7]
poly_models = []
for d in degrees:
    # Expand the training features into polynomial terms of degree d.
    poly_reg = PolynomialFeatures(degree=d)
    X_poly = poly_reg.fit_transform(X_train_)
    pol_reg = LinearRegression()
    pol_reg.fit(X_poly, y_train_)

    # Only the fitted regressor is stored; predicting with it later requires a
    # PolynomialFeatures transformer of the matching degree.
    poly_models.append(pol_reg)

    # Apply the transformer fitted on the training data; held-out data must be
    # transformed only, never used to re-fit a preprocessing step.
    y_pred = pol_reg.predict(poly_reg.transform(X_cv_))

    print("\n---------------------------------")
    print("d = "+str(d))
    print_results(y_pred,y_cv_)


---------------------------------
d = 2
MSE =       0.73
RMSE =       0.86

---------------------------------
d = 3
MSE =       0.71
RMSE =       0.84

---------------------------------
d = 4
MSE =       0.74
RMSE =       0.86
