In [None]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
from sklearn.metrics import mean_squared_error,mean_absolute_error
from statsmodels.tools.eval_measures import rmse
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
pd.set_option('display.max_row', 100)
pd.set_option('display.max_column', 150)

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training set, the test set and the sample submission file
# (read in that order from the working directory).
train, test, Submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sampleSubmission.csv")
)

In [None]:
# Separate the predictors from the target; the target is kept as a
# one-column DataFrame named "revenue".
train_x = train.drop("revenue", axis="columns")
train_y = train["revenue"].to_frame()

In [None]:
# Shapes of the training predictors and of the test set, one per line.
print(train_x.shape, test.shape, sep="\n")

In [None]:
# Stack training predictors on top of the test set so both go through the
# same feature engineering; then report the combined shape and the total
# number of missing cells.
data = pd.concat(objs=[train_x, test])
print(data.shape, data.isnull().sum().sum(), sep="\n")

In [None]:
def create_dummies(df,column_name):
    """Return ``df`` with one-hot indicator columns for ``column_name`` appended.

    The indicator columns are named ``<column_name>_<value>``. The source
    column itself is left in place and the input frame is not modified.
    """
    indicator_columns = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat([df, indicator_columns], axis=1)

In [None]:
def feature_engineer(df):
    """Build the model-ready feature frame from the raw restaurant data.

    Steps, in order:
      1. Cast to float where possible (``errors="ignore"`` leaves the rest as-is).
      2. Parse "Open Date" (``%m/%d/%Y``) and derive "Open_Year" / "Open_Month".
      3. Discard "Open Date" and replace the row labels with a fresh 0..n-1 index.
      4. One-hot encode "Type", "City Group", "City", "Open_Year" and
         "Open_Month" via ``create_dummies``, then drop those source columns.

    Returns the transformed frame.
    """
    encoded_columns = ["Type", "City Group", "City", "Open_Year", "Open_Month"]

    frame = df.astype(float, errors="ignore")
    frame["Open Date"] = pd.to_datetime(frame["Open Date"], format='%m/%d/%Y')

    # Moving the dates into the index exposes .year / .month directly;
    # resetting with drop=True afterwards throws the dates away.
    frame = frame.set_index("Open Date")
    opening_dates = frame.index
    frame["Open_Year"] = opening_dates.year
    frame["Open_Month"] = opening_dates.month
    frame = frame.reset_index(drop=True)

    for column in encoded_columns:
        frame = create_dummies(frame, column)

    return frame.drop(encoded_columns, axis=1)

In [None]:
# Engineer features once on the combined train + test frame so both parts
# end up with the same dummy columns.
data_rf = data.pipe(feature_engineer)

In [None]:
# Split the engineered frame back into its train and test parts.
# The boundary is taken from train_x (the training rows were concatenated
# first) instead of a hard-coded row count, so the split stays correct if
# the size of train.csv changes.
n_train = len(train_x)
train_rf_x = data_rf.iloc[:n_train]
test_x = data_rf.iloc[n_train:]
print(train_rf_x.shape)
print(test_x.shape)

In [None]:
# Remove the "Id" column from the model inputs and keep the test ids aside.
# errors="ignore" keeps this cell re-runnable: train_rf_x is overwritten in
# place, so on a second run "Id" is already gone and a plain drop would
# raise KeyError.
train_rf_x = train_rf_x.drop("Id", axis=1, errors="ignore")
# Ids are cast back to int and both test frames get a fresh 0..n-1 index.
Submission_id = pd.DataFrame(test_x["Id"]).astype(int).reset_index(drop=True)
test_rf_x = test_x.drop("Id", axis=1).reset_index(drop=True)

In [None]:
# Sanity-check the shapes of the modelling inputs. train_rf_x and test_rf_x
# are the two feature frames that get normalized; train_y and Submission_id
# are printed alongside them for comparison.
for checked_frame in (train_rf_x, train_y, test_rf_x, Submission_id):
    print(checked_frame.shape)

In [None]:
# Transform features only, so no inverse_transform back is needed.
from sklearn.preprocessing import StandardScaler

# The score columns are the label slice P1..P37 of the training features.
score_columns = train_rf_x.loc[:, "P1":"P37"].columns

train_score_scaler = StandardScaler()
# The scaler is fitted on the training rows only and then reused for the
# test rows. Fitting a second scaler on the test set would put train and
# test features on different scales, so the model would be evaluated on
# inputs that do not mean the same thing as the ones it was trained on.
# The name test_score_scaler is kept as an alias for any later cell that
# still refers to it.
test_score_scaler = train_score_scaler

# Explicit index= so the write-back below aligns row for row.
normalize_train = pd.DataFrame(
    train_score_scaler.fit_transform(train_rf_x[score_columns]),
    columns=score_columns,
    index=train_rf_x.index,
)
normalize_test = pd.DataFrame(
    train_score_scaler.transform(test_rf_x[score_columns]),
    columns=score_columns,
    index=test_rf_x.index,
)

# z-score transformation for train_rf_x & test_rf_x
train_rf_x.loc[:, score_columns] = normalize_train
test_rf_x.loc[:, score_columns] = normalize_test

# Sanity checks: no missing values introduced, and the write-back took effect.
print(test_rf_x.isnull().sum().sum())
print(train_rf_x.loc[:, score_columns].equals(normalize_train))

In [None]:
# Two alternative transformations of the target (train_y):
#   normalize_train_y - z-scored revenue
#   log_transform_y   - natural log of revenue
train_y_scaler = StandardScaler()
scaled_target = train_y_scaler.fit_transform(train_y)
normalize_train_y = pd.DataFrame(scaled_target, columns=train_y.columns)
log_transform_y = np.log(train_y)

# Feature Engineering Part 1: Remove Low-Correlation Features

In [None]:
def plot_correlation_heatmap(df):
    """Show a lower-triangle heatmap of the pairwise correlations of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column-to-column correlations (``df.corr()``) are plotted.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    corr = df.corr()

    sns.set(style="white")
    # Hide the upper triangle (diagonal included): it mirrors the lower one.
    # The builtin bool is used because the np.bool alias was removed from
    # NumPy and raises AttributeError on current releases.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    _, ax = plt.subplots(figsize=(20, 20))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw on the axes created above rather than relying on the implicit
    # "current axes" state.
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
    plt.show()

In [None]:
# Put the log-transformed target next to the training features so the
# feature-to-target correlations can be computed from a single frame.
features_train = pd.concat(objs=[train_rf_x, log_transform_y], axis="columns")

In [None]:
# Columns shown in the heatmap: the score columns P1..P37 followed by the target.
heat_map_columns = ["P%d" % score_number for score_number in range(1, 38)] + ["revenue"]

In [None]:
# Heatmap of the P-score columns together with the (log) revenue target.
plot_correlation_heatmap(features_train.loc[:, heat_map_columns])

In [None]:
# Rank every feature by the absolute value of its correlation with the
# target column revenue only. "revenue" is the last column of
# features_train, so dropping the final entry removes its self-correlation.
revenue_corr = features_train.corr()["revenue"]
features_train_revenue_corr = revenue_corr.iloc[:-1].abs().sort_values(ascending=False)
# Keep the features whose absolute correlation exceeds 0.04.
revenue_corr_filter = features_train_revenue_corr.loc[features_train_revenue_corr > 0.04]
print(features_train_revenue_corr)

In [None]:
# Names of the features that passed the correlation filter (the index of
# the filtered correlation Series holds the feature names).
revenue_corr_filter_columns = revenue_corr_filter.index

In [None]:
# Restrict both feature frames to the columns that passed the correlation
# filter, in the same column order.
train_rf_x_engine = train_rf_x.loc[:, revenue_corr_filter_columns]
test_rf_x_engine = test_rf_x.loc[:, revenue_corr_filter_columns]

# Random Forest Regressor: Grid Search Using Only the Significantly Correlated Features and the Log-Transformed Target (log_transform_y)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Larger search space for the random-forest grid search.
#
# Portability notes:
# - "criterion" is not listed. Squared error is the regressor's default, and
#   the old spelling "mse" is rejected by current scikit-learn releases,
#   while the new spelling "squared_error" is unknown to old ones.
# - max_features "auto" (all features, for a regressor) is likewise rejected
#   by current releases; the float 1.0 selects the same thing (100% of the
#   features) and is accepted by old and new releases alike.
hyperparameters = {"max_depth": [None],   # Use for regularization, prevent overfitting
                   "max_features": [1.0, "log2", "sqrt", 0.5], # Use for regularization, prevent overfitting
                   "min_samples_split": [2,4], # Use for regularization, prevent overfitting
                   "n_estimators": [300,400,500,600,700,800,900,1000],
                   "oob_score": [True,False],
                  }

In [None]:
# Smaller search space for the random-forest grid search.
#
# Portability notes:
# - "criterion" is not listed. Squared error is the regressor's default, and
#   the old spelling "mse" is rejected by current scikit-learn releases,
#   while the new spelling "squared_error" is unknown to old ones.
# - max_features "auto" (all features, for a regressor) is likewise rejected
#   by current releases; the float 1.0 selects the same thing (100% of the
#   features) and is accepted by old and new releases alike.
hyperparameters2 = {"max_depth": [None],   # Use for regularization, prevent overfitting
                   "max_features": [1.0, "log2", "sqrt"], # Use for regularization, prevent overfitting
                   "min_samples_split": [2,4], # Use for regularization, prevent overfitting
                   "n_estimators": [30,60,80,100,200,300,400,500] # Many ensemble trees reduces overfitting
                  }

In [None]:
# Fixed seed: random forests are stochastic, so without it the selected
# parameters and scores change on every Restart & Run All.
cls = RandomForestRegressor(random_state=42)
# Exhaustive search over hyperparameters2 with 6-fold cross-validation.
grid = GridSearchCV(cls, param_grid=hyperparameters2, cv=6)
# log_transform_y is a one-column DataFrame; it is flattened because the
# estimator expects a 1-D target.
grid.fit(train_rf_x_engine, log_transform_y.values.ravel())

In [None]:
# Pull the winning configuration, its mean cross-validated score and the
# fitted best estimator out of the grid search, then print each in turn.
best_params, best_score, best_rf = (
    grid.best_params_,
    grid.best_score_,
    grid.best_estimator_,
)
for search_result in (best_params, best_score, best_rf):
    print(search_result)

In [None]:
# Re-score the selected forest with 6-fold cross-validation on the training
# data. Despite its name, accuracy_rf is the mean of the per-fold scores,
# which for a regressor scored with the default metric is R^2, not a
# classification accuracy.
scores = cross_val_score(estimator=best_rf, X=train_rf_x_engine, y=log_transform_y, cv=6)
accuracy_rf = np.mean(scores)
print(scores, accuracy_rf, sep="\n")

In [None]:
# Score of the selected forest on the training data itself, i.e. the same
# rows the grid search was fitted on, so it is an optimistic figure.
best_rf.score(train_rf_x_engine,log_transform_y)

In [None]:
pred = best_rf.predict(test_rf_x_engine)

In [None]:
pred = np.exp(pred)

In [None]:
## Other submission style
## Creating a Submission File to submit to Kaggle competition ##
# The ids are re-read from test.csv and paired positionally with the predictions.
testData = pd.read_csv("test.csv")
submission = testData[["Id"]].assign(Prediction=pred)
submission.to_csv(
    'RandomForestSimple_log_transform_feature_engine_8th_trial.csv',
    header=True,
    index=False,
)