In [23]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
from sklearn.metrics import mean_squared_error,mean_absolute_error
from statsmodels.tools.eval_measures import rmse
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
pd.set_option('display.max_row', 100)
pd.set_option('display.max_column', 150)

import warnings
warnings.filterwarnings('ignore')

In [24]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
Submission = pd.read_csv("sampleSubmission.csv")

In [25]:
train_x = train.drop("revenue",axis=1)
train_y = pd.DataFrame(train["revenue"])

In [26]:
print(train_x.shape)
print(test.shape)

(137, 42)
(100000, 42)


In [27]:
data = pd.concat([train_x,test])
print(data.shape)
print(data.isnull().sum().sum())

(100137, 42)
0


In [28]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [29]:
def feature_engineer(df):
    df = df.astype(float,errors="ignore")
    df["Open Date"] = pd.to_datetime(df["Open Date"], format='%m/%d/%Y')
    df = df.set_index("Open Date")
    df["Open_Year"] = df.index.year
    df["Open_Month"] = df.index.month
    df = df.reset_index(drop=True)
    df = create_dummies(df,"Type")
    df = create_dummies(df,"City Group")
    df = create_dummies(df,"City")
    df = create_dummies(df,"Open_Year")
    df = create_dummies(df,"Open_Month")
    df = df.drop(["City","City Group","Type","Open_Year","Open_Month"],axis=1)
    return df

In [30]:
data_rf = feature_engineer(data)

In [31]:
train_rf_x = data_rf[:137]
test_x = data_rf[137:]
print(train_rf_x.shape)
print(test_x.shape)

(137, 139)
(100000, 139)


In [32]:
train_rf_x = train_rf_x.drop("Id",axis=1)
Submission_id = pd.DataFrame(test_x["Id"]).astype(int).reset_index(drop=True)
test_rf_x = test_x.drop("Id",axis=1).reset_index(drop=True)

In [33]:
print(train_rf_x.shape) # Perform features normalization
print(train_y.shape)
print(test_rf_x.shape) # Perform features normalization
print(Submission_id.shape)

(137, 138)
(137, 1)
(100000, 138)
(100000, 1)


In [34]:
# Transform features only, and need no inverse_transform back
from sklearn.preprocessing import StandardScaler
train_score_scaler = StandardScaler()
test_score_scaler = StandardScaler()
normalize_train = pd.DataFrame(train_score_scaler.fit_transform(train_rf_x.loc[:,"P1":"P37"]),columns = train_rf_x.loc[:,"P1":"P37"].columns)
normalize_test = pd.DataFrame(test_score_scaler.fit_transform(test_rf_x.loc[:,"P1":"P37"]),columns = test_rf_x.loc[:,"P1":"P37"].columns)
# z- score transformation for train_rf_x & test_rf_x
train_rf_x.loc[:,"P1":"P37"] = normalize_train
test_rf_x.loc[:,"P1":"P37"] = normalize_test
print(test_rf_x.isnull().sum().sum())
print(train_rf_x.loc[:,"P1":"P37"].equals(normalize_train))

0
True


In [35]:
# transformation for Train_y
train_y_scaler = StandardScaler()
normalize_train_y = pd.DataFrame(train_y_scaler.fit_transform(train_y),columns = train_y.columns)
log_transform_y = train_y.apply(np.log)

# Feature Engineering: Create N-way interaction features

In [36]:
from sklearn.preprocessing import PolynomialFeatures

In [37]:
# The interaction features can be as 2-way interaction, 3 way or more by adjusting the polynomial degree
n_way_interactions = PolynomialFeatures(2, interaction_only=True, include_bias=False)

In [38]:
n_way_interactions_columns = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11',
       'P12', 'P13', 'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P20', 'P21',
       'P22', 'P23', 'P24', 'P25', 'P26', 'P27', 'P28', 'P29', 'P30', 'P31',
       'P32', 'P33', 'P34', 'P35', 'P36', 'P37']

In [39]:
train_interactions = pd.DataFrame(n_way_interactions.fit_transform(train_rf_x[n_way_interactions_columns]),columns = n_way_interactions.get_feature_names(train_rf_x[n_way_interactions_columns].columns))
test_interactions = pd.DataFrame(n_way_interactions.fit_transform(test_rf_x[n_way_interactions_columns]),columns = n_way_interactions.get_feature_names(test_rf_x[n_way_interactions_columns].columns))

In [40]:
train_rf_x_interaction = pd.concat([train_rf_x,train_interactions],axis=1).drop(n_way_interactions_columns,axis=1)
test_rf_x_interaction = pd.concat([test_rf_x,test_interactions],axis=1).drop(n_way_interactions_columns,axis=1)
print(train_rf_x_interaction.shape)
print(test_rf_x_interaction.shape)

(137, 767)
(100000, 767)


# Train_Test_Split

In [41]:
# Train_test_split
Train_X, Test_X, Train_Y, Test_Y = train_test_split(train_rf_x, log_transform_y, test_size=0.20,random_state = 1)

# Random Forest Regressor, Grid Search with log_transformation_y

In [42]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [43]:
hyperparameters = {"criterion": ["mse"],
                   "max_depth": [None],   
                   "max_features": ["auto","log2", "sqrt",0.5], 
                   "min_samples_split": [2,4],
                   "n_estimators": [300,400,500,600,700,800,900,1000],
                   "oob_score": [True,False],
                  }

In [44]:
hyperparameters2 = {"criterion": ["mse"],
                   "max_depth": [None],   
                   "max_features": ["auto","log2", "sqrt"],
                   "min_samples_split": [2,4], 
                   "n_estimators": [30,60,80,100,200,300,400]
                  }

In [45]:
cls = RandomForestRegressor()
grid = GridSearchCV(cls,param_grid=hyperparameters2,cv=6)
grid.fit(train_rf_x_interaction, log_transform_y)

GridSearchCV(cv=6, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ['mse'], 'max_depth': [None], 'max_features': ['auto', 'log2', 'sqrt'], 'min_samples_split': [2, 4], 'n_estimators': [30, 60, 80, 100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [46]:
best_params = grid.best_params_
best_score = grid.best_score_
best_rf = grid.best_estimator_
print(best_params)
print(best_score)
print(best_rf)

{'criterion': 'mse', 'max_depth': None, 'max_features': 'log2', 'min_samples_split': 4, 'n_estimators': 80}
0.14906281208574632
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='log2', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=80, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)


In [47]:
scores = cross_val_score(best_rf, train_rf_x_interaction, log_transform_y, cv=6)
accuracy_rf = scores.mean()
print(scores)
print(accuracy_rf)

[ 0.16951943  0.03904301  0.26035376  0.15365182  0.32043636 -0.0698795 ]
0.14552081155375965


In [48]:
best_rf.score(train_rf_x_interaction,log_transform_y)

0.8109682374262545

In [49]:
pred = best_rf.predict(test_rf_x_interaction)

In [50]:
pred = np.exp(pred)

In [51]:
## Other submission style
## Creating a Submission File to submit to Kaggle competition ##
testData = pd.read_csv("test.csv")
submission = pd.DataFrame({
        "Id": testData["Id"],
        "Prediction": pred
    })
submission.to_csv('RandomForestSimple_log_interaction_features_9th_trial.csv',header=True, index=False)

# Random Forest Regressor, Grid Search with z-score Normalization_y

In [52]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [53]:
hyperparameters = {"criterion": ["mse"],
                   "max_depth": [None],   # Use for regularization, prevent overfitting
                   "max_features": ["auto","log2", "sqrt",0.5], # Use for regularization, prevent overfitting
                   "min_samples_split": [2,4], # Use for regularization, prevent overfitting
                   "n_estimators": [300,400,500,600,700,800,900,1000],
                   "oob_score": [True,False],
                  }

In [54]:
hyperparameters2 = {"criterion": ["mse"],
                   "max_depth": [None],   # Use for regularization, prevent overfitting
                   "max_features": ["auto","log2", "sqrt"], # Use for regularization, prevent overfitting
                   "min_samples_split": [2,4], # Use for regularization, prevent overfitting
                   "n_estimators": [30,60,80,100,200,300,400] # Many ensemble trees reduces overfitting
                  }

In [55]:
cls = RandomForestRegressor()
grid = GridSearchCV(cls,param_grid=hyperparameters2,cv=6)
grid.fit(train_rf_x_interaction, normalize_train_y)

GridSearchCV(cv=6, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ['mse'], 'max_depth': [None], 'max_features': ['auto', 'log2', 'sqrt'], 'min_samples_split': [2, 4], 'n_estimators': [30, 60, 80, 100, 200, 300, 400]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [56]:
best_params = grid.best_params_
best_score = grid.best_score_
best_rf = grid.best_estimator_
print(best_params)
print(best_score)
print(best_rf)

{'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 30}
0.032854049477246246
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)


In [57]:
scores = cross_val_score(best_rf, train_rf_x_interaction, normalize_train_y, cv=6)
accuracy_rf = scores.mean()
print(scores)
print(accuracy_rf)

[ 0.06955238 -0.13615583  0.26602754 -0.20222923  0.08959258 -0.32799559]
-0.0402013583190666


In [58]:
best_rf.score(train_rf_x_interaction,normalize_train_y)

0.8373624300986184

In [59]:
pred = best_rf.predict(test_rf_x_interaction)

In [60]:
pred = train_y_scaler.inverse_transform(pred)

In [61]:
## Other submission style
## Creating a Submission File to submit to Kaggle competition ##
testData = pd.read_csv("test.csv")
submission = pd.DataFrame({
        "Id": testData["Id"],
        "Prediction": pred
    })
submission.to_csv('RandomForestSimple_z-score_interaction_features_10th_trial.csv',header=True, index=False)