In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error,mean_absolute_error
from statsmodels.tools.eval_measures import rmse
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
pd.set_option('display.max_row', 100)
pd.set_option('display.max_column', 150)

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
Submission = pd.read_csv("sampleSubmission.csv")

In [3]:
train_x = train.drop("revenue",axis=1)
train_y = pd.DataFrame(train["revenue"])

In [4]:
print(train_x.shape)
print(test.shape)

(137, 42)
(100000, 42)


In [5]:
data = pd.concat([train_x,test])
print(data.shape)
print(data.isnull().sum().sum())

(100137, 42)
0


In [6]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [7]:
def feature_engineer(df):
    df = df.astype(float,errors="ignore")
    df["Open Date"] = pd.to_datetime(df["Open Date"], format='%m/%d/%Y')
    df = df.set_index("Open Date")
    df["Open_Year"] = df.index.year
    df["Open_Month"] = df.index.month
    df = df.reset_index(drop=True)
    df = create_dummies(df,"Type")
    df = create_dummies(df,"City Group")
    df = create_dummies(df,"City")
    df = create_dummies(df,"Open_Year")
    df = create_dummies(df,"Open_Month")
    df = df.drop(["City","City Group","Type","Open_Year","Open_Month"],axis=1)
    return df

In [8]:
data_rf = feature_engineer(data)

In [9]:
train_rf_x = data_rf[:137]
test_x = data_rf[137:]
print(train_rf_x.shape)
print(test_x.shape)

(137, 139)
(100000, 139)


In [10]:
train_rf_x = train_rf_x.drop("Id",axis=1)
Submission_id = pd.DataFrame(test_x["Id"]).astype(int).reset_index(drop=True)
test_rf_x = test_x.drop("Id",axis=1).reset_index(drop=True)

In [11]:
print(train_rf_x.shape) # Perform features normalization
print(train_y.shape)
print(test_rf_x.shape) # Perform features normalization
print(Submission_id.shape)

(137, 138)
(137, 1)
(100000, 138)
(100000, 1)


In [12]:
# Transform features only, and need no inverse_transform back
from sklearn.preprocessing import StandardScaler
train_score_scaler = StandardScaler()
test_score_scaler = StandardScaler()
normalize_train = pd.DataFrame(train_score_scaler.fit_transform(train_rf_x.loc[:,"P1":"P37"]),columns = train_rf_x.loc[:,"P1":"P37"].columns)
normalize_test = pd.DataFrame(test_score_scaler.fit_transform(test_rf_x.loc[:,"P1":"P37"]),columns = test_rf_x.loc[:,"P1":"P37"].columns)

In [13]:
# transformation for Train_y
train_y_scaler = StandardScaler()
normalize_train_y = pd.DataFrame(train_y_scaler.fit_transform(train_y),columns = train_y.columns)
log_transform_y = train_y.apply(np.log)

In [14]:
train_rf_x.loc[:,"P1":"P37"] = normalize_train
test_rf_x.loc[:,"P1":"P37"] = normalize_test
print(test_rf_x.isnull().sum().sum())
print(train_rf_x.loc[:,"P1":"P37"].equals(normalize_train))

0
True


# Feature Engineering Part_2: Create N-way interaction features

In [15]:
from sklearn.preprocessing import PolynomialFeatures

In [16]:
# The interaction features can be as 2-way interaction, 3 way or more by adjusting the polynomial degree
n_way_interactions = PolynomialFeatures(2, interaction_only=False, include_bias=False)

In [17]:
n_way_interactions_columns = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11',
       'P12', 'P13', 'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P20', 'P21',
       'P22', 'P23', 'P24', 'P25', 'P26', 'P27', 'P28', 'P29', 'P30', 'P31',
       'P32', 'P33', 'P34', 'P35', 'P36', 'P37']

In [18]:
train_interactions = pd.DataFrame(n_way_interactions.fit_transform(train_rf_x[n_way_interactions_columns]),columns = n_way_interactions.get_feature_names(train_rf_x[n_way_interactions_columns].columns))
test_interactions = pd.DataFrame(n_way_interactions.fit_transform(test_rf_x[n_way_interactions_columns]),columns = n_way_interactions.get_feature_names(test_rf_x[n_way_interactions_columns].columns))

In [19]:
train_rf_x_interaction = pd.concat([train_rf_x,train_interactions],axis=1).drop(n_way_interactions_columns,axis=1)
test_rf_x_interaction = pd.concat([test_rf_x,test_interactions],axis=1).drop(n_way_interactions_columns,axis=1)
print(train_rf_x_interaction.shape)
print(test_rf_x_interaction.shape)

(137, 804)
(100000, 804)


In [20]:
# Train_test_split
Train_X, Test_X, Train_Y, Test_Y = train_test_split(train_rf_x_interaction, log_transform_y, test_size=0.20,random_state = 1)

# Linear Regression

In [22]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [23]:
lr.fit(Train_X, Train_Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

# Train-Test split, RMSE comparison to identify variance problem

In [24]:
Train_Y_predictions = lr.predict(Train_X)
Test_Y_predictions = lr.predict(Test_X)

In [26]:
train_rmse = np.sqrt(mean_squared_error(Train_Y,Train_Y_predictions))
test_rmse = np.sqrt(mean_squared_error(Test_Y,Test_Y_predictions))
train_rmse = np.exp(train_rmse)
test_rmse = np.exp(test_rmse)
print(train_rmse)
print(test_rmse)

1.0000000000000042
4.5322097822390015


# Linear Regression Full Model

In [28]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [29]:
lr.fit(train_rf_x_interaction, log_transform_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [30]:
Train_Y_predictions = lr.predict(train_rf_x_interaction)

In [32]:
train_rmse = np.sqrt(mean_squared_error(log_transform_y,Train_Y_predictions))
train_rmse = np.exp(train_rmse)
print(train_rmse)

1.0000000000000107


In [51]:
Train_Y_predictions = lr.predict(test_rf_x_interaction)
pred = pd.DataFrame(np.exp(Train_Y_predictions),columns = ["pred"])

In [53]:
## Other submission style
## Creating a Submission File to submit to Kaggle competition ##
testData = pd.read_csv("test.csv")
submission = pd.DataFrame({
        "Id": testData["Id"],
        "Prediction": pred["pred"]
    })
submission.to_csv('Linear Regression Model with Polynomial Features_11th_trial.csv',header=True, index=False)