### Get data

In [1]:
import pandas as pd
df = pd.read_csv('../input/rossmann-store-sales/train.csv')
df.head(4)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1


In [2]:
df.shape

(1017209, 9)

### Feature Engineering

In [3]:
df['IsWeekEnd'] = df['DayOfWeek'] > 5

In [4]:
# split date column into new columns
df[['Year', 'Month', 'Day']] = (df['Date'].str.split('-', expand=True))
df.drop('Date', axis=1, inplace=True)

In [5]:
# drop the Customers column
df.drop('Customers', axis=1, inplace=True)

In [6]:
df.head(4)

Unnamed: 0,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,IsWeekEnd,Year,Month,Day
0,1,5,5263,1,1,0,1,False,2015,7,31
1,2,5,6064,1,1,0,1,False,2015,7,31
2,3,5,8314,1,1,0,1,False,2015,7,31
3,4,5,13995,1,1,0,1,False,2015,7,31


In [7]:
df['StateHoliday'].unique()

array(['0', 'a', 'b', 'c', 0], dtype=object)

In [8]:
# Convert categorical Variables to numerical
df['StateHoliday'] = df['StateHoliday'].map({0:0,'0':0,'a':1,'b':2,'c':3})
df['StateHoliday'].unique()

array([0, 1, 2, 3])

### Get Independent and Dependent Variables

In [10]:
X = df.drop('Sales', axis=1)
y = df['Sales']

#### Get Data for Validation

In [11]:
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

### Model Building

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.naive_bayes import BernoulliNB

model = RandomForestRegressor()

### Model Evaluation

In [13]:
# train the model
model.fit(train_X, train_y)

RandomForestRegressor()

In [14]:
# make predictions on validation data
y_pred = model.predict(val_X)

In [15]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

def get_metrics(y_pred, y_val):
    r2 = r2_score(y_pred, y_val)
    mae = mean_absolute_error(y_pred, y_val)
    mse = mean_squared_error(y_pred, y_val)
    return r2, mae, mse

In [16]:
# evaluate results
import numpy as np
r2, mae, mse = get_metrics(y_pred, val_y)

print('R-Squared: %.2f%%'%(r2*100))
print('Mean Absolute Error: %.4f'%mae)
print('Mean Squared Error: %.4f'%mse)
print('Root Mean Squared Error: %.4f'%np.sqrt(mse))

R-Squared: 82.19%
Mean Absolute Error: 825.8234
Mean Squared Error: 2216580.5145
Root Mean Squared Error: 1488.8185


In [17]:
print('Mean Sales Value:', train_y.mean())

Mean Sales Value: 5773.136521406307
