In [86]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from IPython.display import display

In [87]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import  LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math

In [88]:
# Load the sales records plus the item, item-category and shop tables.
# Paths are relative to the notebook's working directory.
# NOTE(review): items, item_cats and shops are not used in the cells shown here.
df = pd.read_csv('sales_train.csv')
items = pd.read_csv('items.csv')
item_cats = pd.read_csv('item_categories.csv')
shops = pd.read_csv('shops.csv')

In [89]:
# Preview the first rows of the sales table as loaded.
df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [90]:
# Summary statistics; `exclude` is left at its default (None).
df.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.23,890.8532,1.242641
std,9.422988,16.22697,6324.297,1729.8,2.618834
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


In [91]:
# Count missing values per column.
df.isna().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

In [92]:
# (rows, columns) of the sales table.
df.shape

(2935849, 6)

In [93]:
# Parse the day.month.year strings into datetimes, overwriting the column.
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')

In [94]:
# Re-inspect the first rows to check how `date` is displayed.
df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [95]:
# Work on a copy so later column changes do not touch `df`.
df1 = df.copy()
df1.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [96]:
# Add the calendar month taken from `date`, then drop the `date` column.
# The frame is rebuilt from `df` instead of mutating `df1` in place, so the
# cell can be re-run: the in-place version failed on a second run because
# `date` had already been deleted from `df1`.
df1 = df.assign(upload_month=df['date'].dt.month).drop(columns='date')
df1.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,upload_month
0,0,59,22154,999.0,1.0,1
1,0,25,2552,899.0,1.0,1
2,0,25,2552,899.0,-1.0,1
3,0,25,2554,1709.05,1.0,1
4,0,25,2555,1099.0,1.0,1


In [97]:
# Features are every column except the target; the target is item_cnt_day.
X = df1.drop(columns='item_cnt_day')
y = df1['item_cnt_day']
display(X.head(), y.head())

Unnamed: 0,date_block_num,shop_id,item_id,item_price,upload_month
0,0,59,22154,999.0,1
1,0,25,2552,899.0,1
2,0,25,2552,899.0,1
3,0,25,2554,1709.05,1
4,0,25,2555,1099.0,1


0    1.0
1    1.0
2   -1.0
3    1.0
4    1.0
Name: item_cnt_day, dtype: float64

In [98]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [99]:
# Estimators to compare and their display names.
# The two lists are parallel: names_reg[i] labels reg_methods[i].
reg_methods = [
    LinearRegression(),
    RandomForestRegressor(random_state=1),
]
names_reg = [
    'Linear Regression',
    'Random Forest',
]

In [100]:
def get_fits(reg_methods, X=None, y=None):
    """Fit each estimator on the training data and return the fitted estimators.

    Parameters
    ----------
    reg_methods : iterable
        Objects exposing ``fit(X, y)``. They are fitted in place, so the
        returned list holds the same objects that were passed in.
    X, y : optional
        Training features and target. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, which keeps existing
        ``get_fits(reg_methods)`` calls working.

    Returns
    -------
    list
        The fitted estimators, in the order given.
    """
    # Fall back to the notebook globals only when no data is passed explicitly.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    fits = []
    for method in reg_methods:
        # Kept from the original: reseeds the stdlib RNG before every fit.
        # NOTE(review): scikit-learn takes its randomness from `random_state`
        # / NumPy, so this presumably does not influence the fits - confirm
        # and remove if so.
        random.seed(1)
        method.fit(X, y)
        fits.append(method)
    return fits

# Fit every estimator in reg_methods on the training split.
fits = get_fits(reg_methods)

In [101]:
def get_scores(fits, names_reg, X_data, y_data):
    """Return a one-column DataFrame of mean squared errors, one row per model.

    Each fitted model in ``fits`` predicts on ``X_data`` and the predictions
    are compared with ``y_data`` via ``mean_squared_error``. Rows are labelled
    with ``names_reg``, which must line up one-to-one with ``fits``.
    """
    mse_values = [
        mean_squared_error(y_data, model.predict(X_data)) for model in fits
    ]
    return pd.DataFrame(mse_values, index=names_reg, columns=['MSE'])
    
# Score every fitted model on the held-out test split (MSE, one row per model).
scores = get_scores(fits,names_reg, X_test, y_test)
scores

Unnamed: 0,MSE
Linear Regression,12.699874
Random Forest,11.430137


In [102]:
# TODO: analyze seasonality, e.g. identify the periods in which sales were highest.