In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix 
from sklearn import metrics
%matplotlib inline

In [31]:
df_train = pd.read_csv('competitive-data-science-predict-future-sales/sales_train.csv')
df_train.head(3)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0


In [32]:
df_shops = pd.read_csv('competitive-data-science-predict-future-sales/shops.csv')
df_shops.head(3)

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2


In [33]:
df_items = pd.read_csv('competitive-data-science-predict-future-sales/items.csv')
df_items.head(3)

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40


In [34]:
df_item_categories = pd.read_csv('competitive-data-science-predict-future-sales/item_categories.csv')
df_item_categories.head(3)

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2


In [35]:
df_sample_submission = pd.read_csv('competitive-data-science-predict-future-sales/sample_submission.csv')
df_sample_submission.head(3)

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5


In [36]:
df_test = pd.read_csv('competitive-data-science-predict-future-sales/test.csv')
df_test.head(3)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233


In [37]:
df_train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.23,890.8532,1.242641
std,9.422988,16.22697,6324.297,1729.8,2.618834
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


In [38]:
df_train.isnull().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

In [39]:
df_train['date'] = pd.to_datetime(df_train['date'],format = '%d.%m.%Y')
df_train.head(3)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0


In [40]:
df_train['date'] = df_train['date'].apply(lambda x: x.strftime('%Y-%m'))

In [41]:
df_train.head(3)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01,0,59,22154,999.0,1.0
1,2013-01,0,25,2552,899.0,1.0
2,2013-01,0,25,2552,899.0,-1.0


In [42]:
df = df_train.groupby(['date','shop_id','item_id']).sum()

In [43]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date_block_num,item_price,item_cnt_day
date,shop_id,item_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01,0,32,0,884.0,6.0
2013-01,0,33,0,1041.0,3.0
2013-01,0,35,0,247.0,1.0
2013-01,0,43,0,221.0,1.0
2013-01,0,51,0,257.0,2.0
...,...,...,...,...,...
2015-10,59,22087,99,357.0,6.0
2015-10,59,22088,66,238.0,2.0
2015-10,59,22091,33,179.0,1.0
2015-10,59,22100,33,629.0,1.0


In [44]:
df = df.pivot_table(index=['shop_id','item_id'], columns='date', values='item_cnt_day', fill_value=0)
df.reset_index(inplace=True)
df.head()

date,shop_id,item_id,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
df_test = pd.merge(df_test, df, on=['shop_id','item_id'], how='left')
df_test.head(3)

Unnamed: 0,ID,shop_id,item_id,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,1,5,5320,,,,,,,,...,,,,,,,,,,
2,2,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0


In [46]:
df_test = df_test.fillna(0)

In [47]:
df_test.drop(['ID','2013-01'], axis=1, inplace=True)

In [48]:
df_test.head(3)

Unnamed: 0,shop_id,item_id,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,2013-09,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,5,5320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0


In [49]:
X = df.drop(['2015-10'], axis=1)
X

date,shop_id,item_id,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,...,2014-12,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424119,59,22154,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
424120,59,22155,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
424121,59,22162,0,0,0,0,0,0,0,0,...,0,0,9,4,1,1,0,0,1,0
424122,59,22164,0,0,0,0,0,0,0,0,...,0,0,2,1,2,0,0,1,0,0


In [50]:
y = df['2015-10']
y

0         0
1         0
2         0
3         0
4         0
         ..
424119    0
424120    0
424121    0
424122    0
424123    0
Name: 2015-10, Length: 424124, dtype: int64

In [51]:
test = df_test

In [52]:
print(X.shape)
print(test.shape)
print(y.shape)

(424124, 35)
(214200, 35)
(424124,)


Applying Linear Regression

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [54]:
regressor = LinearRegression()  
regressor.fit(X_train, y_train)

LinearRegression()

In [55]:
y_pred = regressor.predict(X_test)
y_pred

array([ 0.38234273, -0.15259537,  1.22375028, ..., -0.16670562,
        0.38967317,  2.02367528])

In [56]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.5970513255964707
Mean Squared Error: 20.663245381333123
Root Mean Squared Error: 4.545684258869408


In [57]:
y_pred_test = regressor.predict(test)
y_pred_test

array([ 2.55454759, -0.05839896,  2.02984539, ..., -0.20855032,
       -0.00980428,  0.2923379 ])

In [58]:
RFR = RandomForestRegressor(n_estimators = 100)
RFR.fit(X_train,y_train)

RandomForestRegressor()

In [59]:
y_predict = RFR.predict(X_test)
y_predict

array([0.  , 0.44, 0.15, ..., 0.  , 0.  , 0.02])

In [60]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predict))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predict))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_predict)))

Mean Absolute Error: 0.1723731211317418
Mean Squared Error: 4.5437074459180655
Root Mean Squared Error: 2.1315973930172802


In [61]:
y_predict_test = RFR.predict(test)
y_predict_test

array([0.11, 0.98, 0.65, ..., 0.03, 0.93, 0.03])

In [62]:
y_predict_test = list(map(round, y_predict_test))
y_predict_test

[0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 2,
 0,
 0,
 0,
 2,
 2,
 1,
 0,
 2,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 3,
 5,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 7,
 2,
 2,
 2,
 5,
 10,
 6,
 0,
 0,
 0,
 2,
 1,
 0,
 0,
 0,
 0,
 2,
 2,
 1,
 1,
 2,
 2,
 0,
 2,
 0,
 1,
 1,
 2,
 1,
 0,
 1,
 1,
 1,
 0,
 3,
 1,
 1,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 4,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 8,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 5,
 0,
 0,
 1,
 1,
 1,
 2,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 2,
 7,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 3,
 3,
 1,
 0,
 0,
 1,
 1,
 2,
 0,
 0,
 2,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 3,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 0,
 1,
 1,
 1,
 3,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 3,
 1,
 1,
 0,
 1,
 1,
 0,
 0,

In [63]:
df_sample_submission['item_cnt_month'] = y_predict_test
df_sample_submission.to_csv('prediction.csv', index=False)
df_sample_submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,0
1,1,1
2,2,1
3,3,0
4,4,1
