In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and echo the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# All competition CSVs live in one read-only directory; keep that path in a
# single constant so it can be changed without editing every read.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

sales_data = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
item_cat = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')


In [3]:
sales_data.head(20)

In [4]:
print(sales_data.shape)

In [5]:
sales_data.tail(20)

In [6]:
sales_data.dtypes

**From the tables above, it can be seen that the data spans 2013 to 2015.**

In [7]:
# Parse the day.month.year strings into real timestamps, then order the rows
# chronologically (the original row index is kept, not reset).
sales_data["date"] = pd.to_datetime(sales_data["date"], format='%d.%m.%Y')
sales_data = sales_data.sort_values(by="date")
print(sales_data)

In [8]:
sales_data.describe()

In [9]:
sales_data.item_cnt_day.unique()

In [10]:
sales_data.item_cnt_day.max()

In [11]:
sales_data.item_cnt_day.min()

In [12]:
sales_data["Year"] = sales_data["date"].dt.year
sales_data["Month"] = sales_data["date"].dt.month

In [13]:
# Replace the numeric month with its abbreviation ('Jan', 'Feb', ...).
# The vectorised .dt accessor yields the same strings as formatting each
# timestamp in a Python loop, without iterating over every row.
sales_data['Month'] = sales_data['date'].dt.strftime('%b')
sales_data

In [14]:
sales_data['Sales_per_item'] = sales_data['item_cnt_day'] * sales_data['item_price']
sales_data

In [15]:
# Attach each sale's category by looking its item_id up in the items table.
# Assigning item_cat['item_category_id'] directly aligns on the row index, so
# a sale would receive the category whose row number happens to equal its own
# index label (and NaN once the index exceeds the category table's length).
# NOTE(review): relies on items.csv having `item_id` and `item_category_id`
# columns, as described in the competition's data documentation.
item_to_category = items.set_index('item_id')['item_category_id']
sales_data['item_category_id'] = sales_data['item_id'].map(item_to_category)
sales_data

In [16]:
# Daily units sold against calendar year, drawn on an explicitly created axes.
fig, axes = plt.subplots(figsize=(7, 7))
sns.lineplot(data=sales_data, x='Year', y='item_cnt_day', ax=axes)
plt.show()

In [17]:
# Daily units sold against month of the year.
fig, axes = plt.subplots(figsize=(7, 7))
sns.lineplot(data=sales_data, x='Month', y='item_cnt_day', ax=axes)
plt.show()

In [18]:
# Revenue per sales row (units * price) against calendar year.
fig, axes = plt.subplots(figsize=(7, 7))
sns.lineplot(data=sales_data, x='Year', y='Sales_per_item', ax=axes)
plt.show()

# How many items are sold by each shop


In [19]:
# Total units sold per shop, shown as one bar per shop_id.
shop_sum = sales_data.groupby('shop_id')['item_cnt_day'].sum().reset_index()
fig, axes = plt.subplots(figsize=(20, 8))
sns.barplot(data=shop_sum, x="shop_id", y="item_cnt_day", ax=axes)
plt.show()

# Category-wise selling

In [20]:
# Total units sold per item category.
Category_sum = sales_data.groupby('item_category_id')['item_cnt_day'].sum().reset_index()
fig, axes = plt.subplots(figsize=(35, 8))
sns.barplot(data=Category_sum, x="item_category_id", y="item_cnt_day", ax=axes)
plt.show()

In [21]:
# Total revenue per item category (reuses the Category_sum name from the
# units-per-category cell).
Category_sum = sales_data.groupby('item_category_id')['Sales_per_item'].sum().reset_index()
fig, axes = plt.subplots(figsize=(35, 8))
sns.barplot(data=Category_sum, x="item_category_id", y="Sales_per_item", ax=axes)
plt.show()

In [22]:
# Month-by-shop table of total units sold; shop/month pairs with no sales
# become 0 instead of NaN.
sales_data_tmp = sales_data[['date_block_num', 'shop_id', 'item_cnt_day']]
dt = (
    sales_data_tmp
    .pivot_table(index='date_block_num', columns='shop_id',
                 values='item_cnt_day', aggfunc='sum')
    .fillna(0)
    .rename_axis(index='No. of Months', columns='No. of Shops')
)
dt

# The above table shows the monthly sales of the different shops.

In [23]:
fig,axes = plt.subplots(1,figsize=(24,9))

# 'Month' holds abbreviations ('Jan', 'Feb', ...), which a pivot sorts
# alphabetically (Apr, Aug, Dec, ...). Rebuild the calendar order with the
# same strftime('%b') formatting so the x axis runs January -> December.
month_order = pd.date_range('2000-01-01', periods=12, freq='MS').strftime('%b')

# One line per year: total units sold in each calendar month.
# The string aggregator "sum" avoids the deprecation warning pandas raises
# for np.sum while computing the same totals.
sales_data_tmp = (
    sales_data[['Year','Month','item_cnt_day']]
    .pivot_table(index=['Month'],columns=['Year'],aggfunc={"item_cnt_day":"sum"})
    .reindex(month_order)
    .rename_axis('Month')
)

axes.plot(sales_data_tmp)
axes.set_title('Total no of units sold')
# Columns are (value name, year) pairs; label each line with its year.
axes.legend(labels=[i[1] for i in sales_data_tmp.columns])
plt.suptitle('Monthly Sales',fontsize="28")
plt.show()



In [24]:
fig,axes = plt.subplots(1,figsize=(24,9))

# 'Month' holds abbreviations ('Jan', 'Feb', ...), which a pivot sorts
# alphabetically. Rebuild the calendar order with the same strftime('%b')
# formatting so the x axis runs January -> December.
month_order = pd.date_range('2000-01-01', periods=12, freq='MS').strftime('%b')

# One line per year: total revenue (units * price) in each calendar month.
sales_data_tmp = (
    sales_data[['Year','Month','Sales_per_item']]
    .pivot_table(index=['Month'],columns=['Year'],aggfunc={"Sales_per_item":"sum"})
    .reindex(month_order)
    .rename_axis('Month')
)
axes.plot(sales_data_tmp)
# This chart plots Sales_per_item (revenue), not unit counts, so the title
# must not repeat the one from the units chart.
axes.set_title('Total sales revenue')
# Columns are (value name, year) pairs; label each line with its year.
axes.legend(labels=[i[1] for i in sales_data_tmp.columns])
plt.suptitle('Monthly Sales',fontsize="28")
plt.show()

**There are various ways to check the stationarity of the data:**
1. Through visualization
2. Dickey-Fuller test
3. Constant mean and variance

In [25]:
s=pd.Series(sales_data.item_cnt_day)
# Split the first 200,000 observations into three consecutive chunks.
# A slice excludes its stop position, so each chunk must start exactly where
# the previous one stopped; starting at stop+1 would silently drop the
# elements at positions 70000 and 140000. .iloc makes the positional intent
# explicit because the index labels are no longer in row order after sorting.
s1=s.iloc[0:70000]
s2=s.iloc[70000:140000]
s3=s.iloc[140000:200000]
print(s3)

In [26]:
ad_test=adfuller(s3)

In [27]:
ad_test[0]

In [28]:
ad_test[1]

In [29]:
x=sales_data['item_cnt_day']
print(x)

In [30]:
item_cnt_day_detrend=x.diff()

In [31]:
plt.plot(item_cnt_day_detrend)

In [32]:
sales_data["item_cnt_day_detrend"]=item_cnt_day_detrend
sales_data

In [33]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [34]:

sales_data_tmp=sales_data[['date','item_cnt_day']]
# Additive decomposition: observed = trend + seasonal + residual.
# (The variable keeps its historical name `result_mul`, but the model is additive.)
result_mul = seasonal_decompose(sales_data_tmp['item_cnt_day'],period=8 ,model='additive',extrapolate_trend='freq')

# With an additive model the seasonal component is removed by subtraction.
# Dividing is only valid for a multiplicative model, and here it would blow up
# wherever the seasonal term is close to zero.
deseasonalized = sales_data.item_cnt_day.values - result_mul.seasonal

plt.plot(deseasonalized)

plt.title('Deseasonalized', fontsize=16)
# plt.plot() with no arguments draws nothing; show the finished figure instead.
plt.show()

# Outlier Detection by IQR

In [35]:
sales_data.describe()

In [36]:
y=sorted(sales_data['Sales_per_item'])

In [37]:
q1,q3=np.percentile(sales_data['Sales_per_item'],[25,75])
print(q1)
print(q3)

In [38]:
IQR=q3-q1
print(IQR)

In [39]:
low_value=q1-(1.5*IQR)
high_value=q3+(1.5*IQR)

print(low_value, high_value)

In [40]:
sales_data[(sales_data.Sales_per_item<low_value)|(sales_data.Sales_per_item>high_value)]

# Outlier Removal

In [41]:
# Keep every row inside the IQR fences. between() is inclusive on both ends, so
# the kept set is the exact complement of the outlier filter (< low or > high);
# strict comparisons would also drop rows sitting exactly on a fence.
# .copy() makes this an independent frame, so later column assignments do not
# write into a slice of sales_data (SettingWithCopyWarning).
New_sales_data=sales_data[sales_data.Sales_per_item.between(low_value, high_value)].copy()

In [42]:
New_sales_data

In [43]:
print("Old_data:",sales_data.shape)
print("New_data:",New_sales_data.shape)

In [44]:
# Displays the frame without 'item_category_id'. The result is not assigned,
# so New_sales_data itself keeps the column (later cells still group by it).
New_sales_data.drop(['item_category_id'], axis=1)

In [45]:
lag_list = [1, 2, 3]

# Work on an independent frame so the new columns are really written to
# New_sales_data rather than to a temporary slice of sales_data.
New_sales_data = New_sales_data.copy()

# Fill missing categories once, BEFORE grouping. Filling inside the loop
# (after the first groupby) meant lag 1 was computed with NaN-category rows
# excluded from every group, while lags 2 and 3 saw them as category 0.
New_sales_data['item_category_id'] = New_sales_data['item_category_id'].fillna(0)

# Order rows chronologically so shift(k) really means "k sales earlier" for
# the same shop/item. Sorting by the month block alone leaves the order of
# rows within a month unspecified.
counts_by_series = (
    New_sales_data
    .sort_values('date', kind='stable')
    .groupby(['shop_id', 'item_category_id', 'item_id'])['item_cnt_day']
)

for lag in lag_list:
    ft_name = ('item_cnt_shifted%s' % lag)
    # Assign the filled result directly: calling fillna(inplace=True) on a
    # selected column is chained assignment and may leave the frame unchanged.
    # Rows without `lag` earlier sales in their group get 0.
    New_sales_data[ft_name] = counts_by_series.shift(lag).fillna(0)

New_sales_data

In [46]:
New_sales_data=New_sales_data.rename(columns={'item_cnt_day':'item_cnt_month'})
New_sales_data

In [47]:
New_sales_data.head().T

In [48]:
train_df=New_sales_data[['shop_id','item_id','date_block_num','Sales_per_item','item_cnt_month','item_cnt_shifted3','item_cnt_shifted2']]
train_df

In [49]:
# Attach training history to each test (shop_id, item_id) pair. A left merge
# keeps every test row; a pair with several training rows is repeated once per
# matching row, and a pair with none gets NaN in all train_df columns.
dataset=pd.merge(test_data,train_df, on=['shop_id','item_id'], how='left')
print(dataset)
# Drop the ID and the train-derived value columns, then replace every remaining
# NaN with 34. Presumably only shop_id, item_id and date_block_num remain, so
# this marks pairs without history as date_block_num == 34 — TODO confirm.
test_dataset=dataset.drop(['ID','item_cnt_month','Sales_per_item','item_cnt_shifted3','item_cnt_shifted2'], axis=1).fillna(34)
# This expression's result is discarded (it is not the last statement of the cell).
test_dataset[test_dataset['date_block_num']==34]

# Keep the dropped columns so they can be re-attached below.
co1=dataset["Sales_per_item"]
co2=dataset["item_cnt_shifted3"]
co3=dataset["item_cnt_shifted2"]
co4=dataset['ID']


# join() aligns on the row index, which test_dataset shares with `dataset`.
test_dataset=test_dataset.join(co1)
test_dataset=test_dataset.join(co2)
test_dataset=test_dataset.join(co3)
test_dataset=test_dataset.join(co4)


# For every shop and every column, replace NaN with that shop's median of the
# column. If a shop has no non-null value in a column the median is NaN, so
# those cells stay NaN.
for shop_id in test_dataset['shop_id'].unique(): 
    for column in test_dataset.columns: 
        shop_median = test_dataset[(test_dataset['shop_id'] == shop_id)][column].median()
        test_dataset.loc[(test_dataset[column].isnull()) & (test_dataset['shop_id'] == shop_id), column] = shop_median
        
# Keep only rows whose date_block_num is 34.
# NOTE(review): if training blocks never reach 34, these are exactly the test
# pairs that had NO training history (filled with 34 above), so test pairs
# that do have history are excluded from unseen_data — confirm this is intended.
unseen_data=test_dataset[test_dataset['date_block_num']==34]

In [50]:
unseen_data

In [51]:
test_data=unseen_data.drop(['ID'], axis=1)
test_data

In [52]:
X = train_df.drop('item_cnt_month',axis='columns')
y = train_df.item_cnt_month


# Modelling

# Random Forest

In [53]:
from sklearn.model_selection import train_test_split
X_train, X_Valid, y_train, y_Valid = train_test_split(X,y,test_size=0.2, random_state=49)

# Randomized search CV

In [54]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
model = RandomForestRegressor()

from sklearn.model_selection import RandomizedSearchCV

In [55]:
para={
    "n_estimators": [40,60,80],
    "min_samples_split": [2,5,10],
    "max_depth": [8,10,12],
    "min_samples_leaf": [1,2,4],
}

In [56]:
# search=RandomizedSearchCV(estimator=model,param_distributions=para,n_iter=5,cv=5,verbose=5, n_jobs=-1)
# search.fit(X_train, y_train)

In [57]:
# search.best_params_

In [58]:
# Random forest with fixed hyper-parameters.
# random_state pins the bootstrap sampling so the reported RMSE and the
# pickled model are reproducible under Restart & Run All; n_jobs=-1 only
# parallelises tree building and does not change the fitted model.
model = RandomForestRegressor(
    n_estimators=80,
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth=10,
    random_state=49,
    n_jobs=-1,
)
model.fit(X_train, y_train)

In [59]:
train_pred = model.predict(X_train)
Valid_pred = model.predict(X_Valid)
test_pred=model.predict(test_data)

In [60]:
# The second score is computed on the held-out validation split (X_Valid), not
# on the competition test set, so it is labelled as validation.
print('Train rmse:', np.sqrt(mean_squared_error(y_train, train_pred)))
print('Validation rmse:', np.sqrt(mean_squared_error(y_Valid, Valid_pred)))

In [61]:
U= train_df.drop('item_cnt_month',axis='columns')
V= train_df.item_cnt_month

In [62]:
from sklearn.model_selection import train_test_split
U_train, U_valid, V_train, V_valid = train_test_split(U,V,test_size=0.2,random_state=50)

# Hyperparameter Tuning by Randomized Search CV

In [63]:
params={
 "learning_rate"    : [0.10, 0.20, 0.30 ] ,
 "max_depth"        : [ 10, 12, 15],
 "min_child_weight" : [ 3,5, 7 ],
 "subsample"        : [0.2, 0.4, 0.6],
 "gamma"            : [ 0.3, 0.4, 0.5 ],
}

In [64]:
from xgboost import XGBRegressor
from xgboost import plot_importance
Xg_model=XGBRegressor()

In [65]:
from sklearn.model_selection import RandomizedSearchCV

In [66]:
# random_search=RandomizedSearchCV(estimator=Xg_model,param_distributions=params,n_iter=5,cv=5,verbose=3)
# random_search.fit(U_train, V_train)

In [67]:
# random_search.best_params_

# XGBoost

In [68]:
Xg_model=XGBRegressor(subsample=0.6,min_child_weight=5,max_depth=10,learning_rate=0.3, gamma=0.4)
Xg_model.fit(U_train, V_train)

In [69]:
Xg_train_pred=Xg_model.predict(U_train)
Xg_Valid_pred=Xg_model.predict(U_valid)
Xg_test_pred=Xg_model.predict(test_data)

In [70]:
from sklearn.metrics import mean_squared_error

In [71]:
# The second score is computed on the held-out validation split (U_valid), not
# on the competition test set, so it is labelled as validation.
print('Train rmse:', np.sqrt(mean_squared_error(V_train, Xg_train_pred)))
print('Validation rmse:', np.sqrt(mean_squared_error(V_valid, Xg_Valid_pred)))

In [72]:
plt.rcParams["figure.figsize"] = (15, 6)
plot_importance(Xg_model)
plt.show()

# Linear Regression

In [143]:
from sklearn.model_selection import train_test_split
X_train, X_Valid, y_train, y_Valid = train_test_split(X,y,test_size=0.2, random_state=50)

In [144]:
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression(n_jobs=-1)
LR_model.fit(X_train, y_train)

In [145]:
LR_train_pred = LR_model.predict(X_train)
LR_valid_pred = LR_model.predict(X_Valid)
LR_test_pred  = LR_model.predict(test_data)

In [146]:
# The second score is computed on the held-out validation split (X_Valid), not
# on the competition test set, so it is labelled as validation.
print('Train rmse:', np.sqrt(mean_squared_error(y_train, LR_train_pred)))
print('Validation rmse:', np.sqrt(mean_squared_error(y_Valid, LR_valid_pred)))

In [149]:
# One row per unseen test ID, with each model's prediction side by side.
DF = pd.DataFrame({'ID': unseen_data['ID']})
DF = DF.assign(RF=test_pred, Xg=Xg_test_pred, LR=LR_test_pred)
DF

In [151]:
import pickle

# Persist the three fitted models to the working directory.
# A context manager guarantees each file is flushed and closed even if
# pickling fails; a bare open() left the handles to the garbage collector.
for fitted_model, path in (
    (model, './model.pkl'),
    (Xg_model, './Xg_model.pkl'),
    (LR_model, './LR_model.pkl'),
):
    with open(path, 'wb') as fh:
        pickle.dump(fitted_model, fh)