# Goal:
1. Forecast the sales of every item in every shop for the testing period.
2. Apply different forecasting strategies and learn from the results.

In [1]:
# Importing required libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt

In [2]:
# Import customized py library
import customized_py_library as CL

In [3]:
# Loading Data
# All three competition files live in the same Kaggle input folder; keeping the
# folder in one place means a relocated dataset needs a single edit.
INPUT_DIR = "../input/competitive-data-science-predict-future-sales"

Train_Data = pd.read_csv(f"{INPUT_DIR}/sales_train.csv")
Test_Data = pd.read_csv(f"{INPUT_DIR}/test.csv")
sample_sub = pd.read_csv(f"{INPUT_DIR}/sample_submission.csv")

# Understanding Data

In [4]:
# Preview the first rows of the sample submission to see the expected output format
sample_sub.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [5]:
# Understanding the train and test data
# Shape of each dataset; this tuple is the value the cell displays
Train_Data.shape, Test_Data.shape

# Optional: info of the data (uncomment to run)
#Train_Data.info(), Test_Data.info()

# Optional: print the first rows of the data (uncomment to run)
#Train_Data.head()
#Test_Data.head()

((2935849, 6), (214200, 3))

**Exploring the Train Dataset**

In [6]:
# Number of distinct shops and distinct items present in the training data
Train_Data.shop_id.nunique(), Train_Data.item_id.nunique()

(60, 21807)

In [7]:
# Datetime setting
# NOTE(review): CL.Datetimeset is a project helper whose source is not shown here.
# Judging by this cell's output and the following Train_Data.head(), it appears to
# add day/month/year columns to Train_Data in place — confirm in
# customized_py_library, and check whether re-running this cell is safe.
CL.Datetimeset(Train_Data)

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,day,month,year
49800,0,18,5823,2500.0,1.0,1,1,2013
29784,0,27,5573,849.0,1.0,1,1,2013
35476,0,7,1006,399.0,1.0,1,1,2013
8330,0,19,17707,899.0,1.0,1,1,2013
57384,0,14,19548,149.0,1.0,1,1,2013
...,...,...,...,...,...,...,...,...
2885098,33,41,21386,169.0,1.0,31,10,2015
2930981,33,21,988,199.0,1.0,31,10,2015
2885097,33,41,21377,169.0,1.0,31,10,2015
2930993,33,22,10207,1199.0,1.0,31,10,2015


**Finding the share of observations in the training and testing datasets before merging**

In [8]:
# Preview Train_Data after the datetime step (day, month and year columns are now present)
Train_Data.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,day,month,year
49800,0,18,5823,2500.0,1.0,1,1,2013
29784,0,27,5573,849.0,1.0,1,1,2013
35476,0,7,1006,399.0,1.0,1,1,2013
8330,0,19,17707,899.0,1.0,1,1,2013
57384,0,14,19548,149.0,1.0,1,1,2013


In [9]:
# Share of observations in the training vs. testing dataset.
# Row counts are taken from the frames themselves rather than typed in by hand,
# so the figures stay correct if the input data changes.
CL.Percent(len(Train_Data), len(Test_Data))

No. of Observations (Training Dataset): 0.9320010577613237
No. of Observations (Testing Dataset): 0.0679989422386763


# Exploring relevant columns before merge

In [10]:
# Print the shape and column names of each dataset.
# The calls are separate statements because Get_colname prints its report and
# returns None; combining them in a tuple only added a stray "(None, None)"
# to the cell output.
CL.Get_colname(Train_Data)
CL.Get_colname(Test_Data)

-----------
Shape:  (2935849, 8)
date_block_num
shop_id
item_id
item_price
item_cnt_day
day
month
year
-----------
Shape:  (214200, 3)
ID
shop_id
item_id


(None, None)

In [11]:
# Shop ids seen in train, shop ids seen in test, and the ids common to both
train_shops = Train_Data.shop_id.unique()
test_shops = Test_Data.shop_id.unique()
train_shops, test_shops, np.intersect1d(train_shops, test_shops)

(array([18, 27,  7, 19, 14,  8, 28, 51, 54, 42, 46, 37, 15, 41, 13, 38,  2,
        56,  6,  3, 31, 29, 35, 23, 45,  4, 53, 50, 47, 59, 25, 44, 52, 16,
        32, 22, 26, 21, 24, 30, 10, 43,  1,  0, 12,  5, 57, 58, 55, 17,  9,
        49, 39, 40, 48, 34, 33, 20, 11, 36]),
 array([ 5,  4,  6,  3,  2,  7, 10, 12, 28, 31, 26, 25, 22, 24, 21, 15, 16,
        18, 14, 19, 42, 50, 49, 53, 52, 47, 48, 57, 58, 59, 55, 56, 36, 37,
        35, 38, 34, 46, 41, 44, 39, 45]),
 array([ 2,  3,  4,  5,  6,  7, 10, 12, 14, 15, 16, 18, 19, 21, 22, 24, 25,
        26, 28, 31, 34, 35, 36, 37, 38, 39, 41, 42, 44, 45, 46, 47, 48, 49,
        50, 52, 53, 55, 56, 57, 58, 59]))

In [12]:
# Item ids seen in train, item ids seen in test, and the ids common to both
train_items = Train_Data.item_id.unique()
test_items = Test_Data.item_id.unique()
train_items, test_items, np.intersect1d(train_items, test_items)

(array([ 5823,  5573,  1006, ..., 11905,  7136, 18723]),
 array([ 5037,  5320,  5233, ..., 15757, 19648,   969]),
 array([   30,    31,    32, ..., 22164, 22166, 22167]))

**Let's merge the training and testing datasets for EDA**

In [13]:
# Attach the training history to every (shop_id, item_id) pair in the test set.
# A left join on Test_Data keeps every test row; pairs with no matching training
# rows get NaN in the columns that come from Train_Data.
DF = pd.merge(Test_Data, Train_Data, on=['shop_id', 'item_id'], how='left')

In [14]:
# Print the shape and column names of the merged frame
CL.Get_colname(DF)

-----------
Shape:  (1327235, 9)
ID
shop_id
item_id
date_block_num
item_price
item_cnt_day
day
month
year


In [15]:
# Preview the first 30 rows of the merged frame
DF.head(30)

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_price,item_cnt_day,day,month,year
0,0,5,5037,20.0,2599.0,1.0,21.0,9.0,2014.0
1,0,5,5037,22.0,2599.0,1.0,29.0,11.0,2014.0
2,0,5,5037,23.0,1999.0,1.0,20.0,12.0,2014.0
3,0,5,5037,23.0,1999.0,1.0,28.0,12.0,2014.0
4,0,5,5037,24.0,1999.0,1.0,2.0,1.0,2015.0
5,0,5,5037,24.0,1999.0,1.0,7.0,1.0,2015.0
6,0,5,5037,28.0,1299.0,1.0,29.0,5.0,2015.0
7,0,5,5037,29.0,1499.0,1.0,28.0,6.0,2015.0
8,0,5,5037,30.0,1499.0,1.0,5.0,7.0,2015.0
9,0,5,5037,31.0,1499.0,1.0,14.0,8.0,2015.0


# Exploratory Data Analysis

1. Data Cleaning.
2. Data Preprocessing.
3. Data visualization.

1. Data Cleaning

In [16]:
# Count and percentage of missing values per column of the merged frame.
# NOTE(review): the NaNs sit only in the columns that came from Train_Data, which
# presumably marks test (shop_id, item_id) pairs with no training rows in the
# left merge — confirm before imputing or dropping them.
CL.find_NaN_V(DF)

Unnamed: 0,NaN values,NaN percent
ID,0,0.0
shop_id,0,0.0
item_id,0,0.0
date_block_num,102796,7.745124
item_price,102796,7.745124
item_cnt_day,102796,7.745124
day,102796,7.745124
month,102796,7.745124
year,102796,7.745124


# Train & Test split