# Goal:
1. The task is to forecast the sales for every item in every shop in the testing period.
2. To apply different strategies and learn.

In [1]:
# Importing required libraries
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt

In [2]:
# Defining helper functions

# Datetime helper
def Datetimeset(DF):
    """
    Parse the string column "date" and replace it with day, month and year columns.

    The frame is modified in place: it is sorted chronologically by the parsed
    date, the columns "day", "month" and "year" are added, and the original
    "date" column is dropped.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame with a "date" column holding strings in ``DD.MM.YYYY`` format.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (not a copy).

    Raises
    ------
    KeyError
        If ``DF`` has no "date" column.
    ValueError
        If a value does not match the ``%d.%m.%Y`` format.
    """
    # The ``.dt`` accessor is provided by pandas, so no separate ``datetime``
    # import is needed inside this function.
    DF['date'] = pd.to_datetime(DF['date'], format='%d.%m.%Y')

    # In-place on purpose: callers invoke this without reassigning the result.
    DF.sort_values(by='date', inplace=True)
    DF['day'] = DF['date'].dt.day
    DF['month'] = DF['date'].dt.month
    DF['year'] = DF['date'].dt.year

    # The parsed timestamp is fully represented by the three parts above.
    DF.drop('date', axis=1, inplace=True)

    return DF


# Let's find each dataset's share of the total observations
def Percent(Tr_Observations, Test_Observations):
    """
    Print the share of the combined observations held by each dataset.

    Each share is printed as a fraction of the total (it is not multiplied
    by 100). Nothing is returned.
    """
    total_obs = Tr_Observations + Test_Observations
    train_share = Tr_Observations / total_obs
    test_share = Test_Observations / total_obs

    print('Training Dataset percentage:', train_share)
    print('Testing Dataset percentage:', test_share)

# Let's find Null values
def find_NaN_V(DF):
    """
    Summarise missing values per column.

    Returns a DataFrame indexed by the columns of ``DF`` with two columns:
    'NaN values' (count of nulls) and 'NaN percent' (nulls as a percentage
    of the rows).
    """
    null_mask = DF.isna()
    summary = {
        'NaN values': null_mask.sum(),
        'NaN percent': null_mask.mean() * 100,
    }
    return pd.DataFrame(summary)

# Let's find duplicated values and remove them
def Drop_duplicate_value(DF):
    """
    Report and remove fully duplicated rows from ``DF``.

    The duplicates are dropped in place, so the caller's frame is actually
    changed even when the return value is ignored.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame to deduplicate; the first occurrence of each row is kept.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now without duplicated rows.
    """
    # Count once; the same number is reported before and after the drop.
    duplicate_count = DF.duplicated().sum()
    print('Duplcated observations:', duplicate_count)

    # Must be in place: assigning the result of drop_duplicates() to the local
    # name would leave the caller's DataFrame untouched.
    DF.drop_duplicates(inplace=True)

    print('Droped observations:', duplicate_count)
    print('New Shape:', DF.shape)

    return DF
    

# Let's do linear regression
def Linear_Reg(Tr_Exmpls, Tr_lbls, Val_Exmpls, Val_lbls):
    """
    Fit a scikit-learn ``LinearRegression`` and print its scores.

    Parameters
    ----------
    Tr_Exmpls, Tr_lbls
        Training features and training target used to fit the model.
    Val_Exmpls, Val_lbls
        Held-out features and target used only for scoring.

    The printed values are whatever ``LinearRegression.score`` returns
    (the coefficient of determination R^2 according to the scikit-learn
    documentation). Nothing is returned.
    """
    # Only the estimator is needed; scoring goes through model.score().
    from sklearn.linear_model import LinearRegression

    model = LinearRegression().fit(Tr_Exmpls, Tr_lbls)
    print('Training Score:', model.score(Tr_Exmpls, Tr_lbls))
    print('Validation Score:', model.score(Val_Exmpls, Val_lbls))


In [3]:
# Loading Data
# Training sales records (one row per date / shop / item, with price and item_cnt_day).
Train_Data = pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")
# Test rows to forecast: ID, shop_id, item_id.
Test_Data = pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")
# Example of the submission layout: ID, item_cnt_month.
sample_sub = pd.read_csv("../input/competitive-data-science-predict-future-sales/sample_submission.csv")

# Understanding Data

In [4]:
# Preview the first rows of the sample submission
sample_sub.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [5]:
# Understanding the train and test data
# Shape (rows, columns) of each dataset
Train_Data.shape, Test_Data.shape

# Info of Data
#Train_Data.info(), Test_Data.info()

# Print the Data
#Train_Data.head()
#Test_Data.head()

((2935849, 6), (214200, 3))

**Exploring the Train Dataset**

In [6]:
# Number of distinct shop IDs and item IDs in the training data
Train_Data['shop_id'].nunique(), Train_Data['item_id'].nunique()

(60, 21807)

In [7]:
# Datetime setting
# Datetimeset modifies Train_Data in place (sorts by date, adds day/month/year,
# drops "date"); the returned frame is only displayed here.
Datetimeset(Train_Data)

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,day,month,year
49800,0,18,5823,2500.0,1.0,1,1,2013
29784,0,27,5573,849.0,1.0,1,1,2013
35476,0,7,1006,399.0,1.0,1,1,2013
8330,0,19,17707,899.0,1.0,1,1,2013
57384,0,14,19548,149.0,1.0,1,1,2013
...,...,...,...,...,...,...,...,...
2885098,33,41,21386,169.0,1.0,31,10,2015
2930981,33,21,988,199.0,1.0,31,10,2015
2885097,33,41,21377,169.0,1.0,31,10,2015
2930993,33,22,10207,1199.0,1.0,31,10,2015


**Finding percentage of Datasets before merging**

In [8]:
# Use the actual row counts instead of hard-coded numbers so the shares stay
# correct if the input files change.
Percent(len(Train_Data), len(Test_Data))

Training Dataset percentage: 0.9320010577613237
Testing Dataset percentage: 0.0679989422386763


**Let's merge the Training and Testing datasets for EDA**

In [9]:
# Left-join the training rows onto the test (shop_id, item_id) pairs, so every
# test pair is kept even when it has no matching training rows.
# The join keys are passed as a list, the form documented for `on`.
DF = pd.merge(Test_Data, Train_Data, on=['shop_id', 'item_id'], how='left')

In [10]:
# Shape (rows, columns) of the merged frame
DF.shape

(1327235, 9)

In [11]:
# Report fully duplicated rows in the merged frame
Drop_duplicate_value(DF)

Duplcated observations: 5
Droped observations: 5
New Shape: (1327230, 9)


In [12]:
# Number of distinct shop IDs and item IDs in the merged frame
DF['shop_id'].nunique(), DF['item_id'].nunique()

(42, 5100)

# Exploratory Data Analysis

1. Data Cleaning.
2. Data Preprocessing.
3. Data visualization.

1. Data Cleaning

In [13]:
# Counting NaN values per column (nothing is imputed or dropped here).
# Because the merge is a left join from Test_Data, the columns that come from
# Train_Data are NaN for test pairs that found no matching training row.
find_NaN_V(DF)

Unnamed: 0,NaN values,NaN percent
ID,0,0.0
shop_id,0,0.0
item_id,0,0.0
date_block_num,102796,7.745124
item_price,102796,7.745124
item_cnt_day,102796,7.745124
day,102796,7.745124
month,102796,7.745124
year,102796,7.745124


# Train & Validation split

In [14]:
import numpy as np

# Positional split: the first 67% of rows go to training, the rest to validation.
# Train_Data was sorted by date in Datetimeset, so this is a chronological split.
# iloc slicing is used because np.split on a DataFrame goes through
# DataFrame.swapaxes, which newer pandas versions deprecate.
split_at = int(.67 * len(Train_Data))
Train_set, Val_set = Train_Data.iloc[:split_at], Train_Data.iloc[split_at:]

In [15]:
# Shapes of the training and validation splits
Train_set.shape, Val_set.shape

((1967018, 8), (968831, 8))

In [16]:
# Inspect the start of the training split (requests up to 2000 rows)
Train_set.head(2000)

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,day,month,year
49800,0,18,5823,2500.0,1.0,1,1,2013
29784,0,27,5573,849.0,1.0,1,1,2013
35476,0,7,1006,399.0,1.0,1,1,2013
8330,0,19,17707,899.0,1.0,1,1,2013
57384,0,14,19548,149.0,1.0,1,1,2013
...,...,...,...,...,...,...,...,...
74079,0,56,1869,799.0,1.0,2,1,2013
34264,0,6,17241,349.0,1.0,2,1,2013
34304,0,6,13165,249.0,1.0,2,1,2013
106266,0,35,19532,349.0,1.0,2,1,2013


# Feature and label setup, and modelling

In [17]:
# Separate the target column (item_cnt_day) from the predictors in each split
Tr_lbls = Train_set.loc[:, 'item_cnt_day']
Tr_Exmpls = Train_set.drop(columns='item_cnt_day')

Val_lbls = Val_set.loc[:, 'item_cnt_day']
Val_Exmpls = Val_set.drop(columns='item_cnt_day')

In [18]:
# Modelling
# Fit the linear-regression baseline and print its training and validation scores
Linear_Reg(Tr_Exmpls, Tr_lbls, Val_Exmpls, Val_lbls)

Training Score: 0.0016844383574042832
Validation Score: 0.00023410472981844066
