## Data Preprocessing

In [1]:
# Importing Necessary Packages
import warnings
warnings.filterwarnings("ignore")

import time
from datetime import date, timedelta, datetime
import numpy as np
import pandas as pd
import statistics as st
import scipy.stats as ss
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import datetime
from pandas import Series,DataFrame

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns # advanced vizs
%matplotlib inline

## Setting Up Logging

In [2]:
import logging
import logging.handlers
import os

# Configure the root logger to write to a file.
# The destination and verbosity can be overridden through the LOGFILE and
# LOGLEVEL environment variables.
log_path = os.environ.get("LOGFILE", "../logs/prediction.log")

# WatchedFileHandler opens the file as soon as it is created, so the target
# directory has to exist beforehand.
log_dir = os.path.dirname(log_path)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

handler = logging.handlers.WatchedFileHandler(log_path)
formatter = logging.Formatter(logging.BASIC_FORMAT)
handler.setFormatter(formatter)
root = logging.getLogger()
root.setLevel(os.environ.get("LOGLEVEL", "INFO"))

# Re-running this cell must not stack up handlers for the same file, otherwise
# every record would be written several times.
for existing in list(root.handlers):
    if (isinstance(existing, logging.handlers.WatchedFileHandler)
            and existing.baseFilename == handler.baseFilename):
        root.removeHandler(existing)
        existing.close()
root.addHandler(handler)
logging.info("Testing Loggings")

# The former `exit(main())` wrapper was removed: no `main` is defined here,
# so it always logged a NameError and then called exit(1).

## Creating the Data-Loading Class and Its Object

In [3]:
# Class to load the data
class FetchData():
    """Load the project's CSV datasets and offer small exploration helpers.

    On construction the files ``train.csv``, ``test.csv``,
    ``sample_submission.csv`` and ``store.csv`` are read from ``../data`` and
    cached in ``self.dfdict`` under the keys ``'train'``, ``'test'``,
    ``'sample'`` and ``'store'``. A file that cannot be read is cached as
    ``None`` (the error is logged).

    Every helper that takes a dataset accepts either a DataFrame or one of the
    ``dfdict`` keys; a key is looked up in the cache.
    """

    def __init__(self):
        self.dfdict = {}
        self.dfdict['train'] = self.get_train_data()
        self.dfdict['test'] = self.get_test_data()
        self.dfdict['sample'] = self.get_sample_data()
        self.dfdict['store'] = self.get_store_data()

    def _read_csv(self, name):
        """Read ``../data/<name>.csv``; return the DataFrame, or None on failure."""
        filename = f'../data/{name}.csv'
        try:
            df = pd.read_csv(filename)
            logging.info(f"{name} Dataset read successfully")
            return df

        except Exception as e:
            logging.exception(f" Exception occured in reading dataset, {e}")
            return None

    def _resolve(self, dataset):
        """Return ``dataset`` itself, or the cached frame when it is a ``dfdict`` key."""
        if isinstance(dataset, str):
            # getattr keeps this usable on instances created without __init__
            return getattr(self, 'dfdict', {}).get(dataset)
        return dataset

    def get_train_data(self, name='train'):
        """Read ``../data/<name>.csv`` (default ``train``); None if it cannot be read."""
        return self._read_csv(name)

    def get_test_data(self, name='test'):
        """Read ``../data/<name>.csv`` (default ``test``); None if it cannot be read."""
        return self._read_csv(name)

    def get_store_data(self, name='store'):
        """Read ``../data/<name>.csv`` (default ``store``); None if it cannot be read."""
        return self._read_csv(name)

    def get_sample_data(self, name='sample_submission'):
        """Read ``../data/<name>.csv`` (default ``sample_submission``); None on failure."""
        return self._read_csv(name)

    def show_sample_data(self, dataset='train', sample=5):
        """Return the first ``sample`` rows of ``dataset``, or None on error.

        ``dataset`` is a DataFrame or a ``dfdict`` key.
        """
        try:
            return self._resolve(dataset).head(sample)

        except Exception as e:
            logging.exception(f" Exception occured in getting sample data of a dataset, {e}")
            return None

    def find_null_values(self, dataset='train'):
        """Return the per-column count of null values of ``dataset``, or None on error.

        ``dataset`` is a DataFrame or a ``dfdict`` key.
        """
        try:
            null_counts = self._resolve(dataset).isnull().sum()
            logging.info("Getting Null values, Execution successfuly")
            return null_counts

        except Exception as e:
            logging.exception(f" Exception in getting Null values, {e}")
            return None

    def get_start_end_date(self, dataset_name='train', date_column='Date'):
        """Return ``(min, max)`` of ``date_column``; ``(None, None)`` on error.

        ``dataset_name`` is a DataFrame or a ``dfdict`` key.
        """
        try:
            dates = self._resolve(dataset_name)[date_column]
            start_date = dates.min()
            end_date = dates.max()
            logging.info("Getting start and End date successfully")
            return start_date, end_date

        except Exception as e:
            logging.exception(f"Exception in getting start and end date, {e}")
            return None, None

    def join_dataset(self, dataset='train', dataset1='store'):
        """Left-join ``dataset1`` onto ``dataset`` using the ``Store`` column.

        Both arguments are DataFrames or ``dfdict`` keys. The join is only
        performed when both frames contain the same number of distinct
        ``Store`` values.

        Returns:
            ``(shape, combined_frame)`` on success, ``(None, None)`` when the
            store counts differ or an error occurs, so the result can always
            be unpacked into two names.
        """
        try:
            left = self._resolve(dataset)
            right = self._resolve(dataset1)

            if left['Store'].nunique() != right['Store'].nunique():
                logging.error("The values in the dataset are not compartible")
                print("The values in the dataset are not compartible")
                return None, None

            # Joining on the shared column name keeps a single 'Store' column,
            # so no key_0 / Store_x / Store_y clean-up is needed afterwards.
            df_combined = left.merge(right, how='left', on='Store')
            # Log shapes only: interpolating the frames themselves would dump
            # their full text representation into the log file.
            logging.info(f" Joining datasets of shape {left.shape} and {right.shape} successfully")
            return df_combined.shape, df_combined

        except Exception as e:
            logging.exception(f"Exception in Joining datasets, {e}")
            return None, None

    def add_day_month_year_to_dataset(self, dataset, column_list=('day', 'month', 'year')):
        """Add one column per entry of ``column_list`` derived from ``Date``.

        ``Date`` is converted to datetime and each name in ``column_list`` is
        read from the ``.dt`` accessor (for example ``day``, ``month``,
        ``year``). The frame is modified in place and also returned.

        Returns:
            The updated DataFrame, or None when an error occurs.
        """
        try:
            frame = self._resolve(dataset)
            frame['Date'] = pd.to_datetime(frame['Date'])
            for column in column_list:
                # look the attribute up by name; `.dt.column` would ask for an
                # attribute literally called "column"
                frame[column] = getattr(frame['Date'].dt, column)
                logging.info(f"Adding {column} column to dataset successfully")
            # return only after every requested column has been added
            return frame

        except Exception as e:
            logging.exception(f"Exception occured in Adding columns in dataset, Exception:{e}")
            return None

In [4]:
# Creating FetchData Object
# The constructor reads the train, test, sample_submission and store CSV files
# from ../data and keeps them in data.dfdict.
data = FetchData()

### Loading Datasets

In [5]:
# FetchData() has already read every CSV into data.dfdict, so the frames are
# taken from that cache instead of being read from disk a second time through
# the train loader. Copies are used so that later cleaning steps leave the
# cached raw frames untouched and this cell can be re-run safely.
train_data = data.dfdict['train'].copy()
store_data = data.dfdict['store'].copy()
test_data = data.dfdict['test'].copy()
sample_submission_data = data.dfdict['sample'].copy()

In [6]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after every report.
print(f" Train Data with shape of: {train_data.shape}")
train_data.info()
print(f" Store Data with sape of {store_data.shape}")
store_data.info()
print(f" Test Data with sape of {test_data.shape}")
test_data.info()
print(f" Sample Submission with sape of {sample_submission_data.shape}")
sample_submission_data.info()

 Train Data with shape of: (1017209, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   Store          1017209 non-null  int64 
 1   DayOfWeek      1017209 non-null  int64 
 2   Date           1017209 non-null  object
 3   Sales          1017209 non-null  int64 
 4   Customers      1017209 non-null  int64 
 5   Open           1017209 non-null  int64 
 6   Promo          1017209 non-null  int64 
 7   StateHoliday   1017209 non-null  object
 8   SchoolHoliday  1017209 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 69.8+ MB
None
 Store Data with sape of (1115, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-n

### The store dataset has many missing values, and the test dataset has 11 missing values

## Handling 'NA' values
###  - Store dataset

In [7]:
# Number of missing values in each column of the store dataset
store_data.isnull().sum()

Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64

In [8]:
# Order the stores by CompetitionDistance, the column with the fewest 'NA'
# values; sort_values puts the rows with a missing distance at the end.
store_data.sort_values('CompetitionDistance')

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
515,516,a,c,20.0,,,1,35.0,2010.0,"Mar,Jun,Sept,Dec"
881,882,a,a,30.0,4.0,2013.0,0,,,
1007,1008,a,c,30.0,9.0,2010.0,0,,,
620,621,a,a,30.0,7.0,2002.0,0,,,
987,988,a,a,30.0,11.0,2012.0,0,,,
...,...,...,...,...,...,...,...,...,...,...
121,122,a,c,58260.0,4.0,2013.0,0,,,
452,453,a,c,75860.0,,,0,,,
290,291,d,a,,,,0,,,
621,622,a,c,,,,0,,,


### Handling the CompetitionDistance 'NA' values by filling with mean

In [11]:
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the attribute-accessed column is chained assignment: under pandas
# Copy-on-Write it fills a temporary object and leaves store_data unchanged.
store_data['CompetitionDistance'] = store_data['CompetitionDistance'].fillna(
    store_data['CompetitionDistance'].mean()
)

### Dealing with 'CompetitionOpenSinceMonth, CompetitionOpenSinceYear' NA values

In [12]:
# Summary statistics of the stores whose CompetitionOpenSinceYear is missing
store_data.loc[store_data['CompetitionOpenSinceYear'].isna()].describe()


Unnamed: 0,Store,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear
count,354.0,354.0,0.0,0.0,354.0,206.0,206.0
mean,560.584746,5430.634755,,,0.581921,25.480583,2011.567961
std,321.188639,7375.534157,,,0.493941,14.858261,1.638781
min,12.0,20.0,,,0.0,1.0,2009.0
25%,277.5,647.5,,,0.0,13.0,2010.0
50%,529.0,2560.0,,,1.0,27.0,2012.0
75%,852.5,7877.5,,,1.0,40.0,2013.0
max,1115.0,75860.0,,,1.0,50.0,2015.0


### Both 'CompetitionOpenSinceMonth' and 'CompetitionOpenSinceYear' are discrete (non-continuous) values, so their 'NA' entries are filled with the mode

In [16]:
# Fill each column with its most frequent value and assign the result back.
# fillna(inplace=True) on an attribute-accessed column is chained assignment
# and does not update store_data under pandas Copy-on-Write.
for column in ['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear']:
    store_data[column] = store_data[column].fillna(store_data[column].mode()[0])
store_data.sort_values(by='CompetitionOpenSinceYear')

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
545,546,a,a,580.0,1.0,2005.0,0,,,
826,827,a,c,250.0,1.0,2005.0,0,,,
759,760,a,a,560.0,1.0,2011.0,0,,,
554,555,d,a,1560.0,1.0,2014.0,1,10.0,2013.0,"Mar,Jun,Sept,Dec"
138,139,a,a,1700.0,1.0,2008.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
...,...,...,...,...,...,...,...,...,...,...
126,127,d,a,1350.0,12.0,2005.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
74,75,d,c,22440.0,12.0,2013.0,0,,,
888,889,d,a,18670.0,12.0,2005.0,0,,,
647,648,d,a,2130.0,12.0,2008.0,0,,,
