In [2]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import sys
import os

In [3]:
# make the helper modules that live in ../scripts importable from this notebook
scripts_dir = os.path.abspath('../scripts')
sys.path.append(scripts_dir)

In [23]:
from cleaner import *
from file_handler import FileHandler
from plot import *
from selector import *

### Reading Data

In [6]:
file_handler = FileHandler()

In [7]:
# reading the store csv file
store_df = file_handler.read_csv("../data/store.csv")
store_df.head(10)

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,
5,6,a,a,310.0,12.0,2013.0,0,,,
6,7,a,c,24000.0,4.0,2013.0,0,,,
7,8,a,a,7520.0,10.0,2014.0,0,,,
8,9,a,c,2030.0,8.0,2000.0,0,,,
9,10,a,a,3160.0,9.0,2009.0,0,,,


In [8]:
# reading the sales training csv file
# NOTE(review): this read appears to emit a pandas warning (see the cell output).
# Further down, StateHoliday is reported as a mixed int/str column and is
# normalised with convert_to_string - presumably the cause of the warning; confirm.
train_df = file_handler.read_csv("../data/train.csv")
train_df.head(10)

  train_df = file_handler.read_csv("../data/train.csv")


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1
5,6,5,2015-07-31,5651,589,1,1,0,1
6,7,5,2015-07-31,15344,1414,1,1,0,1
7,8,5,2015-07-31,8492,833,1,1,0,1
8,9,5,2015-07-31,8565,687,1,1,0,1
9,10,5,2015-07-31,7185,681,1,1,0,1


### General Statistics

#### store_df

In [9]:
# number of elements in the store df
store_df.size

11150

In [10]:
# rows and columns in the df
store_df.shape

(1115, 10)

In [11]:
store_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   PromoInterval              571 non-null    object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB


#### train_df

In [12]:
# number of elements in the train df
train_df.size

9154881

In [13]:
# rows and columns in the df
train_df.shape

(1017209, 9)

In [14]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   Store          1017209 non-null  int64 
 1   DayOfWeek      1017209 non-null  int64 
 2   Date           1017209 non-null  object
 3   Sales          1017209 non-null  int64 
 4   Customers      1017209 non-null  int64 
 5   Open           1017209 non-null  int64 
 6   Promo          1017209 non-null  int64 
 7   StateHoliday   1017209 non-null  object
 8   SchoolHoliday  1017209 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 69.8+ MB


### Missing Values

#### store_df

In [15]:
percent_missing_values(store_df)

The dataset contains 21.01 % missing values.


In [16]:
missing_df = missing_values_table(store_df)

Your selected dataframe has 10 columns.
There are 6 columns that have missing values.


In [17]:
missing_df

Unnamed: 0,Missing Values,% of Total Values,Dtype
Promo2SinceWeek,544,48.79,float64
Promo2SinceYear,544,48.79,float64
PromoInterval,544,48.79,object
CompetitionOpenSinceMonth,354,31.75,float64
CompetitionOpenSinceYear,354,31.75,float64
CompetitionDistance,3,0.27,float64


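This shows that the first three columns (Promo2SinceWeek, Promo2SinceYear, PromoInterval) share the same number of missing values, as do the next two (CompetitionOpenSinceMonth, CompetitionOpenSinceYear).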

In [18]:
# number of rows with missing values for the whole dataset
count_missing_rows(store_df)

750 rows(67.26%) contain atleast one missing value.


In [19]:
# number of rows with missing values for group 1 (Promo2SinceWeek, Promo2SinceYear, and PromoInterval)
count_missing_rows(store_df[['Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']])

544 rows(48.79%) contain atleast one missing value.


In [20]:
# number of rows with missing values for group 2 (CompetitionOpenSinceMonth, and CompetitionOpenSinceYear)
count_missing_rows(store_df[['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear']])

354 rows(31.75%) contain atleast one missing value.


Each column in these groups has its missing values in the same rows as the other columns of its group.

In [21]:
# dataframe containing the missing rows for columns in group 1
group1_df = store_df[store_df['Promo2SinceWeek'].isna()]
group1_df.head(10)

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,
5,6,a,a,310.0,12.0,2013.0,0,,,
6,7,a,c,24000.0,4.0,2013.0,0,,,
7,8,a,a,7520.0,10.0,2014.0,0,,,
8,9,a,c,2030.0,8.0,2000.0,0,,,
9,10,a,a,3160.0,9.0,2009.0,0,,,
15,16,a,c,3270.0,,,0,,,
22,23,d,a,4060.0,8.0,2005.0,0,,,


In [24]:
# all unique values for each column
unique_values_df(group1_df)

Unnamed: 0,Column,Unique values
0,Store,"[1, 758, 741, 742, 747, 751, 755, 757, 760, 4,..."
1,StoreType,"[a, d, c, b]"
2,Assortment,"[a, c, b]"
3,CompetitionDistance,"[250.0, 140.0, 50.0, 2640.0, 30.0, 420.0, 720...."
4,CompetitionOpenSinceMonth,"[9.0, 4.0, 11.0, 3.0, 12.0, 10.0, 7.0, 6.0, 2...."
5,CompetitionOpenSinceYear,"[2013.0, 2012.0, 2014.0, 2009.0, 2010.0, 2005...."
6,Promo2,[0]
7,Promo2SinceWeek,[]
8,Promo2SinceYear,[]
9,PromoInterval,[]


In [25]:
group1_df['Promo2'].value_counts()

0    544
Name: Promo2, dtype: int64

All 544 values are 0

In [26]:
store_df['Promo2'].value_counts()

1    571
0    544
Name: Promo2, dtype: int64

All rows with 0 value in their Promo2 columns have null values in the columns Promo2SinceWeek, Promo2SinceYear, and PromoInterval.

In [27]:
# check the exact data type of the object
pd.api.types.infer_dtype(store_df['PromoInterval'])

'string'

In [28]:
# all the unique values 
store_df['PromoInterval'].value_counts()

Jan,Apr,Jul,Oct     335
Feb,May,Aug,Nov     130
Mar,Jun,Sept,Dec    106
Name: PromoInterval, dtype: int64

In [29]:
# Check whether the value 0 already occurs in each of these columns; if it does not,
# 0 can be used as an unambiguous fill value for the missing entries.
store_df[store_df['Promo2SinceWeek'] == 0].shape

(0, 10)

In [30]:
store_df[store_df['Promo2SinceYear'] == 0].shape

(0, 10)

In [31]:
store_df[store_df['PromoInterval'] == '0,0,0,0'].shape

(0, 10)

The value 0 does not occur in any of these columns, so the missing values can be replaced with zero.

In [32]:
fix_missing_value(store_df, ['Promo2SinceWeek', 'Promo2SinceYear'], 0)

544 missing values in the column Promo2SinceWeek have been replaced by 0.
544 missing values in the column Promo2SinceYear have been replaced by 0.


In [33]:
fix_missing_value(store_df, ['PromoInterval'], '0,0,0,0')

544 missing values in the column PromoInterval have been replaced by 0,0,0,0.


In [34]:
# dataframe containing the rows where the group 2 columns
# (CompetitionOpenSinceMonth, CompetitionOpenSinceYear) are missing
group2_df = store_df[store_df['CompetitionOpenSinceMonth'].isna()]
group2_df.head(10)

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
11,12,a,c,1070.0,,,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
12,13,d,a,310.0,,,1,45.0,2009.0,"Feb,May,Aug,Nov"
15,16,a,c,3270.0,,,0,0.0,0.0,0000
18,19,a,c,3240.0,,,1,22.0,2011.0,"Mar,Jun,Sept,Dec"
21,22,a,a,1040.0,,,1,22.0,2012.0,"Jan,Apr,Jul,Oct"
25,26,d,a,2300.0,,,0,0.0,0.0,0000
28,29,d,c,2170.0,,,0,0.0,0.0,0000
31,32,a,a,2910.0,,,1,45.0,2009.0,"Feb,May,Aug,Nov"
39,40,a,a,180.0,,,1,45.0,2009.0,"Feb,May,Aug,Nov"
40,41,d,c,1180.0,,,1,31.0,2013.0,"Jan,Apr,Jul,Oct"


In [35]:
# all unique values for each column in the dataframe
unique_values_df(group2_df)

Unnamed: 0,Column,Unique values
0,Store,"[12, 853, 767, 766, 764, 762, 757, 756, 742, 7..."
1,StoreType,"[a, d, c, b]"
2,Assortment,"[c, a, b]"
3,CompetitionDistance,"[250.0, 2410.0, 50.0, 210.0, 840.0, 220.0, 150..."
4,CompetitionOpenSinceMonth,[]
5,CompetitionOpenSinceYear,[]
6,Promo2,"[1, 0]"
7,Promo2SinceWeek,"[0.0, 40.0, 14.0, 45.0, 31.0, 22.0, 5.0, 1.0, ..."
8,Promo2SinceYear,"[0.0, 2013.0, 2011.0, 2009.0, 2012.0, 2014.0, ..."
9,PromoInterval,"[0,0,0,0, Jan,Apr,Jul,Oct, Feb,May,Aug,Nov, Ma..."


In [36]:
# Derive a 0/1 indicator column: 1 where CompetitionOpenSinceYear is missing, 0 otherwise.
# The vectorised isna() replaces a per-row lambda; the explicit cast keeps the
# column as int64, matching what the lambda produced.
store_df['CompetitionBeforeStoreOpened'] = store_df['CompetitionOpenSinceYear'].isna().astype('int64')
store_df

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionBeforeStoreOpened
0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0000,0
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",0
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",0
3,4,c,c,620.0,9.0,2009.0,0,0.0,0.0,0000,0
4,5,a,a,29910.0,4.0,2015.0,0,0.0,0.0,0000,0
...,...,...,...,...,...,...,...,...,...,...,...
1110,1111,a,a,1900.0,6.0,2014.0,1,31.0,2013.0,"Jan,Apr,Jul,Oct",0
1111,1112,c,c,1880.0,4.0,2006.0,0,0.0,0.0,0000,0
1112,1113,a,c,9260.0,,,0,0.0,0.0,0000,1
1113,1114,a,c,870.0,,,0,0.0,0.0,0000,1


Find the minimum value of CompetitionOpenSinceYear and use it to impute the missing values.

In [37]:
min_year = store_df['CompetitionOpenSinceYear'].min()
min_year

1900.0

In [38]:
# impute the column CompetitionOpenSinceYear with 1900 and the column CompetitionOpenSinceMonth with 1
fix_missing_value(store_df, ['CompetitionOpenSinceYear'], min_year)
fix_missing_value(store_df, ['CompetitionOpenSinceMonth'], 1)

354 missing values in the column CompetitionOpenSinceYear have been replaced by 1900.0.
354 missing values in the column CompetitionOpenSinceMonth have been replaced by 1.


In [39]:
store_df

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionBeforeStoreOpened
0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0000,0
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",0
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",0
3,4,c,c,620.0,9.0,2009.0,0,0.0,0.0,0000,0
4,5,a,a,29910.0,4.0,2015.0,0,0.0,0.0,0000,0
...,...,...,...,...,...,...,...,...,...,...,...
1110,1111,a,a,1900.0,6.0,2014.0,1,31.0,2013.0,"Jan,Apr,Jul,Oct",0
1111,1112,c,c,1880.0,4.0,2006.0,0,0.0,0.0,0000,0
1112,1113,a,c,9260.0,1.0,1900.0,0,0.0,0.0,0000,1
1113,1114,a,c,870.0,1.0,1900.0,0,0.0,0.0,0000,1


In [74]:
# rows that still have a missing CompetitionDistance
temp_df = store_df[store_df['CompetitionDistance'].isna()]
temp_df
# NOTE: this cell was re-run after all missing values had already been cleaned,
# which is why the displayed output has no rows left to clean.

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionBeforeStoreOpened,PromoInterval0,PromoInterval1,PromoInterval2,PromoInterval3


In [41]:
unique_values_df(temp_df)

Unnamed: 0,Column,Unique values
0,Store,"[291, 622, 879]"
1,StoreType,"[d, a]"
2,Assortment,"[a, c]"
3,CompetitionDistance,[]
4,CompetitionOpenSinceMonth,[1.0]
5,CompetitionOpenSinceYear,[1900.0]
6,Promo2,"[0, 1]"
7,Promo2SinceWeek,"[0.0, 5.0]"
8,Promo2SinceYear,"[0.0, 2013.0]"
9,PromoInterval,"[0,0,0,0, Feb,May,Aug,Nov]"


In [42]:
max_dist = store_df['CompetitionDistance'].max()
max_dist

75860.0

In [43]:
fix_missing_value(store_df, ['CompetitionDistance'], max_dist)

3 missing values in the column CompetitionDistance have been replaced by 75860.0.


In [44]:
# final check for missing values
percent_missing_values(store_df)

The dataset contains 0.0 % missing values.


#### train_df

In [46]:
percent_missing_values(train_df)

The dataset contains 0.0 % missing values.


### Data Types

### store_df

In [47]:
#check if there is a mixed data type 
show_cols_mixed_dtypes(store_df)

None of the columns contain mixed types.


In [48]:
store_df.dtypes

Store                             int64
StoreType                        object
Assortment                       object
CompetitionDistance             float64
CompetitionOpenSinceMonth       float64
CompetitionOpenSinceYear        float64
Promo2                            int64
Promo2SinceWeek                 float64
Promo2SinceYear                 float64
PromoInterval                    object
CompetitionBeforeStoreOpened      int64
dtype: object

In [49]:
store_df.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionBeforeStoreOpened
0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0000,0
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",0
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",0
3,4,c,c,620.0,9.0,2009.0,0,0.0,0.0,0000,0
4,5,a,a,29910.0,4.0,2015.0,0,0.0,0.0,0000,0


We will have to convert the object columns (including PromoInterval) to the string dtype.

In [50]:
# get the columns with object data type
string_columns = store_df.select_dtypes(include='object').columns.tolist()
string_columns

['StoreType', 'Assortment', 'PromoInterval']

In [51]:
convert_to_string(store_df, string_columns)

In [52]:
#convert those columns in to int64 
convert_to_int(store_df, ['CompetitionOpenSinceMonth',  'CompetitionOpenSinceYear',
        'Promo2SinceWeek', 'Promo2SinceYear'])

In [53]:
store_df.dtypes

Store                             int64
StoreType                        string
Assortment                       string
CompetitionDistance             float64
CompetitionOpenSinceMonth         int64
CompetitionOpenSinceYear          int64
Promo2                            int64
Promo2SinceWeek                   int64
Promo2SinceYear                   int64
PromoInterval                    string
CompetitionBeforeStoreOpened      int64
dtype: object

In [54]:
store_df.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionBeforeStoreOpened
0,1,c,a,1270.0,9,2008,0,0,0,0000,0
1,2,a,a,570.0,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",0
2,3,a,a,14130.0,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",0
3,4,c,c,620.0,9,2009,0,0,0,0000,0
4,5,a,a,29910.0,4,2015,0,0,0,0000,0


### train_df

Check if there are columns with mixed data types.

In [55]:
show_cols_mixed_dtypes(train_df)

         Column      Data type
0  StateHoliday  mixed-integer


Check the unique values it contains.

In [56]:
train_df['StateHoliday'].value_counts()

0    855087
0    131072
a     20260
b      6690
c      4100
Name: StateHoliday, dtype: int64

In [57]:
train_df['StateHoliday'].value_counts().index

Index(['0', 0, 'a', 'b', 'c'], dtype='object')

In [58]:
convert_to_string(train_df, ['StateHoliday'])

In [59]:
train_df.dtypes

Store             int64
DayOfWeek         int64
Date             object
Sales             int64
Customers         int64
Open              int64
Promo             int64
StateHoliday     string
SchoolHoliday     int64
dtype: object

In [60]:
train_df.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [61]:
convert_to_datetime(train_df, ['Date'])

In [62]:
train_df.dtypes

Store                     int64
DayOfWeek                 int64
Date             datetime64[ns]
Sales                     int64
Customers                 int64
Open                      int64
Promo                     int64
StateHoliday             string
SchoolHoliday             int64
dtype: object

### Duplicates 

### store_df

In [63]:
# search for duplicate rows and drop them
drop_duplicates(store_df)

No duplicate rows were found.


In [64]:
# True if any Store id appears more than once.
# .any() is required here: .all() is True only when *every* row is flagged as a
# duplicate, so it would still return False when just some ids are repeated.
store_df.duplicated(subset=['Store']).any()

False

### train_df

In [65]:
# search for duplicate rows and drop them
drop_duplicates(train_df)

No duplicate rows were found.


In [66]:
# True if any (Store, Date) pair appears more than once.
# .any() is required here: .all() is True only when *every* row is flagged as a
# duplicate, so it would still return False when just some pairs are repeated.
train_df.duplicated(subset=['Store', 'Date']).any()

False

## Feature Engineering

### train_df

In [67]:
# Derive calendar features from Date with the vectorised .dt accessor instead of a
# per-row Python lambda (train_df has about one million rows).
# Every result is cast to int64 so the column dtypes do not depend on the pandas version.
train_df['Year'] = train_df['Date'].dt.year.astype('int64')
train_df['Month'] = train_df['Date'].dt.month.astype('int64')
train_df['DayOfMonth'] = train_df['Date'].dt.day.astype('int64')
# ISO week number, the same value Timestamp.weekofyear returns
train_df['WeekOfYear'] = train_df['Date'].dt.isocalendar().week.astype('int64')
# 0 when DayOfWeek is 6 or 7, 1 for every other value
train_df['weekday'] = (~train_df['DayOfWeek'].isin([6, 7])).astype('int64')

In [68]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1017209 entries, 0 to 1017208
Data columns (total 14 columns):
 #   Column         Non-Null Count    Dtype         
---  ------         --------------    -----         
 0   Store          1017209 non-null  int64         
 1   DayOfWeek      1017209 non-null  int64         
 2   Date           1017209 non-null  datetime64[ns]
 3   Sales          1017209 non-null  int64         
 4   Customers      1017209 non-null  int64         
 5   Open           1017209 non-null  int64         
 6   Promo          1017209 non-null  int64         
 7   StateHoliday   1017209 non-null  string        
 8   SchoolHoliday  1017209 non-null  int64         
 9   Year           1017209 non-null  int64         
 10  Month          1017209 non-null  int64         
 11  DayOfMonth     1017209 non-null  int64         
 12  WeekOfYear     1017209 non-null  int64         
 13  weekday        1017209 non-null  int64         
dtypes: datetime64[ns](1), int64(12), s

In [69]:
train_df.sample(10)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,DayOfMonth,WeekOfYear,weekday
413900,1021,4,2014-06-26,7072,913,1,0,0,0,2014,6,26,26,1
327073,1114,5,2014-09-26,17800,2878,1,0,0,0,2014,9,26,39,1
447202,873,2,2014-05-27,3355,440,1,0,0,0,2014,5,27,22,1
670766,322,4,2013-11-07,5916,655,1,1,0,0,2013,11,7,45,1
223519,520,1,2015-01-12,4305,402,1,1,0,0,2015,1,12,3,1
727399,90,2,2013-09-17,6329,878,1,0,0,0,2013,9,17,38,1
999476,107,3,2013-01-16,5826,830,1,0,0,0,2013,1,16,3,1
38327,418,6,2015-06-27,3008,317,1,0,0,0,2015,6,27,26,0
246619,1061,7,2014-12-21,0,0,0,0,0,0,2014,12,21,51,0
110232,963,5,2015-04-24,7455,786,1,0,0,0,2015,4,24,17,1


### store_df

In [70]:
def getMonth(month_list, index):
    """Return the month number of one entry in a comma-separated month string.

    Args:
        month_list: Comma-separated month abbreviations such as
            ``"Jan,Apr,Jul,Oct"``, or the ``"0,0,0,0"`` placeholder that was
            used to fill missing PromoInterval values.
        index: Position of the entry to convert (ordinary list indexing).

    Returns:
        int: 1-12 for a month abbreviation, 0 for the ``"0"`` placeholder.

    Raises:
        IndexError: If ``index`` is outside the split list.
        ValueError: If the selected entry is not a known abbreviation.
    """
    # '0' maps to 0 so the placeholder rows stay distinguishable from real months.
    # Both 'Sep' and 'Sept' are accepted; the data set itself uses 'Sept'.
    month_numbers = {
        '0': 0,
        'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
        'Jul': 7, 'Aug': 8, 'Sep': 9, 'Sept': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
    }
    # strip() tolerates input such as "Jan, Apr, Jul, Oct"
    entry = month_list.split(',')[index].strip()
    try:
        return month_numbers[entry]
    except KeyError:
        # keep ValueError (what list.index raised before) but name the bad token
        raise ValueError(
            f"Unknown month abbreviation {entry!r} in {month_list!r}"
        ) from None

In [71]:
# expand PromoInterval into four numeric month columns: PromoInterval0 .. PromoInterval3
for position in range(4):
    # bind `position` as a default argument so each lambda keeps its own value
    store_df[f'PromoInterval{position}'] = store_df['PromoInterval'].apply(
        lambda interval, position=position: getMonth(interval, position)
    )

In [72]:
store_df.sample(10)

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionBeforeStoreOpened,PromoInterval0,PromoInterval1,PromoInterval2,PromoInterval3
848,849,c,c,5000.0,1,1900,0,0,0,0000,1,0,0,0,0
716,717,d,c,310.0,1,1900,1,40,2011,"Jan,Apr,Jul,Oct",1,1,4,7,10
586,587,d,c,330.0,9,2006,1,14,2011,"Jan,Apr,Jul,Oct",0,1,4,7,10
442,443,d,a,11400.0,12,2005,0,0,0,0000,0,0,0,0,0
189,190,a,a,1470.0,12,2006,1,40,2014,"Jan,Apr,Jul,Oct",0,1,4,7,10
713,714,d,c,12070.0,10,2005,1,10,2013,"Jan,Apr,Jul,Oct",0,1,4,7,10
99,100,d,a,17930.0,1,1900,0,0,0,0000,1,0,0,0,0
795,796,a,c,7180.0,11,2012,0,0,0,0000,0,0,0,0,0
756,757,a,c,3450.0,1,1900,0,0,0,0000,1,0,0,0,0
535,536,a,c,4700.0,9,2002,1,31,2013,"Feb,May,Aug,Nov",0,2,5,8,11


## Saving the clean Data

In [73]:
# save the clean dataframes to csv files
# NOTE(review): these are the same paths the raw data was read from at the top of
# this notebook, so the raw files are presumably overwritten and a later
# Restart & Run All would start from already-cleaned data. Consider writing to
# separate files (e.g. ../data/train_clean.csv, ../data/store_clean.csv) after
# confirming that nothing else depends on the current paths.
file_handler.to_csv(train_df, '../data/train.csv')
file_handler.to_csv(store_df, '../data/store.csv')


### Merging the Train and Store data