# Cleaning and Merging the Rossmann Datasets

As with all good Python programs, we start by importing our packages.

In [58]:
import pandas as pd

In [59]:
import numpy as np

## Cleaning Store Dataset first

In [60]:
# Load the store-level dataset from 'store.csv' in the working directory.
store = pd.read_csv('store.csv')

There are three rows where all Competition values are missing, i.e. there is neither a competition distance nor any date 
for the competition opening. We will make the assumption that these stores have NEVER had competition.

In [61]:
# Show the stores that have no recorded competition distance.
store.loc[store['CompetitionDistance'].isna()]

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
290,291,d,a,,,,0,,,
621,622,a,c,,,,0,,,
878,879,d,a,,,,1,5.0,2013.0,"Feb,May,Aug,Nov"


There are 351 rows where CompetitionDistance exists but the other competition values (opening month and year) are missing.
We will make the assumption that these stores have had competition throughout their lifetime.

In [62]:
# Stores with a competition distance but no recorded competition opening month.
# Both conditions are combined into ONE boolean mask: filtering twice in a row
# (df[mask_a][mask_b]) applies a full-length mask to an already-filtered frame and
# makes pandas emit a "Boolean Series key will be reindexed" UserWarning.
store[store['CompetitionOpenSinceMonth'].isnull() & (store['CompetitionDistance'] > 0)].head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
11,12,a,c,1070.0,,,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
12,13,d,a,310.0,,,1,45.0,2009.0,"Feb,May,Aug,Nov"
15,16,a,c,3270.0,,,0,,,
18,19,a,c,3240.0,,,1,22.0,2011.0,"Mar,Jun,Sept,Dec"
21,22,a,a,1040.0,,,1,22.0,2012.0,"Jan,Apr,Jul,Oct"


In [63]:
## So, we want a 'Competition Open Date' to signify when the store started to have competition
## We will assume that a store has never gone from 'having' comp to 'not having' comp

## -> if CompDistance is null (never had competition) -> we want the competition date to be in the future
## -> if CompDistance exists, but other comp values are null (always had competition) -> we want comp date to be in the past
## -> otherwise, we want a date value built from the comp columns

# Each condition is a single full-length boolean mask over `store`; combining them with `&`
# (instead of filtering twice in a row) avoids the pandas boolean-reindexing UserWarning.
has_no_distance = store['CompetitionDistance'].isnull()
has_no_open_month = store['CompetitionOpenSinceMonth'].isnull()

idx_of_stores_with_no_competition = store[has_no_distance].index
idx_of_stores_with_competition_always = store[has_no_open_month & (store['CompetitionDistance'] > 0)].index

# Everything that is in neither of the two groups above has explicit opening values.
mask = store.index.isin(idx_of_stores_with_no_competition.append(idx_of_stores_with_competition_always))
idx_of_stores_where_competition_opened = store[~mask].index

print(idx_of_stores_with_no_competition.shape)
print(idx_of_stores_with_competition_always.shape)
print(idx_of_stores_where_competition_opened.shape)


(3,)
(351,)
(761,)


  if __name__ == '__main__':


In [64]:
## check that we divided the index correctly: the three groups together must account for every store

# Computed from the actual index objects (not typed-in numbers) so the check stays valid if the data changes.
n_stores_partitioned = (
    len(idx_of_stores_with_no_competition)
    + len(idx_of_stores_with_competition_always)
    + len(idx_of_stores_where_competition_opened)
)
assert n_stores_partitioned == len(store), "competition groups do not add up to the number of stores"
n_stores_partitioned

1115

In [65]:
## Insert dates appropriately

store['CompetitionOpenSinceMonth'] = store['CompetitionOpenSinceMonth'].fillna(0).astype(int)
store['CompetitionOpenSinceYear'] = store['CompetitionOpenSinceYear'].fillna(0).astype(int)

# The date strings are collected in a fresh Series (not written into `store` piecemeal),
# so re-running this cell never mixes strings into an already-converted datetime column.
competition_open_date = pd.Series(index=store.index, dtype=object)

# Sentinels: far future = competition never opened, far past = competition was always there.
competition_open_date.loc[idx_of_stores_with_no_competition] = '01/01/2050'
competition_open_date.loc[idx_of_stores_with_competition_always] = '01/01/1970'

# Remaining stores: only month and year are known, so the 15th is used as the day.
# Built for all rows at once instead of looping over the index with .at.
opened = store.loc[idx_of_stores_where_competition_opened]
competition_open_date.loc[idx_of_stores_where_competition_opened] = (
    opened['CompetitionOpenSinceMonth'].astype(str) + '/15/' + opened['CompetitionOpenSinceYear'].astype(str)
)

# Explicit month/day/year format: no format guessing and no day-first ambiguity.
store['competitionOpenDate'] = pd.to_datetime(competition_open_date, format='%m/%d/%Y')

store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,competitionOpenDate
0,1,c,a,1270.0,9,2008,0,,,,2008-09-15
1,2,a,a,570.0,11,2007,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2007-11-15
2,3,a,a,14130.0,12,2006,1,14.0,2011.0,"Jan,Apr,Jul,Oct",2006-12-15
3,4,c,c,620.0,9,2009,0,,,,2009-09-15
4,5,a,a,29910.0,4,2015,0,,,,2015-04-15


In [66]:
# Number of stores without a Promo2 start week.
int(store['Promo2SinceWeek'].isna().sum())

544

In [67]:
## So it looks like the stores can be divided into the following two categories:

## Stores that have never run promos (544 of these)
## Stores that started running promos at some stage and now run promos at specific intervals (571 of these)

In [68]:
## Split the store index into stores without a Promo2 start week and all remaining stores

promo_week_missing = store['Promo2SinceWeek'].isna().to_numpy()
idx_of_stores_with_no_promo = store.index[promo_week_missing]

mask = store.index.isin(idx_of_stores_with_no_promo)
idx_of_stores_with_promos = store.index[~mask]
print(len(idx_of_stores_with_no_promo))
print(len(idx_of_stores_with_promos))

544
571


In [69]:
## Here we set the dates that stores launched promos

store['Promo2SinceWeek'] = store['Promo2SinceWeek'].fillna(0).astype(int)
store['Promo2SinceYear'] = store['Promo2SinceYear'].fillna(0).astype(int)

# The date strings are collected in a fresh Series (not written into `store` piecemeal),
# so re-running this cell never mixes strings into an already-converted datetime column.
promo_open_date = pd.Series(index=store.index, dtype=object)

# Sentinel: a far-future date for stores without a promo start.
promo_open_date.loc[idx_of_stores_with_no_promo] = '01/01/2050'

# Approximate the start month from the start week as week // 5 + 1, capped at 12, and use the
# 15th as the day. Computed for all rows at once instead of looping over the index.
# NOTE(review): this is a coarse approximation (e.g. week 14 maps to March) - confirm it is
# precise enough, or derive the date from the calendar week directly.
with_promos = store.loc[idx_of_stores_with_promos]
approx_month = (with_promos['Promo2SinceWeek'] // 5 + 1).clip(upper=12)
promo_open_date.loc[idx_of_stores_with_promos] = (
    approx_month.astype(str) + '/15/' + with_promos['Promo2SinceYear'].astype(str)
)

# Explicit month/day/year format: no format guessing and no day-first ambiguity.
store['promoOpenDate'] = pd.to_datetime(promo_open_date, format='%m/%d/%Y')
store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,competitionOpenDate,promoOpenDate
0,1,c,a,1270.0,9,2008,0,0,0,,2008-09-15,2050-01-01
1,2,a,a,570.0,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",2007-11-15,2010-03-15
2,3,a,a,14130.0,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",2006-12-15,2011-03-15
3,4,c,c,620.0,9,2009,0,0,0,,2009-09-15,2050-01-01
4,5,a,a,29910.0,4,2015,0,0,0,,2015-04-15,2050-01-01


In [70]:
# StoreType and Assortment hold single-letter codes; encode them as integers.
# The result is assigned back to the column: calling replace(..., inplace=True) on a column
# selection operates on an intermediate object and is not guaranteed to update `store`.
# Values that are not in the mapping pass through unchanged, so re-running the cell is
# harmless, and astype(int) fails loudly if an unexpected non-numeric code is present.
category_codes = {'a':0, 'b':1, 'c':2,'d':3}
store['StoreType'] = store['StoreType'].map(lambda code: category_codes.get(code, code)).astype(int)
store['Assortment'] = store['Assortment'].map(lambda code: category_codes.get(code, code)).astype(int)

## Cleaning New Train Dataset

In [71]:
# Load the sales dataset from 'new_train.csv' in the working directory.
new_train = pd.read_csv('new_train.csv')

In [72]:
# Remove the "Unnamed: 0" column that came in with the CSV.
new_train = new_train.drop(columns=["Unnamed: 0"])

In [73]:
# Parse the Date column into pandas datetimes so date parts and comparisons work below.
new_train['Date']=pd.to_datetime(new_train['Date'])

In [74]:
# Split the sale date into integer day / month / year columns
sale_date = new_train['Date'].dt
new_train['day'] = sale_date.day
new_train['month'] = sale_date.month
new_train['year'] = sale_date.year

In [75]:
# StateHoliday contains the strings '0', 'a', 'b', 'c' as well as the integer 0;
# collapse it to 0 for '0'/0 and 1 for any of the letter codes.
# The result is assigned back to the column: calling replace(..., inplace=True) on a column
# selection operates on an intermediate object and is not guaranteed to update `new_train`.
# Values that are not in the mapping (e.g. the integer 0) pass through unchanged, and
# astype(int) fails loudly if an unexpected non-numeric code is present.
holiday_flags = {'0':0, 'a':1, 'b':1,'c':1}
new_train['StateHoliday'] = new_train['StateHoliday'].map(lambda flag: holiday_flags.get(flag, flag)).astype(int)

## Merging Two Datasets

In [76]:
# So we have the store dataset (first rows shown, including the two derived date columns):
store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,competitionOpenDate,promoOpenDate
0,1,2,0,1270.0,9,2008,0,0,0,,2008-09-15,2050-01-01
1,2,0,0,570.0,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",2007-11-15,2010-03-15
2,3,0,0,14130.0,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",2006-12-15,2011-03-15
3,4,2,2,620.0,9,2009,0,0,0,,2009-09-15,2050-01-01
4,5,0,0,29910.0,4,2015,0,0,0,,2015-04-15,2050-01-01


In [77]:
## And the sales dataset (first rows shown, including the derived day / month / year columns):
new_train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,day,month,year
0,1,5,2015-07-31,5263,555,1,1,0,1,31,7,2015
1,2,5,2015-07-31,6064,625,1,1,0,1,31,7,2015
2,3,5,2015-07-31,8314,821,1,1,0,1,31,7,2015
3,4,5,2015-07-31,13995,1498,1,1,0,1,31,7,2015
4,5,5,2015-07-31,4822,559,1,1,0,1,31,7,2015


In [78]:
# Attach the store attributes to every sales row, keyed on 'Store'.
# how='left' keeps every sales row; validate='many_to_one' makes pandas raise a MergeError
# if `store` contains a duplicated 'Store' value, which would otherwise silently duplicate sales rows.
merged_dataset = new_train.merge(store, how='left', on='Store', validate='many_to_one')
merged_dataset.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,day,...,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,competitionOpenDate,promoOpenDate
0,1,5,2015-07-31,5263,555,1,1,0,1,31,...,0,1270.0,9,2008,0,0,0,,2008-09-15,2050-01-01
1,2,5,2015-07-31,6064,625,1,1,0,1,31,...,0,570.0,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",2007-11-15,2010-03-15
2,3,5,2015-07-31,8314,821,1,1,0,1,31,...,0,14130.0,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",2006-12-15,2011-03-15
3,4,5,2015-07-31,13995,1498,1,1,0,1,31,...,2,620.0,9,2009,0,0,0,,2009-09-15,2050-01-01
4,5,5,2015-07-31,4822,559,1,1,0,1,31,...,0,29910.0,4,2015,0,0,0,,2015-04-15,2050-01-01


In [79]:
## Flag each sales row with 1 when the store's competition open date is on or before the sale date, else 0

competition_is_open = merged_dataset['Date'] >= merged_dataset['competitionOpenDate']
merged_dataset['Competition'] = competition_is_open.astype(int)
merged_dataset.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,day,...,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,competitionOpenDate,promoOpenDate,Competition
0,1,5,2015-07-31,5263,555,1,1,0,1,31,...,1270.0,9,2008,0,0,0,,2008-09-15,2050-01-01,1
1,2,5,2015-07-31,6064,625,1,1,0,1,31,...,570.0,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",2007-11-15,2010-03-15,1
2,3,5,2015-07-31,8314,821,1,1,0,1,31,...,14130.0,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",2006-12-15,2011-03-15,1
3,4,5,2015-07-31,13995,1498,1,1,0,1,31,...,620.0,9,2009,0,0,0,,2009-09-15,2050-01-01,1
4,5,5,2015-07-31,4822,559,1,1,0,1,31,...,29910.0,4,2015,0,0,0,,2015-04-15,2050-01-01,1


In [80]:
from time import strptime

# Rows without a PromoInterval get the sentinel string 'no_promo', so the column holds only strings.
merged_dataset['PromoInterval'] = merged_dataset['PromoInterval'].fillna('no_promo').astype(str)

def get_month_integers_from_month_strings(month_strings):
    """Convert a comma-separated string of month names into month numbers.

    Parameters
    ----------
    month_strings : str
        Either the sentinel ``'no_promo'`` or month names separated by commas,
        e.g. ``'Mar,Jun,Sept,Dec'``.

    Returns
    -------
    int or list of int
        ``0`` for ``'no_promo'``; otherwise one month number per name, in input order.

    Raises
    ------
    ValueError
        If a name does not start with a recognisable month abbreviation.
    """
    if month_strings == 'no_promo':
        return 0

    # Only the first three letters are parsed, so the non-standard 'Sept' becomes 'Sep' (9).
    # Treating every four-letter name as September would also turn 'June' or 'July' into 9.
    return [strptime(month.strip()[:3], '%b').tm_mon for month in month_strings.split(",")]

# Translate every PromoInterval string into its month numbers (element by element).
merged_dataset['promoMonths'] = merged_dataset['PromoInterval'].map(get_month_integers_from_month_strings)
merged_dataset.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,day,...,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,competitionOpenDate,promoOpenDate,Competition,promoMonths
0,1,5,2015-07-31,5263,555,1,1,0,1,31,...,9,2008,0,0,0,no_promo,2008-09-15,2050-01-01,1,0
1,2,5,2015-07-31,6064,625,1,1,0,1,31,...,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",2007-11-15,2010-03-15,1,"[1, 4, 7, 10]"
2,3,5,2015-07-31,8314,821,1,1,0,1,31,...,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",2006-12-15,2011-03-15,1,"[1, 4, 7, 10]"
3,4,5,2015-07-31,13995,1498,1,1,0,1,31,...,9,2009,0,0,0,no_promo,2009-09-15,2050-01-01,1,0
4,5,5,2015-07-31,4822,559,1,1,0,1,31,...,4,2015,0,0,0,no_promo,2015-04-15,2050-01-01,1,0


In [None]:
def get_month(date):
    """Return the ``month`` attribute of ``date``."""
    month = date.month
    return month
    

# Month number of each sale date, taken with the vectorised .dt accessor
# rather than calling a Python function once per row.
merged_dataset['month_test'] = merged_dataset['Date'].dt.month
merged_dataset.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,day,...,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,competitionOpenDate,promoOpenDate,Competition,promoMonths,month_test
0,1,5,2015-07-31,5263,555,1,1,0,1,31,...,2008,0,0,0,no_promo,2008-09-15,2050-01-01,1,0,7
1,2,5,2015-07-31,6064,625,1,1,0,1,31,...,2007,1,13,2010,"Jan,Apr,Jul,Oct",2007-11-15,2010-03-15,1,"[1, 4, 7, 10]",7
2,3,5,2015-07-31,8314,821,1,1,0,1,31,...,2006,1,14,2011,"Jan,Apr,Jul,Oct",2006-12-15,2011-03-15,1,"[1, 4, 7, 10]",7
3,4,5,2015-07-31,13995,1498,1,1,0,1,31,...,2009,0,0,0,no_promo,2009-09-15,2050-01-01,1,0,7
4,5,5,2015-07-31,4822,559,1,1,0,1,31,...,2015,0,0,0,no_promo,2015-04-15,2050-01-01,1,0,7


In [None]:
## Set Promo2 to 1 only on days when the recurring promotion is actually running, else 0
##
## A row counts as "promotion running" when BOTH hold:
##   - the store's promoOpenDate is on or before the sale date, and
##   - the sale month is one of the store's promoMonths.
## Rows whose promoMonths is not a list (the 0 used for 'no_promo') are always 0.
##
## The result is written to Promo2; promoMonths is only read, so the cell can be re-run safely.

promo_has_started = merged_dataset['promoOpenDate'] <= merged_dataset['Date']

# promoMonths holds Python lists (or 0), so membership is tested pairwise, row by row.
is_promo_month = pd.Series(
    [
        isinstance(months, list) and month in months
        for months, month in zip(merged_dataset['promoMonths'], merged_dataset['month_test'])
    ],
    index=merged_dataset.index,
    dtype=bool,
)

merged_dataset['Promo2'] = (promo_has_started & is_promo_month).astype(int)


In [None]:
# Inspect the last rows of the merged dataset.
merged_dataset.tail()


In [None]:
## Clean up

# Keep only the columns needed downstream. Every column is listed exactly once:
# selecting 'StoreType' twice would create two identically named columns in the output.
final_columns = ['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
                 'StoreType', 'Assortment', 'StateHoliday', 'SchoolHoliday',
                 'CompetitionDistance', 'Competition', 'Promo2']

# .copy() makes final_dataset an independent frame rather than a selection of merged_dataset.
final_dataset = merged_dataset[final_columns].copy()

final_dataset.head()

In [None]:
# Write the cleaned, merged dataset to 'cleaned_and_merged_data.csv' in the working directory.
# NOTE(review): the DataFrame index is written as well (pandas default), so reading the file
# back yields an extra unnamed first column - confirm whether index=False is wanted here.
final_dataset.to_csv(r'cleaned_and_merged_data.csv')