In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_data = pd.read_csv('train.csv', parse_dates=['date'])
train_data = train_data.set_index('id')
train_data.head()

Unnamed: 0_level_0,date,store_nbr,family,sales,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,2013-01-01,1,BABY CARE,0.0,0
2,2013-01-01,1,BEAUTY,0.0,0
3,2013-01-01,1,BEVERAGES,0.0,0
4,2013-01-01,1,BOOKS,0.0,0


In [3]:
test_data = pd.read_csv('test.csv', parse_dates=['date'])
test_data = test_data.set_index('id')
test_data

Unnamed: 0_level_0,date,store_nbr,family,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3000888,2017-08-16,1,AUTOMOTIVE,0
3000889,2017-08-16,1,BABY CARE,0
3000890,2017-08-16,1,BEAUTY,2
3000891,2017-08-16,1,BEVERAGES,20
3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...
3029395,2017-08-31,9,POULTRY,1
3029396,2017-08-31,9,PREPARED FOODS,0
3029397,2017-08-31,9,PRODUCE,1
3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


In [4]:
stores_data = pd.read_csv('stores.csv')
stores_data.sample(5)

Unnamed: 0,store_nbr,city,state,type,cluster
30,31,Babahoyo,Los Rios,B,10
32,33,Quevedo,Los Rios,C,3
1,2,Quito,Pichincha,D,13
31,32,Guayaquil,Guayas,C,3
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [5]:
oil_data = pd.read_csv('oil.csv', parse_dates=['date'])
oil_data = oil_data.fillna(oil_data['dcoilwtico'].median())
oil_data.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,53.19
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [6]:
holidays_events_data = pd.read_csv('holidays_events.csv')
holidays_events_data.sample(5)

Unnamed: 0,date,type,locale,locale_name,description,transferred
174,2015-05-24,Holiday,National,Ecuador,Batalla de Pichincha,False
271,2016-08-24,Holiday,Local,Ambato,Fundacion de Ambato,False
283,2016-11-12,Work Day,National,Ecuador,Recupero Puente Dia de Difuntos,False
330,2017-10-07,Holiday,Local,Quevedo,Cantonizacion de Quevedo,False
67,2013-08-10,Holiday,National,Ecuador,Primer Grito de Independencia,False


# Feature engineering

In [7]:
train_data.head()

Unnamed: 0_level_0,date,store_nbr,family,sales,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,2013-01-01,1,BABY CARE,0.0,0
2,2013-01-01,1,BEAUTY,0.0,0
3,2013-01-01,1,BEVERAGES,0.0,0
4,2013-01-01,1,BOOKS,0.0,0


In [8]:
test_data.head()

Unnamed: 0_level_0,date,store_nbr,family,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3000888,2017-08-16,1,AUTOMOTIVE,0
3000889,2017-08-16,1,BABY CARE,0
3000890,2017-08-16,1,BEAUTY,2
3000891,2017-08-16,1,BEVERAGES,20
3000892,2017-08-16,1,BOOKS,0


In [9]:
train_data = train_data.merge(oil_data, how='left', on='date')
test_data = test_data.merge(oil_data, how='left', on='date')

In [10]:
train_data

Unnamed: 0,date,store_nbr,family,sales,onpromotion,dcoilwtico
0,2013-01-01,1,AUTOMOTIVE,0.000,0,53.19
1,2013-01-01,1,BABY CARE,0.000,0,53.19
2,2013-01-01,1,BEAUTY,0.000,0,53.19
3,2013-01-01,1,BEVERAGES,0.000,0,53.19
4,2013-01-01,1,BOOKS,0.000,0,53.19
...,...,...,...,...,...,...
3000883,2017-08-15,9,POULTRY,438.133,0,47.57
3000884,2017-08-15,9,PREPARED FOODS,154.553,1,47.57
3000885,2017-08-15,9,PRODUCE,2419.729,148,47.57
3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,47.57


In [11]:
dates = train_data.date.unique()
map_time = {k:v for v,k in enumerate(dates)}

In [12]:
X_train = train_data.copy().drop('sales', axis=1)
y_train = train_data.copy()['sales']
X_train['date'] = train_data['date'].map(map_time)
X_train.tail(5)

Unnamed: 0,date,store_nbr,family,onpromotion,dcoilwtico
3000883,1683,9,POULTRY,0,47.57
3000884,1683,9,PREPARED FOODS,1,47.57
3000885,1683,9,PRODUCE,148,47.57
3000886,1683,9,SCHOOL AND OFFICE SUPPLIES,8,47.57
3000887,1683,9,SEAFOOD,0,47.57


In [13]:
dates_test = test_data.date.unique()
map_time_test = {k:v+1684 for v,k in enumerate(dates_test)}
X_test = test_data.copy()
X_test['date'] = X_test['date'].map(map_time_test)
X_test.head(5)

Unnamed: 0,date,store_nbr,family,onpromotion,dcoilwtico
0,1684,1,AUTOMOTIVE,0,46.8
1,1684,1,BABY CARE,0,46.8
2,1684,1,BEAUTY,2,46.8
3,1684,1,BEVERAGES,20,46.8
4,1684,1,BOOKS,0,46.8


In [14]:
list(X_train.family.unique()) == list(X_test.family.unique())

True

In [15]:
families = X_train.family.unique()
map_families = {k:v for v,k in enumerate(families)}
#map_families
X_train['family'] = train_data['family'].map(map_families)
X_test['family'] = X_test['family'].map(map_families)

In [16]:
X_train['dcoilwtico'] = X_train['dcoilwtico'].fillna(X_train['dcoilwtico'].mean())

In [17]:
X_test['dcoilwtico'] = X_test['dcoilwtico'].fillna(X_test['dcoilwtico'].mean())

# ML

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
LR = LinearRegression()
LR.fit(X_train,y_train)

LinearRegression()

In [20]:
y_pred = LR.predict(X_test)

In [21]:
y_pred

array([358.95127797, 347.98756345, 413.43957725, ...,  89.70288036,
       384.40207915,  29.56758713])

# Submission

In [22]:
sub = pd.read_csv('sample_submission.csv')
sub.head()

Unnamed: 0,id,sales
0,3000888,0.0
1,3000889,0.0
2,3000890,0.0
3,3000891,0.0
4,3000892,0.0


In [23]:
sub['sales'] = y_pred
sub

Unnamed: 0,id,sales
0,3000888,358.951278
1,3000889,347.987563
2,3000890,413.439577
3,3000891,1090.217418
4,3000892,315.096420
...,...,...
28507,3029395,111.630309
28508,3029396,62.458731
28509,3029397,89.702880
28510,3029398,384.402079


In [24]:
len(y_pred)

28512

In [25]:
sub.to_csv('submission1.csv', index=False)

In [26]:
sub1 = pd.read_csv('submission1.csv')
sub1

Unnamed: 0,id,sales
0,3000888,358.951278
1,3000889,347.987563
2,3000890,413.439577
3,3000891,1090.217418
4,3000892,315.096420
...,...,...
28507,3029395,111.630309
28508,3029396,62.458731
28509,3029397,89.702880
28510,3029398,384.402079
