# Approaching ML Problems 
Understand the business requirements and the nature of the available data.

Classify the problem as supervised/unsupervised and regression/classification.

Download, clean & explore the data and create new features that may improve models.

Create training/test/validation sets and prepare the data for training ML models.

Create a quick & easy baseline model to evaluate and benchmark future models.

Pick a modeling strategy, train a model, and tune hyperparameters to achieve optimal fit.

Experiment and combine results from multiple strategies to get a better result.

Interpret models, study individual predictions, and present your findings.


In [1]:
! pip install numpy pandas-profiling matplotlib plotly seaborn

In [2]:
! pip install opendatasets

In [2]:
import os
# import jovian
import matplotlib
# import opendatasets as od
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

# Global plot styling. The seaborn theme is applied first because it also
# writes matplotlib rcParams; the explicit overrides below must come after it.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,                  # larger default text
    'figure.figsize': (10, 6),        # default figure size in inches
    'figure.facecolor': '#00000000',  # RGBA hex with zero alpha
})

In [3]:
# Load the training data from the working directory.
# low_memory=False has pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
ross_df = pd.read_csv(
    'train.csv',
    low_memory=False,
)
ross_df

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id
0,1,5,2015-07-17,4852,519,1,1,0,0,303445
1,2,5,2015-07-17,4518,495,1,1,0,1,959585
2,3,5,2015-07-17,6679,673,1,1,0,1,739744
3,4,5,2015-07-17,10514,1343,1,1,0,1,864001
4,5,5,2015-07-17,4355,513,1,1,0,1,981931
...,...,...,...,...,...,...,...,...,...,...
1001594,1111,2,2013-01-01,0,0,0,0,a,1,225066
1001595,1112,2,2013-01-01,0,0,0,0,a,1,775592
1001596,1113,2,2013-01-01,0,0,0,0,a,1,135205
1001597,1114,2,2013-01-01,0,0,0,0,a,1,954751


In [5]:
# Load the per-store attribute table from the working directory.
# low_memory=False has pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
store_df = pd.read_csv(
    'store.csv',
    low_memory=False,
)
store_df

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,
...,...,...,...,...,...,...,...,...,...,...
1110,1111,a,a,1900.0,6.0,2014.0,1,31.0,2013.0,"Jan,Apr,Jul,Oct"
1111,1112,c,c,1880.0,4.0,2006.0,0,,,
1112,1113,a,c,9260.0,,,0,,,
1113,1114,a,c,870.0,,,0,,,


In [6]:
# Attach the store-level attributes to every row of the training data.
# how='left' keeps every row of ross_df, including any whose 'Store' has no
# match in store_df. validate='many_to_one' makes pandas raise a MergeError
# if store_df contains a repeated 'Store' key, which would otherwise silently
# duplicate training rows.
merge_df = pd.merge(
    ross_df,
    store_df,
    on='Store',
    how='left',
    validate='many_to_one',
)
merge_df

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-17,4852,519,1,1,0,0,303445,c,a,1270.0,9.0,2008.0,0,,,
1,2,5,2015-07-17,4518,495,1,1,0,1,959585,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,5,2015-07-17,6679,673,1,1,0,1,739744,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,5,2015-07-17,10514,1343,1,1,0,1,864001,c,c,620.0,9.0,2009.0,0,,,
4,5,5,2015-07-17,4355,513,1,1,0,1,981931,a,a,29910.0,4.0,2015.0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001594,1111,2,2013-01-01,0,0,0,0,a,1,225066,a,a,1900.0,6.0,2014.0,1,31.0,2013.0,"Jan,Apr,Jul,Oct"
1001595,1112,2,2013-01-01,0,0,0,0,a,1,775592,c,c,1880.0,4.0,2006.0,0,,,
1001596,1113,2,2013-01-01,0,0,0,0,a,1,135205,a,c,9260.0,,,0,,,
1001597,1114,2,2013-01-01,0,0,0,0,a,1,954751,a,c,870.0,,,0,,,


In [9]:
# Remove the 'Id' column from the merged training frame.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because 'Id' has already been dropped.
merge_df = merge_df.drop(columns=['Id'], errors='ignore')

# Keep the shape as the final expression so the notebook displays it; a bare
# expression that is not the last line of a cell produces no output.
merge_df.shape

In [15]:
# Load the test data and enrich it with the same store attributes that are
# joined onto the training data.
test_df = pd.read_csv('test.csv', low_memory=False)

# how='left' keeps every test row; validate='many_to_one' raises a MergeError
# if store_df contains a repeated 'Store' key, instead of silently duplicating
# test rows.
merge_test_df = pd.merge(
    test_df,
    store_df,
    on='Store',
    how='left',
    validate='many_to_one',
)

In [13]:
# Display the merged training frame; for a frame this large pandas renders
# only the first and last rows and elides the middle.
merge_df

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-17,4852,519,1,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
1,2,5,2015-07-17,4518,495,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,5,2015-07-17,6679,673,1,1,0,1,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,5,2015-07-17,10514,1343,1,1,0,1,c,c,620.0,9.0,2009.0,0,,,
4,5,5,2015-07-17,4355,513,1,1,0,1,a,a,29910.0,4.0,2015.0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001594,1111,2,2013-01-01,0,0,0,0,a,1,a,a,1900.0,6.0,2014.0,1,31.0,2013.0,"Jan,Apr,Jul,Oct"
1001595,1112,2,2013-01-01,0,0,0,0,a,1,c,c,1880.0,4.0,2006.0,0,,,
1001596,1113,2,2013-01-01,0,0,0,0,a,1,a,c,9260.0,,,0,,,
1001597,1114,2,2013-01-01,0,0,0,0,a,1,a,c,870.0,,,0,,,


# Cleaning Data

In [19]:
# Column dtypes and non-null counts (printed to stdout).
merge_df.info()

# Summary statistics for the numeric columns, rounded to two decimals.
(
    merge_df
    .describe()
    .round(2)
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001599 entries, 0 to 1001598
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1001599 non-null  int64  
 1   DayOfWeek                  1001599 non-null  int64  
 2   Date                       1001599 non-null  object 
 3   Sales                      1001599 non-null  int64  
 4   Customers                  1001599 non-null  int64  
 5   Open                       1001599 non-null  int64  
 6   Promo                      1001599 non-null  int64  
 7   StateHoliday               1001599 non-null  object 
 8   SchoolHoliday              1001599 non-null  int64  
 9   StoreType                  1001599 non-null  object 
 10  Assortment                 1001599 non-null  object 
 11  CompetitionDistance        998999 non-null   float64
 12  CompetitionOpenSinceMonth  683207 non-null   float64
 13  CompetitionO

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear
count,1001599.0,1001599.0,1001599.0,1001599.0,1001599.0,1001599.0,1001599.0,998999.0,683207.0,683207.0,1001599.0,501184.0,501184.0
mean,558.44,4.0,5770.57,633.3,0.83,0.38,0.17,5430.48,7.22,2008.69,0.5,23.26,2011.75
std,321.91,2.0,3852.57,464.94,0.38,0.49,0.38,7716.19,3.21,5.99,0.5,14.1,1.66
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0,1900.0,0.0,1.0,2009.0
25%,280.0,2.0,3722.0,404.0,1.0,0.0,0.0,710.0,4.0,2006.0,0.0,13.0,2011.0
50%,558.0,4.0,5741.0,610.0,1.0,0.0,0.0,2330.0,8.0,2010.0,1.0,22.0,2012.0
75%,838.0,6.0,7854.0,838.0,1.0,1.0,0.0,6890.0,10.0,2013.0,1.0,37.0,2013.0
max,1115.0,7.0,41551.0,7388.0,1.0,1.0,1.0,75860.0,12.0,2015.0,1.0,50.0,2015.0


In [20]:
# Count rows that exactly repeat an earlier row across all columns.
merge_df.duplicated().sum()

np.int64(0)

In [21]:
# Convert the 'Date' column to datetime in both the training and the test
# frame so the two stay consistent.
for frame in (merge_df, merge_test_df):
    frame['Date'] = pd.to_datetime(frame['Date'])

# Exploratory Data Analysis (EDA)