In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
#colors=px.colors.qualitative.Plotly
temp = dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12), width=1400))
from itertools import cycle

colors = cycle(px.colors.sequential.Viridis + px.colors.sequential.Viridis + px.colors.sequential.Plasma + px.colors.sequential.solar)


# for dirname, _, filenames in os.walk('/kaggle/input/store-sales-time-series-forecasting'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))
        
CUR_DIR= "/kaggle/input/store-sales-time-series-forecasting"

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
%%time

df_oil = pd.read_csv(os.path.join(CUR_DIR, 'oil.csv'), parse_dates=['date'])
df_sample_submission = pd.read_csv(os.path.join(CUR_DIR, 'sample_submission.csv'))
df_holidays_events = pd.read_csv(os.path.join(CUR_DIR, 'holidays_events.csv'), parse_dates=['date'])
df_stores = pd.read_csv(os.path.join(CUR_DIR, 'stores.csv'))
df_train = pd.read_csv(os.path.join(CUR_DIR, 'train.csv'), parse_dates=['date'])
df_test = pd.read_csv(os.path.join(CUR_DIR, 'test.csv'), parse_dates=['date'])
df_transactions = pd.read_csv(os.path.join(CUR_DIR, 'transactions.csv'), parse_dates=['date'])

CPU times: user 1.95 s, sys: 405 ms, total: 2.35 s
Wall time: 3.4 s


In [3]:
def summarize(df, file_name, n_rows_to_show=5):
    """Simply summarize the given DataFrame.
    
    Parameters:
        df: pd.DataFrame, raw DataFrame
        file_name: str, name of the file
        n_rows_to_show: int, number of rows to show 
    """
    print(f"=====Summary of {file_name}=====")
    print(f"\n\nThe data shape and types are:\n")
    display(df.info())
    print(f"\n\nThe column data statistics: \n{df.describe()}\n\n")
    print(f"The number of unique values per column: \n{df.nunique()}\n\n")
    
    
    nan_ratio = pd.isna(df).sum() / len(df) * 100
    nan_ratio.sort_values(ascending=False, inplace=True)
    nan_ratio = nan_ratio.to_frame(name='NaN Ratio').T
    print("NaN ratio:")
    display(nan_ratio)   
    
    display(df.head(n_rows_to_show))

# Train Data

In [6]:
summarize(df_train,"df_train",10)

=====Summary of df_train=====


The data shape and types are:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 137.4+ MB


None



The column data statistics: 
                 id     store_nbr         sales   onpromotion
count  3.000888e+06  3.000888e+06  3.000888e+06  3.000888e+06
mean   1.500444e+06  2.750000e+01  3.577757e+02  2.602770e+00
std    8.662819e+05  1.558579e+01  1.101998e+03  1.221888e+01
min    0.000000e+00  1.000000e+00  0.000000e+00  0.000000e+00
25%    7.502218e+05  1.400000e+01  0.000000e+00  0.000000e+00
50%    1.500444e+06  2.750000e+01  1.100000e+01  0.000000e+00
75%    2.250665e+06  4.100000e+01  1.958473e+02  0.000000e+00
max    3.000887e+06  5.400000e+01  1.247170e+05  7.410000e+02


The number of unique values per column: 
id             3000888
date              1684
store_nbr           54
family              33
sales           379610
onpromotion        362
dtype: int64


NaN ratio:


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
NaN Ratio,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0
5,5,2013-01-01,1,BREAD/BAKERY,0.0,0
6,6,2013-01-01,1,CELEBRATION,0.0,0
7,7,2013-01-01,1,CLEANING,0.0,0
8,8,2013-01-01,1,DAIRY,0.0,0
9,9,2013-01-01,1,DELI,0.0,0


=====Summary of df_holidays_events=====


The data shape and types are:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         350 non-null    datetime64[ns]
 1   type         350 non-null    object        
 2   locale       350 non-null    object        
 3   locale_name  350 non-null    object        
 4   description  350 non-null    object        
 5   transferred  350 non-null    bool          
dtypes: bool(1), datetime64[ns](1), object(4)
memory usage: 14.1+ KB


None



The column data statistics: 
                       date     type    locale locale_name description  \
count                   350      350       350         350         350   
unique                  312        6         3          24         103   
top     2014-06-25 00:00:00  Holiday  National     Ecuador    Carnaval   
freq                      4      221       174         174          10   
first   2012-03-02 00:00:00      NaN       NaN         NaN         NaN   
last    2017-12-26 00:00:00      NaN       NaN         NaN         NaN   

       transferred  
count          350  
unique           2  
top          False  
freq           338  
first          NaN  
last           NaN  


The number of unique values per column: 
date           312
type             6
locale           3
locale_name     24
description    103
transferred      2
dtype: int64


NaN ratio:






Unnamed: 0,date,type,locale,locale_name,description,transferred
NaN Ratio,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [54]:
summarize(df_stores,"df_stores")

=====Summary of df_stores=====


The data shape and types are:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   store_nbr  54 non-null     int64 
 1   city       54 non-null     object
 2   state      54 non-null     object
 3   type       54 non-null     object
 4   cluster    54 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 2.2+ KB


None



The column data statistics: 
       store_nbr    cluster
count  54.000000  54.000000
mean   27.500000   8.481481
std    15.732133   4.693395
min     1.000000   1.000000
25%    14.250000   4.000000
50%    27.500000   8.500000
75%    40.750000  13.000000
max    54.000000  17.000000


The number of unique values per column: 
store_nbr    54
city         22
state        16
type          5
cluster      17
dtype: int64


NaN ratio:


Unnamed: 0,store_nbr,city,state,type,cluster
NaN Ratio,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [55]:
summarize(df_transactions,"df_transactions")

=====Summary of df_transactions=====


The data shape and types are:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          83488 non-null  datetime64[ns]
 1   store_nbr     83488 non-null  int64         
 2   transactions  83488 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 1.9 MB


None



The column data statistics: 
          store_nbr  transactions
count  83488.000000  83488.000000
mean      26.939237   1694.602158
std       15.608204    963.286644
min        1.000000      5.000000
25%       13.000000   1046.000000
50%       27.000000   1393.000000
75%       40.000000   2079.000000
max       54.000000   8359.000000


The number of unique values per column: 
date            1682
store_nbr         54
transactions    4993
dtype: int64


NaN ratio:


Unnamed: 0,date,store_nbr,transactions
NaN Ratio,0.0,0.0,0.0


Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [56]:
train_date=df_train.date.unique()
train_date

array(['2013-01-01T00:00:00.000000000', '2013-01-02T00:00:00.000000000',
       '2013-01-03T00:00:00.000000000', ...,
       '2017-08-13T00:00:00.000000000', '2017-08-14T00:00:00.000000000',
       '2017-08-15T00:00:00.000000000'], dtype='datetime64[ns]')

In [5]:
train_date=df_train.date.unique()
total_sales=df_train.groupby('date')['sales'].sum().div(100)
promotion=df_train.groupby('date')['onpromotion'].sum()
avg_sales=df_train.groupby('date')['sales'].mean()


fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=train_date,
        y= total_sales,
        name = "total sales", mode='lines'
    ))

fig.add_trace(
    go.Scatter(
        x=train_date,
        y= promotion,
        name = "promotionL", mode='lines'
    ))

# fig.add_trace(
#     go.Bar(
#         x=train_date,
#         y=promotion,
#         name = "promotion"
#     ))


fig.update_xaxes(rangeslider_visible= True,
                 rangeselector=dict(
                     buttons=list([
                         dict(count=6, label="6m", step="month", stepmode="backward"),
                         dict(count=1, label="1y", step="year", stepmode="backward"),
                         dict(count=2, label="2y", step="year", stepmode="backward"),
                         dict(count=4, label="4y", step="year", stepmode="backward"),
                         dict(step="all")])))
fig.update_layout(template=temp,title='Total Sales/100  and Items on Promotion', 
                  autosize=False, 
                  width=1400, 
                  height=700, 
                  xaxis_title="Date", 
                  yaxis_title="",
                  hovermode='x unified',
                  showlegend=True)
fig.show()

In [None]:
buttons

In [47]:
train_date=df_train.date.unique()
family = df_train.family.unique().tolist()
family.insert(0, 'All')
total_sales=df_train.groupby('date')['sales'].sum()
promotion=df_train.groupby('date')['onpromotion'].sum()
#avg_sales=df_train.groupby('date')['sales'].mean()

buttons=[]



fig = go.Figure()

for i in range (len(family)):
    if i != 0:
        
        total_sales=df_train[df_train.family == family[i]].groupby('date')['sales'].sum()
        #promotion= df_train[df_train.family == family[i]].groupby('date')['onpromotion'].sum()
        
    fig.add_trace(go.Scatter(x=train_date, y= total_sales,name = str(family[i]) + " total sales", mode='lines' ,marker_color = next(colors) ))
    #fig.add_trace(go.Scatter(x=train_date,y= promotion,name = str(family[i]) + " promotion", mode='lines'))
    
    visibility=[False]*len(family)
    visibility[i]=True
    
    button = dict(label = family[i],
                  method = "update",
                  args=[{"visible": visibility}]
                 )
    buttons.append(button)


# fig.add_trace(
#     go.Scatter(
#         x=train_date,
#         y= promotion,
#         name = "promotionL", mode='lines'
#     ))

# fig.add_trace(
#     go.Bar(
#         x=train_date,
#         y=promotion,
#         name = "promotion"
#     ))


fig.update_xaxes(rangeslider_visible= True,
                 rangeselector=dict(
                     buttons=list([
                         dict(count=6, label="6m", step="month", stepmode="backward"),
                         dict(count=1, label="1y", step="year", stepmode="backward"),
                         dict(count=2, label="2y", step="year", stepmode="backward"),
                         dict(count=4, label="4y", step="year", stepmode="backward"),
                         dict(step="all")]), xanchor='left',yanchor='bottom', y=1.16, x=.01))

fig.update_layout(template=temp,title='Total Sales per family', 
                  #autosize=False, 
                  width=1400, 
                  height=700, 
                  xaxis_title="Date", 
                  yaxis_title="",
                  hovermode='x unified',
                  showlegend=True,
                  updatemenus=[
                      dict(
                          type="dropdown",
                          active=0,
                          buttons=buttons,
                          xanchor='left',yanchor='bottom', y=1.01, x=.01,)],
                  
                 )
fig.show()

# df_oil

In [4]:
summarize(df_oil,"df_oil")

=====Summary of df_oil=====


The data shape and types are:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        1218 non-null   datetime64[ns]
 1   dcoilwtico  1175 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 19.2 KB


None



The column data statistics: 
        dcoilwtico
count  1175.000000
mean     67.714366
std      25.630476
min      26.190000
25%      46.405000
50%      53.190000
75%      95.660000
max     110.620000


The number of unique values per column: 
date          1218
dcoilwtico     998
dtype: int64


NaN ratio:


Unnamed: 0,dcoilwtico,date
NaN Ratio,3.530378,0.0


Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [7]:
df= df_oil
fig = px.line(df, x="date", y="dcoilwtico")
fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [4]:
new_train= pd.merge(df_train,df_oil,how="left",on=["date"])

In [5]:
summarize(new_train,"new_train")

=====Summary of new_train=====


The data shape and types are:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000888 entries, 0 to 3000887
Data columns (total 7 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
 6   dcoilwtico   float64       
dtypes: datetime64[ns](1), float64(2), int64(3), object(1)
memory usage: 183.2+ MB


None



The column data statistics: 
                 id     store_nbr         sales   onpromotion    dcoilwtico
count  3.000888e+06  3.000888e+06  3.000888e+06  3.000888e+06  2.072466e+06
mean   1.500444e+06  2.750000e+01  3.577757e+02  2.602770e+00  6.792559e+01
std    8.662819e+05  1.558579e+01  1.101998e+03  1.221888e+01  2.566633e+01
min    0.000000e+00  1.000000e+00  0.000000e+00  0.000000e+00  2.619000e+01
25%    7.502218e+05  1.400000e+01  0.000000e+00  0.000000e+00  4.638000e+01
50%    1.500444e+06  2.750000e+01  1.100000e+01  0.000000e+00  5.333000e+01
75%    2.250665e+06  4.100000e+01  1.958473e+02  0.000000e+00  9.580000e+01
max    3.000887e+06  5.400000e+01  1.247170e+05  7.410000e+02  1.106200e+02


The number of unique values per column: 
id             3000888
date              1684
store_nbr           54
family              33
sales           379610
onpromotion        362
dcoilwtico         994
dtype: int64


NaN ratio:


Unnamed: 0,dcoilwtico,id,date,store_nbr,family,sales,onpromotion
NaN Ratio,30.938242,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,
1,1,2013-01-01,1,BABY CARE,0.0,0,
2,2,2013-01-01,1,BEAUTY,0.0,0,
3,3,2013-01-01,1,BEVERAGES,0.0,0,
4,4,2013-01-01,1,BOOKS,0.0,0,


In [11]:
train_date=new_train.date.unique()
family = new_train.family.unique().tolist()
family.insert(0, 'All')
total_sales=new_train.groupby('date')['sales'].sum()/1000
#promotion=new_train.groupby('date')['onpromotion'].sum()
#avg_sales=df_train.groupby('date')['sales'].mean()
oil= df_oil.dcoilwtico

buttons=[]



fig = go.Figure()

for i in range (len(family)):
    if i != 0:
        
        total_sales=df_train[new_train.family == family[i]].groupby('date')['sales'].sum()/1000
        #promotion= df_train[df_train.family == family[i]].groupby('date')['onpromotion'].sum()
        
    fig.add_trace(go.Scatter(x=train_date, y= total_sales,name = str(family[i]) + " total sales", mode='lines' ,marker_color = next(colors) ))
    #fig.add_trace(go.Scatter(x=train_date,y= promotion,name = str(family[i]) + " promotion", mode='lines'))
    
    visibility=[False]*len(family)
    visibility[i]=True
    
    button = dict(label = family[i],
                  method = "update",
                  args=[{"visible": visibility}]
                 )
    buttons.append(button)


fig.add_trace(
    go.Scatter(
        x=train_date,
        y= oil,
        name = "oil", mode='lines'
    ))

# fig.add_trace(
#     go.Bar(
#         x=train_date,
#         y=promotion,
#         name = "promotion"
#     ))


fig.update_xaxes(rangeslider_visible= True,
                 rangeselector=dict(
                     buttons=list([
                         dict(count=6, label="6m", step="month", stepmode="backward"),
                         dict(count=1, label="1y", step="year", stepmode="backward"),
                         dict(count=2, label="2y", step="year", stepmode="backward"),
                         dict(count=4, label="4y", step="year", stepmode="backward"),
                         dict(step="all")]), xanchor='left',yanchor='bottom', y=1.16, x=.01))

fig.update_layout(template=temp,title='Total Sales/100 per family', 
                  #autosize=False, 
                  width=1400, 
                  height=700, 
                  xaxis_title="Date", 
                  yaxis_title="",
                  hovermode='x unified',
                  showlegend=True,
                  updatemenus=[
                      dict(
                          type="dropdown",
                          active=0,
                          buttons=buttons,
                          xanchor='left',yanchor='bottom', y=1.01, x=.01,)],
                  
                 )
fig.show()


Boolean Series key will be reindexed to match DataFrame index.



Note the relationshiop between produce, poultry, dairy,bread, beverages on a wholistic view.

# df_Holiday_events

In [4]:
summarize(df_holidays_events,"df_holidays_events")

=====Summary of df_holidays_events=====


The data shape and types are:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         350 non-null    datetime64[ns]
 1   type         350 non-null    object        
 2   locale       350 non-null    object        
 3   locale_name  350 non-null    object        
 4   description  350 non-null    object        
 5   transferred  350 non-null    bool          
dtypes: bool(1), datetime64[ns](1), object(4)
memory usage: 14.1+ KB


None



The column data statistics: 
                       date     type    locale locale_name description  \
count                   350      350       350         350         350   
unique                  312        6         3          24         103   
top     2014-06-25 00:00:00  Holiday  National     Ecuador    Carnaval   
freq                      4      221       174         174          10   
first   2012-03-02 00:00:00      NaN       NaN         NaN         NaN   
last    2017-12-26 00:00:00      NaN       NaN         NaN         NaN   

       transferred  
count          350  
unique           2  
top          False  
freq           338  
first          NaN  
last           NaN  


The number of unique values per column: 
date           312
type             6
locale           3
locale_name     24
description    103
transferred      2
dtype: int64


NaN ratio:






Unnamed: 0,date,type,locale,locale_name,description,transferred
NaN Ratio,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [18]:
df_holidays_events["type"].unique()

array(['Holiday', 'Transfer', 'Additional', 'Bridge', 'Work Day', 'Event'],
      dtype=object)

In [6]:
df_holidays_events["locale"].unique()

array(['Local', 'Regional', 'National'], dtype=object)

In [None]:
df_holidays_events[df_holidays_events["type"]=='Work Day']


In [5]:
df_holidays_events.drop(columns=['locale_name', 'description', 'transferred'],inplace = True)

In [6]:
new_train= pd.merge(new_train,df_holidays_events,how="left",on=["date"])

In [8]:
summarize(new_train, "new train")

=====Summary of new train=====


The data shape and types are:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3054348 entries, 0 to 3054347
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
 6   dcoilwtico   float64       
 7   type         object        
 8   locale       object        
dtypes: datetime64[ns](1), float64(2), int64(3), object(3)
memory usage: 233.0+ MB


None



The column data statistics: 
                 id     store_nbr         sales   onpromotion    dcoilwtico
count  3.054348e+06  3.054348e+06  3.054348e+06  3.054348e+06  2.099196e+06
mean   1.504277e+06  2.750000e+01  3.590209e+02  2.617480e+00  6.801587e+01
std    8.662610e+05  1.558579e+01  1.107286e+03  1.225494e+01  2.569134e+01
min    0.000000e+00  1.000000e+00  0.000000e+00  0.000000e+00  2.619000e+01
25%    7.546768e+05  1.400000e+01  0.000000e+00  0.000000e+00  4.641000e+01
50%    1.507572e+06  2.750000e+01  1.100000e+01  0.000000e+00  5.343000e+01
75%    2.255120e+06  4.100000e+01  1.960110e+02  0.000000e+00  9.581000e+01
max    3.000887e+06  5.400000e+01  1.247170e+05  7.410000e+02  1.106200e+02


The number of unique values per column: 
id             3000888
date              1684
store_nbr           54
family              33
sales           379610
onpromotion        362
dcoilwtico         994
type                 6
locale               3
dtype: int64


NaN ratio:


Unnamed: 0,type,locale,dcoilwtico,id,date,store_nbr,family,sales,onpromotion
NaN Ratio,83.547258,83.547258,31.271879,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico,type,locale
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,,Holiday,National
1,1,2013-01-01,1,BABY CARE,0.0,0,,Holiday,National
2,2,2013-01-01,1,BEAUTY,0.0,0,,Holiday,National
3,3,2013-01-01,1,BEVERAGES,0.0,0,,Holiday,National
4,4,2013-01-01,1,BOOKS,0.0,0,,Holiday,National


How to create a bargraph that plots a bar if 

In [32]:
holidays= df_holidays_events.groupby('date')['type'].count()
holidays

date
2012-03-02    1
2012-04-01    1
2012-04-12    1
2012-04-14    1
2012-04-21    1
             ..
2017-12-22    2
2017-12-23    1
2017-12-24    1
2017-12-25    1
2017-12-26    1
Name: type, Length: 312, dtype: int64

In [28]:
repeated_holidays= holidays.loc[lambda x: x>1].index.format()
repeated_holidays

['2012-06-25',
 '2012-07-03',
 '2012-12-22',
 '2012-12-24',
 '2012-12-31',
 '2013-05-12',
 '2013-06-25',
 '2013-07-03',
 '2013-12-22',
 '2014-06-25',
 '2014-07-03',
 '2014-12-22',
 '2014-12-26',
 '2015-06-25',
 '2015-07-03',
 '2015-12-22',
 '2016-04-21',
 '2016-05-01',
 '2016-05-07',
 '2016-05-08',
 '2016-05-12',
 '2016-06-25',
 '2016-07-03',
 '2016-07-24',
 '2016-11-12',
 '2016-12-22',
 '2017-04-14',
 '2017-06-25',
 '2017-07-03',
 '2017-12-08',
 '2017-12-22']

In [27]:
len(repeated_holidays)

31

In [23]:
#df_holidays_events = pd.read_csv(os.path.join(CUR_DIR, 'holidays_events.csv'), parse_dates=['date'])

In [25]:
df_holidays_events.loc[df_holidays_events["date"].isin(holidays.loc[lambda x: x>1].index.format())].head(15)

Unnamed: 0,date,type,locale,locale_name,description,transferred
7,2012-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
8,2012-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
9,2012-06-25,Holiday,Local,Machala,Fundacion de Machala,False
10,2012-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False
11,2012-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False
32,2012-12-22,Holiday,Local,Salinas,Cantonizacion de Salinas,False
33,2012-12-22,Additional,National,Ecuador,Navidad-3,False
35,2012-12-24,Bridge,National,Ecuador,Puente Navidad,False
36,2012-12-24,Additional,National,Ecuador,Navidad-1,False
39,2012-12-31,Bridge,National,Ecuador,Puente Primer dia del ano,False


In [30]:
df_holidays_events.loc[df_holidays_events["locale_name"] == "Latacunga"]

Unnamed: 0,date,type,locale,locale_name,description,transferred
8,2012-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
26,2012-11-11,Holiday,Local,Latacunga,Independencia de Latacunga,False
60,2013-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
79,2013-11-11,Holiday,Local,Latacunga,Independencia de Latacunga,False
110,2014-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
142,2014-11-11,Holiday,Local,Latacunga,Independencia de Latacunga,False
178,2015-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
196,2015-11-11,Holiday,Local,Latacunga,Independencia de Latacunga,False
260,2016-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
281,2016-11-11,Holiday,Local,Latacunga,Independencia de Latacunga,False


In [15]:
holidays2= df_holidays_events.groupby('date')['type'].nunique()
holidays2

date
2012-03-02    1
2012-04-01    1
2012-04-12    1
2012-04-14    1
2012-04-21    1
             ..
2017-12-22    2
2017-12-23    1
2017-12-24    1
2017-12-25    1
2017-12-26    1
Name: type, Length: 312, dtype: int64

In [13]:
holidays2.index.duplicated()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [12]:
holidays3 = holidays2[~holidays2.index.duplicated()]
print(len(holidays2))
print(len(holidays3))


312
312


In [17]:
train_date=new_train.date.unique()
family = new_train.family.unique().tolist()
family.insert(0, 'All')
total_sales=new_train.groupby('date')['sales'].sum()/1000
#promotion=new_train.groupby('date')['onpromotion'].sum()
#avg_sales=df_train.groupby('date')['sales'].mean()
holidays= df_holidays_events.type.count()* 10

buttons=[]



fig = go.Figure()

for i in range (len(family)):
    if i != 0:
        
        total_sales=df_train[new_train.family == family[i]].groupby('date')['sales'].sum()/1000
        #promotion= df_train[df_train.family == family[i]].groupby('date')['onpromotion'].sum()
        
    fig.add_trace(go.Scatter(x=train_date, y= total_sales,name = str(family[i]) + " total sales", mode='lines' ,marker_color = next(colors) ))
    #fig.add_trace(go.Scatter(x=train_date,y= promotion,name = str(family[i]) + " promotion", mode='lines'))
    
    visibility=[False]*len(family)
    visibility[i]=True
    
    button = dict(label = family[i],
                  method = "update",
                  args=[{"visible": visibility}]
                 )
    buttons.append(button)


# fig.add_trace(
#     go.Scatter(
#         x=train_date,
#         y= oil,
#         name = "oil", mode='lines'
#     ))

fig.add_trace(
    go.Bar(
        x=train_date,
        y=holidays,
        name = "holiday"
    ))


fig.update_xaxes(rangeslider_visible= True,
                 rangeselector=dict(
                     buttons=list([
                         dict(count=6, label="6m", step="month", stepmode="backward"),
                         dict(count=1, label="1y", step="year", stepmode="backward"),
                         dict(count=2, label="2y", step="year", stepmode="backward"),
                         dict(count=4, label="4y", step="year", stepmode="backward"),
                         dict(step="all")]), xanchor='left',yanchor='bottom', y=1.16, x=.01))

fig.update_layout(template=temp,title='Total Sales/100 per family', 
                  #autosize=False, 
                  width=1400, 
                  height=700, 
                  xaxis_title="Date", 
                  yaxis_title="",
                  hovermode='x unified',
                  showlegend=True,
                  updatemenus=[
                      dict(
                          type="dropdown",
                          active=0,
                          buttons=buttons,
                          xanchor='left',yanchor='bottom', y=1.01, x=.01,)],
                  
                 )
fig.show()


Boolean Series key will be reindexed to match DataFrame index.



ValueError: 
    Invalid value of type 'numpy.int64' received for the 'y' property of bar
        Received value: 3500

    The 'y' property is an array that may be specified as a tuple,
    list, numpy array, or pandas Series

# *To do list*
1. Merge Datasets
2. Barchart of best performing families with time component
3. line Graph of all families together with correct legend
4. Add days and months as new features