In [None]:
import os

def scale_input_data(scale_factor):
  """Write a row-scaled ``.scaled.csv`` copy of each auxiliary input table.

  For each of ``./input/items``, ``./input/transactions``, ``./input/stores``
  and ``./input/oil``, the file ``<base>.csv`` is read and
  ``<base>.scaled.csv`` is written with ``int(scale_factor * row_count)`` rows:

  * ``scale_factor == 1.0`` -- the file is copied as-is, without parsing it.
  * fewer rows than the source -- only the leading rows are kept.
  * more rows than the source -- the source rows are repeated cyclically,
    starting again from the first row, until the target count is reached.

  Args:
    scale_factor: Non-negative multiplier for the row count of every table.

  Raises:
    ValueError: If ``scale_factor`` is negative or NaN. A negative factor
      would otherwise reach ``iloc[:n]`` with a negative ``n``, which trims
      rows off the end instead of rejecting the request.
  """
  # Imported locally (once, not per file): this cell only imports `os` at
  # module level, so `pd` is not available as a global here.
  import shutil

  import pandas as pd

  # `not (x >= 0)` is also true for NaN. Validate before touching any file so
  # a bad factor cannot leave a half-written set of scaled inputs behind.
  if not scale_factor >= 0:
    raise ValueError('scale_factor must be non-negative, got %r' % (scale_factor,))

  file_bases = ['./input/items', './input/transactions', './input/stores', './input/oil']
  for file_base in file_bases:
    source_path = file_base + '.csv'
    scaled_path = file_base + '.scaled.csv'
    if scale_factor == 1.0:
      # Nothing to rescale: a byte-for-byte copy avoids a parse/serialize round trip.
      shutil.copyfile(source_path, scaled_path)
      continue
    df_to_scale = pd.read_csv(source_path)
    source_rows = len(df_to_scale)
    new_num_rows = int(scale_factor * source_rows)
    if new_num_rows > source_rows:
      # Growing: stack enough whole copies in a single concat (ceil division),
      # then trim the overshoot below. source_rows is > 0 here because
      # new_num_rows > source_rows is impossible for an empty table.
      repeats = -(-new_num_rows // source_rows)
      df_to_scale = pd.concat([df_to_scale] * repeats)
    df_to_scale.iloc[:new_num_rows].to_csv(scaled_path, index=False)

# Number of rows read from ./input/train.csv when no scale factor applies.
default_nrows = 25000000
if 'INPUT_SCALE_FACTOR' in os.environ:
  # Explicit scaling requested: regenerate the *.scaled.csv inputs and persist
  # the matching row budget so a later run without the variable can reuse it.
  scale_factor = float(os.environ['INPUT_SCALE_FACTOR'])
  scale_input_data(scale_factor)
  nrows = int(scale_factor * default_nrows)
  with open('./input/data.txt', 'w') as file:
    file.write(str(nrows))
elif os.path.exists('./input/data.txt'):
  # Reuse the row budget recorded by an earlier scaled run.
  with open('./input/data.txt', 'r') as file:
    try:
      nrows = int(file.read().strip())
    except ValueError:
      # Only an unparsable value falls back to the default; anything else
      # (including KeyboardInterrupt) is no longer silently swallowed.
      nrows = default_nrows
else:
  # NOTE(review): scale_input_data() is not called on this path (nor on the
  # data.txt path above), yet later cells read ./input/*.scaled.csv -- those
  # files must already exist from an earlier run. Confirm this is intended.
  nrows = default_nrows

**Introduction**
As part of this challenge we are trying to predict the sales of various items sold by the retailer Favorita. The following datasets are provided to us:
*  train.csv
* stores.csv
* transactions.csv
* items.csv
* holidays_events.csv
* oil.csv

**Description of Dataset**
An explanation of the dataset is available here: [https://www.kaggle.com/c/favorita-grocery-sales-forecasting/data](https://www.kaggle.com/c/favorita-grocery-sales-forecasting/data)


* The first step is to read the data from the various datasets and carry out some basic analysis. We use pandas `read_csv` to read each dataset.
* The second step is to analyse the data by means of various graphs and to try to understand how the different datasets we are given correlate with each other.
* The third step is to carry out feature engineering, whereby we identify some of the key features of the dataset.
* The fourth and final step is to train the model and test it. Maybe we could use K-Means?

**I. Reading various data from input**

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# NOTE(review): the next line executes whatever Python source is stored in the
# IREWR_IMPORTS environment variable and raises KeyError when it is unset.
# Presumably it is what binds `pd` (the direct pandas import above is commented
# out, yet later cells call pd.read_csv) -- confirm, and only run this notebook
# where the contents of that variable are trusted.
exec(os.environ['IREWR_IMPORTS'])
import matplotlib.pyplot as plt

# FIRST-AUTHOR: remove ML code
# import xgboost as xgb

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# FIRST-AUTHOR: remove path printing
# from subprocess import check_output
# print(check_output(["ls", "./input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

In [2]:
# The training file is very large, so only its first `nrows` rows are loaded
# (`nrows` is computed in the setup cell at the top of the notebook).
train = pd.read_csv(
    "./input/train.csv",
    index_col='id',
    parse_dates=['date'],
    nrows=nrows,
)

# Show the last five loaded rows to get a feel for the available columns.
train.tail(n=5)

  train = pd.read_csv("./input/train.csv", nrows=25000000, parse_dates=['date'],index_col='id')


Unnamed: 0_level_0,date,store_nbr,item_nbr,unit_sales,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
24999995,2014-06-07,7,413987,12.0,False
24999996,2014-06-07,7,414305,13.0,False
24999997,2014-06-07,7,414353,21.0,False
24999998,2014-06-07,7,414354,4.0,False
24999999,2014-06-07,7,414421,6.0,False


The training data and the items CSV are related to each other. So let's read the items CSV and merge it with the training data, which will help us get more insight from the data.

In [3]:
# Item metadata table. The `.scaled.csv` variant is written by
# scale_input_data() in the setup cell, which only runs when INPUT_SCALE_FACTOR
# is set -- otherwise the file must already exist from an earlier run.
items = pd.read_csv("./input/items.scaled.csv")

In [4]:
# Attach the item metadata to every sales row. The join key is named explicitly
# so the merge cannot silently start joining on additional columns should the
# two tables ever share another column name. Note that the merge does not keep
# the `id` index of `train`; the result gets a fresh integer index.
train_items = pd.merge(train, items, how='inner', on='item_nbr')
train_items.tail(5)

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,family,class,perishable
24999995,2014-06-06,45,1303141,1.0,False,BREAD/BAKERY,2714,1
24999996,2014-06-06,48,1303141,3.0,False,BREAD/BAKERY,2714,1
24999997,2014-06-06,51,1303141,5.0,False,BREAD/BAKERY,2714,1
24999998,2014-06-07,3,1303141,9.0,False,BREAD/BAKERY,2714,1
24999999,2014-06-07,5,1303141,8.0,False,BREAD/BAKERY,2714,1


After merging two sets of data (training and items) we can now carry out analysis of items sold.

In [5]:
# Find the most popular items across all loaded training rows: total the unit
# sales per item_nbr and order the totals ascending.
df = (
    train_items
    .groupby('item_nbr')['unit_sales']
    .sum()
    .sort_values()
)
# Keep the ten items with the largest totals.
df_highest = df.nlargest(10)

#Plot the highest list of items.
# FIRST-AUTHOR: remove plotting
# df_highest.plot(kind='bar',figsize = (10,10),  title = "Top 10 items sold across all stores")
# plt.show()

In [6]:
# Next, the ten items with the lowest total unit sales (taken from the per-item
# totals in `df`) -- items for which stocking probably matters least.
df_lowest = df.nsmallest(10)
# FIRST-AUTHOR: remove plotting
# df_lowest.plot(kind='bar',figsize = (10,10),  title = "Bottom 10 items sold")
# plt.show()

In [7]:
# To look at item popularity per calendar period (e.g. to spot recently
# introduced items), make sure `date` is a datetime column and derive one
# column per date part from it.

train_items['date'] = pd.to_datetime(train_items['date'], format='%Y-%m-%d')
# Same column order as before: day, month, quarter, year.
for date_part in ('day', 'month', 'quarter', 'year'):
    train_items[date_part + '_item_purchased'] = getattr(train_items['date'].dt, date_part)

In [8]:
# The date parts now live in their own columns, so the raw `date` column is
# removed in place. Re-running this cell alone raises KeyError because the
# column is already gone.
train_items.drop(columns='date', inplace=True)

In [9]:
# Peek at the last two rows of the enriched training frame.
print(train_items.tail(n=2))

          store_nbr  item_nbr  unit_sales onpromotion        family  class  \
24999998          3   1303141         9.0       False  BREAD/BAKERY   2714   
24999999          5   1303141         8.0       False  BREAD/BAKERY   2714   

          perishable  day_item_purchased  month_item_purchased  \
24999998           1                   7                     6   
24999999           1                   7                     6   

          quarter_item_purchased  year_item_purchased  
24999998                       2                 2014  
24999999                       2                 2014  


In [10]:
# Total unit sales per (quarter, item) pair, ordered ascending. Despite the
# `df_year` name, the grouping is by quarter, not by year.
df_year = (
    train_items
    .groupby(['quarter_item_purchased', 'item_nbr'])['unit_sales']
    .sum()
    .sort_values()
)
# The ten (quarter, item) pairs with the largest totals.
df_year_highest = df_year.nlargest(10)
#Plot the highest list of items.
# FIRST-AUTHOR: remove plotting
# df_year_highest.plot(kind='bar',figsize = (10,10),  title = "Top items sold Quarterly")
# plt.show()

In [11]:
# FIRST-AUTHOR: remove plotting
# plt.figure(figsize=(9,10))
# Total unit sales per item family, ordered ascending; then the top ten families.
df_items = (
    train_items
    .groupby(['family'])['unit_sales']
    .sum()
    .sort_values()
)
df_items_highest = df_items.nlargest(10)
# FIRST-AUTHOR: remove plotting
# plt.pie(df_items_highest, labels=df_items_highest.index,shadow=False,autopct='%1.1f%%')
# plt.tight_layout()
# plt.show()
# The removed pie chart used the index as labels; the access is kept, result unused.
_ = df_items_highest.index


In [12]:
# Keep only the sales rows whose item family is exactly 'GROCERY I'.
grocery_info = train_items[train_items['family'] == 'GROCERY I']

In [13]:
# FIRST-AUTHOR: remove plotting
# plt.figure(figsize=(12,12))
# #print (grocery_info.tail(2))
# plt.plot(grocery_info['day_item_purchased'],grocery_info['unit_sales'])
# plt.show()
# With the plot commented out, the two column reads it performed are kept as
# bare accesses; their results are discarded into `_`.
_ = grocery_info['day_item_purchased']
_ = grocery_info['unit_sales']

In [14]:
# FIRST-AUTHOR: remove plotting
# plt.figure(figsize=(9,10))
# Total unit sales per (family, perishable) pair, ordered ascending; then the
# top ten pairs. This rebinds `df_items`, replacing the per-family totals
# computed in an earlier cell.
df_items = (
    train_items
    .groupby(['family', 'perishable'])['unit_sales']
    .sum()
    .sort_values()
)
df_items_perish_highest = df_items.nlargest(10)
# FIRST-AUTHOR: remove plotting
# plt.pie(df_items_perish_highest, labels=df_items_perish_highest.index,shadow=False,autopct='%1.1f%%')
# plt.tight_layout()
# plt.show()
# The removed pie chart used the index as labels; the access is kept, result unused.
_ = df_items_perish_highest.index

Let's read the transactions data and carry out some analysis.

In [15]:
# Transactions table. The `.scaled.csv` variant is written by
# scale_input_data() in the setup cell, which only runs when INPUT_SCALE_FACTOR
# is set -- otherwise the file must already exist from an earlier run.
transaction = pd.read_csv("./input/transactions.scaled.csv")

Convert the date to the pandas datetime format, so that we can group items for a given time frame (monthly, quarterly, yearly).

In [16]:
# Parse `date` as a datetime column, then add one column per calendar part so
# the transactions can be grouped by period.
transaction['date'] = pd.to_datetime(transaction['date'], format='%Y-%m-%d')
# Same column order as before: day, month, quarter, year.
for date_part in ('day', 'month', 'quarter', 'year'):
    transaction[date_part + '_item_purchased'] = getattr(transaction['date'].dt, date_part)
print(transaction.tail(n=2))

            date  store_nbr  transactions  day_item_purchased  \
83486 2017-08-15         53           932                  15   
83487 2017-08-15         54           802                  15   

       month_item_purchased  quarter_item_purchased  year_item_purchased  
83486                     8                       3                 2017  
83487                     8                       3                 2017  


In [17]:
# FIRST-AUTHOR: remove plotting
# plt.figure(figsize=(25,25))
# plt.plot(transaction['date'],transaction['transactions'])
# plt.show()
# With the plot commented out, the two column reads it performed are kept as
# bare accesses; their results are discarded into `_`.
_ = transaction['date']
_ = transaction['transactions']


In [18]:
# FIRST-AUTHOR: remove plotting
# plt.figure(figsize=(8,12))
# Total number of transactions per year (despite the `trans_day` name, the
# grouping key is the year column).
trans_day = transaction.groupby('year_item_purchased')['transactions'].sum()
# FIRST-AUTHOR: remove plotting
# trans_day.plot(kind='bar')
# plt.show()

In [19]:
# Store metadata table; like the other `.scaled.csv` inputs it is produced by
# scale_input_data() in the setup cell. Print the first five rows.
stores = pd.read_csv("./input/stores.scaled.csv")
print(stores.head(5))

   store_nbr           city                           state type  cluster
0          1          Quito                       Pichincha    D       13
1          2          Quito                       Pichincha    D       13
2          3          Quito                       Pichincha    D        8
3          4          Quito                       Pichincha    D        9
4          5  Santo Domingo  Santo Domingo de los Tsachilas    D        4


In [20]:
# Count, per state, the rows of `stores` that have a non-null `city`. This is a
# row count per state, not a count of distinct cities. It also rebinds `df`,
# which earlier held the per-item sales totals.
df = stores.groupby('state')['city'].count()
# FIRST-AUTHOR: remove plotting
# df.plot(kind='bar', figsize = (12,8), yticks=np.arange(min(df), max(df)+1, 1.0), title = "Number of cities in each state")
# plt.show()
# The removed plot computed these two bounds for its y-ticks; the calls are
# kept, results unused.
_ = min(df)
_ = max(df)

In [21]:
# Report whether `onpromotion` holds any non-null value, then build a copy of
# the training frame without that column. NOTE: the column is dropped
# regardless of what the check prints.
print(train['onpromotion'].notna().any())
train_new = train.drop(columns='onpromotion')
print(train_new.tail(n=5))

True
               date  store_nbr  item_nbr  unit_sales
id                                                  
24999995 2014-06-07          7    413987        12.0
24999996 2014-06-07          7    414305        13.0
24999997 2014-06-07          7    414353        21.0
24999998 2014-06-07          7    414354         4.0
24999999 2014-06-07          7    414421         6.0


In [22]:
# Oil price table (a `.scaled.csv` input produced by scale_input_data() in the
# setup cell). Parse `date` and add one column per calendar part.
oils = pd.read_csv("./input/oil.scaled.csv")
oils['date'] = pd.to_datetime(oils['date'], format='%Y-%m-%d')
# Same column order as before: day, month, quarter, year.
for date_part in ('day', 'month', 'quarter', 'year'):
    oils[date_part + '_item_purchased'] = getattr(oils['date'].dt, date_part)

In [23]:
# FIRST-AUTHOR: remove plotting
# plt.figure(figsize=(25,25))
# #trans_day = transaction['transactions'].groupby(transaction['year_item_purchased']).sum()
# plt.plot(oils['date'],oils['dcoilwtico'])
# #trans_day.plot(kind='bar')
# plt.show()
# With the plot commented out, the two column reads it performed are kept as
# bare accesses; their results are discarded into `_`.
_ = oils['date']
_ = oils['dcoilwtico']