In [72]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as ppl

import ydata_profiling as profiler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data/train.csv into the frame that the following cells inspect and clean.
raw_data = pd.read_csv(filepath_or_buffer="data/train.csv")

Item_Identifier ---- Unique product ID

Item_Weight ---- Weight of product

Item_Fat_Content ---- Whether the product is low fat or not

Item_Visibility ---- The % of the total display area of all products in a store allocated to the particular product

Item_Type ---- The category to which the product belongs

Item_MRP ---- Maximum Retail Price (list price) of the product

Outlet_Identifier ---- Unique store ID

Outlet_Establishment_Year ---- The year in which the store was established

Outlet_Size ---- The size of the store in terms of ground area covered

Outlet_Location_Type ---- The type of city in which the store is located

Outlet_Type ---- Whether the outlet is just a grocery store or some sort of supermarket

Item_Outlet_Sales ---- Sales of the product in the particular store. This is the outcome variable to be predicted.


# Data Exploration

In [3]:
# Print the frame summary: column names, non-null counts and dtypes.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


## Dealing with missing values

In [4]:
# Count missing values per column with one vectorised call instead of a Python
# loop that grows an untyped, empty `pd.Series()` one element at a time.
cnt = raw_data.shape[0]
missing = raw_data.isna().sum()
# Columns that are empty in every row are left out, exactly as before.
missing = missing.loc[missing != cnt]
# Display only the columns that actually contain gaps.
missing.loc[missing != 0]

Item_Weight    1463
Outlet_Size    2410
dtype: int64

In [5]:
# Non-null count of every column, broken down by outlet.
rows_by_outlet = raw_data.groupby(by=['Outlet_Identifier'])
rows_by_outlet.count()

Unnamed: 0_level_0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
Outlet_Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
OUT010,555,555,555,555,555,555,555,0,555,555,555
OUT013,932,932,932,932,932,932,932,932,932,932,932
OUT017,926,926,926,926,926,926,926,0,926,926,926
OUT018,928,928,928,928,928,928,928,928,928,928,928
OUT019,528,0,528,528,528,528,528,528,528,528,528
OUT027,935,0,935,935,935,935,935,935,935,935,935
OUT035,930,930,930,930,930,930,930,930,930,930,930
OUT045,929,929,929,929,929,929,929,0,929,929,929
OUT046,930,930,930,930,930,930,930,930,930,930,930
OUT049,930,930,930,930,930,930,930,930,930,930,930


### Outlet size

In [6]:
# Split the rows by whether Outlet_Size is recorded.
outlet_size_is_null = raw_data['Outlet_Size'].isna()
missing = raw_data.loc[outlet_size_is_null]
present = raw_data.loc[~outlet_size_is_null]

In [7]:
# Per-outlet non-null counts, restricted to the rows without an Outlet_Size.
missing_by_outlet = missing.groupby(by=['Outlet_Identifier'])
missing_by_outlet.count()

Unnamed: 0_level_0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
Outlet_Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
OUT010,555,555,555,555,555,555,555,0,555,555,555
OUT017,926,926,926,926,926,926,926,0,926,926,926
OUT045,929,929,929,929,929,929,929,0,929,929,929


For these three outlets (OUT010, OUT017, OUT045), Outlet_Size is missing in every row, so it cannot be recovered from other records of the same outlet.

Fill the gaps with a new category, 'undefined'.

In [8]:
# Distinct Outlet_Size values before filling (NaN included).
raw_data['Outlet_Size'].unique()

array(['Medium', nan, 'High', 'Small'], dtype=object)

In [9]:
# Replace missing outlet sizes with the explicit category 'undefined'.
raw_data['Outlet_Size'] = raw_data['Outlet_Size'].fillna('undefined')

In [10]:
# Distinct Outlet_Size values after filling.
raw_data['Outlet_Size'].unique()

array(['Medium', 'undefined', 'High', 'Small'], dtype=object)

### Item weight

In [11]:
# Split the rows by whether Item_Weight is recorded.
item_weight_is_null = raw_data['Item_Weight'].isna()
missing = raw_data.loc[item_weight_is_null]
present = raw_data.loc[~item_weight_is_null]

In [12]:
# Distinct items that have at least one row without a weight.
missing['Item_Identifier'].unique()

array(['FDP10', 'DRI11', 'FDW12', ..., 'FDQ58', 'DRG13', 'NCN18'],
      dtype=object)

Look up the same item in other outlets and, where a weight is recorded there, fill the gap with the median of those weights.

In [13]:
# Fill each missing weight with the median weight recorded for the same item
# in other rows.
# `transform('median')` skips NaN and returns one value per row, aligned with
# raw_data's index; an item with no recorded weight at all gets NaN here and
# therefore stays missing.
# Only Item_Weight is written: the earlier row-wide `fillna` would have put the
# weight into any other NaN cell of the matching rows as well.
median_weight_by_item = raw_data.groupby('Item_Identifier')['Item_Weight'].transform('median')
raw_data['Item_Weight'] = raw_data['Item_Weight'].fillna(median_weight_by_item)

In [14]:
# Items whose weight is still unknown after the item-level fill.
weight_is_null = raw_data['Item_Weight'].isna()
still_missing = raw_data.loc[weight_is_null, 'Item_Identifier'].unique()
still_missing

array(['FDN52', 'FDK57', 'FDE52', 'FDQ60'], dtype=object)

Fill the remaining items, which have no recorded weight in any outlet, with the median weight of their item type.

In [15]:
# Median item weight per Item_Type, kept as a one-column frame indexed by type.
weights_with_type = raw_data[['Item_Weight', 'Item_Type']]
median_weight_by_type = weights_with_type.groupby(by='Item_Type').median()
median_weight_by_type

Unnamed: 0_level_0,Item_Weight
Item_Type,Unnamed: 1_level_1
Baking Goods,11.65
Breads,10.6
Breakfast,10.695
Canned,12.35
Dairy,13.35
Frozen Foods,12.85
Fruits and Vegetables,13.1
Hard Drinks,9.8975
Health and Hygiene,12.15
Household,13.5


In [16]:
# Fall back to the median weight of the row's Item_Type for the items listed in
# `still_missing`.
# Only Item_Weight cells that are still NaN are written; the earlier row-wide
# `fillna` would also have filled NaN in any other column of those rows.
# The median is looked up by column name rather than by position.
type_medians = median_weight_by_type['Item_Weight']
needs_fill = raw_data['Item_Identifier'].isin(still_missing) & raw_data['Item_Weight'].isna()
raw_data.loc[needs_fill, 'Item_Weight'] = raw_data.loc[needs_fill, 'Item_Type'].map(type_medians)

In [17]:
# Rows that still lack a weight.
raw_data[raw_data['Item_Weight'].isna()]

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales


In [18]:
# Print the frame summary again to check the non-null counts after filling.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                8523 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                8523 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


### The data has no missing values now

## Dealing with categorical & numerical

### Separating columns

In [20]:
# Column dtypes, sorted so that columns of the same dtype are listed together.
column_dtypes = raw_data.dtypes
column_dtypes.sort_values()

Outlet_Establishment_Year      int64
Item_Weight                  float64
Item_Visibility              float64
Item_MRP                     float64
Item_Outlet_Sales            float64
Item_Identifier               object
Item_Fat_Content              object
Item_Type                     object
Outlet_Identifier             object
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
dtype: object

### Finding categorical & encoding

In [71]:
# Print the distinct values of every object-typed column to spot inconsistent labels.
is_object_column = raw_data.dtypes == object
for column_name in is_object_column[is_object_column].index:
    print(raw_data[column_name].unique())

['FDA15' 'DRC01' 'FDN15' ... 'NCF55' 'NCW30' 'NCW05']
['Low Fat' 'Regular']
['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']
['Medium' 'undefined' 'High' 'Small']
['Tier 1' 'Tier 3' 'Tier 2']
['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']


In [70]:
# Normalise the inconsistent fat-content spellings to the two canonical labels.
# The mapping is applied to Item_Fat_Content only, so an identical string in
# another column can never be rewritten, and the column is reassigned instead
# of mutating the whole frame with inplace=True.
# NOTE(review): this cell has to run before the unique-values listing for that
# listing to show the cleaned labels — confirm the cell order.
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
raw_data['Item_Fat_Content'] = raw_data['Item_Fat_Content'].replace(fat_content_aliases)

Only Item_Fat_Content has inconsistent labels ('low fat', 'LF', 'reg'); they are normalised to 'Low Fat' and 'Regular'.

## Generating auto-profiler

In [74]:
# Build a ydata_profiling report over raw_data, titled 'Raw Data'.
prof = profiler.ProfileReport(raw_data, title='Raw Data')
# Presumably embeds the rendered report in the notebook output as an iframe — confirm against the ydata_profiling docs.
prof.to_notebook_iframe()

Summarize dataset: 100%|██████████| 46/46 [00:08<00:00,  5.72it/s, Completed]                                                   
Generate report structure: 100%|██████████| 1/1 [00:06<00:00,  6.43s/it]
Render HTML: 100%|██████████| 1/1 [00:02<00:00,  2.09s/it]


## Analysis

In [75]:
# Reorder the columns: identifiers first, then item attributes, then outlet attributes.
analysis_columns = [
    'Outlet_Identifier',
    'Item_Identifier',
    'Item_Outlet_Sales',
    'Item_Visibility',
    'Item_MRP',
    'Item_Weight',
    'Item_Type',
    'Item_Fat_Content',
    'Outlet_Establishment_Year',
    'Outlet_Type',
    'Outlet_Size',
    'Outlet_Location_Type',
]
data = raw_data[analysis_columns]

In [76]:
# Item-level view: one row per (item, outlet) pair, ordered by item identifier.
goods_columns = [
    'Item_Identifier',
    'Outlet_Identifier',
    'Item_Outlet_Sales',
    'Item_Visibility',
    'Item_MRP',
    'Item_Weight',
    'Item_Type',
    'Item_Fat_Content',
]
goods = data[goods_columns].sort_values(by='Item_Identifier')

In [77]:
# Outlet-level view: the first recorded value of each outlet attribute, indexed by outlet.
outlet_attributes = ['Outlet_Establishment_Year', 'Outlet_Type', 'Outlet_Size', 'Outlet_Location_Type']
outlets = data.groupby("Outlet_Identifier")[outlet_attributes].first()
outlets

Unnamed: 0_level_0,Outlet_Establishment_Year,Outlet_Type,Outlet_Size,Outlet_Location_Type
Outlet_Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
OUT010,1998,Grocery Store,undefined,Tier 3
OUT013,1987,Supermarket Type1,High,Tier 3
OUT017,2007,Supermarket Type1,undefined,Tier 2
OUT018,2009,Supermarket Type2,Medium,Tier 3
OUT019,1985,Grocery Store,Small,Tier 1
OUT027,1985,Supermarket Type3,Medium,Tier 3
OUT035,2004,Supermarket Type1,Small,Tier 2
OUT045,2002,Supermarket Type1,undefined,Tier 2
OUT046,1997,Supermarket Type1,Small,Tier 1
OUT049,1999,Supermarket Type1,Medium,Tier 1


In [80]:
def get_outlet_info(key):
    """Return the rows of the module-level `goods` frame that belong to one outlet.

    Parameters
    ----------
    key
        Value compared for equality against `goods['Outlet_Identifier']`,
        e.g. 'OUT049'.

    Returns
    -------
    pandas.DataFrame
        The matching rows without the `Outlet_Identifier` column, sorted by
        index.
    """
    belongs_to_outlet = goods['Outlet_Identifier'] == key
    outlet_rows = goods.loc[belongs_to_outlet]
    return outlet_rows.drop(columns=['Outlet_Identifier']).sort_index()

In [81]:
# Example: every item sold by outlet OUT049.
get_outlet_info(key='OUT049')

Unnamed: 0,Item_Identifier,Item_Outlet_Sales,Item_Visibility,Item_MRP,Item_Weight,Item_Type,Item_Fat_Content
0,FDA15,3735.1380,0.016047,249.8092,9.300,Dairy,Low Fat
2,FDN15,2097.2700,0.016760,141.6180,17.500,Meat,Low Fat
10,FDY07,1516.0266,0.000000,45.5402,11.800,Fruits and Vegetables,Low Fat
12,FDX32,1589.2646,0.100014,145.4786,15.100,Fruits and Vegetables,Regular
17,FDP49,718.3982,0.069196,54.3614,9.000,Breakfast,Regular
...,...,...,...,...,...,...,...
8451,FDK21,3254.4304,0.010028,249.6408,7.905,Snack Foods,Low Fat
8467,FDV31,3881.6140,0.000000,175.2370,9.800,Fruits and Vegetables,Low Fat
8470,FDW27,1551.3140,0.151088,155.1314,5.860,Meat,Regular
8475,NCS17,378.1744,0.080627,92.5436,18.600,Health and Hygiene,Low Fat
