# Black Friday Dataset

#### Cleaning and preparing the data for model training

In [1]:
# Basic Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### Customer Purchase Behavior Analysis for **ABC Private Limited**

**Context:**

A retail company **<span style="color:red;">“ABC Private Limited”</span>** wants to understand the <span style="color:red;">**customer purchase behavior**</span> (specifically, <span style="color:red;">**purchase amount**</span>) against various products of different categories. They have shared the <span style="color:red;">**purchase summary**</span> of various customers for selected <span style="color:red;">**high volume products**</span> from last month. The dataset also contains:

- <span style="color:red;">**Customer Demographics**</span>:
  - <span style="color:black;">**Age**</span>
  - <span style="color:black;">**Gender**</span>
  - <span style="color:black;">**Marital Status**</span>
  - <span style="color:black;">**City Type**</span>
  - <span style="color:black;">**Stay in Current City**</span>

- <span style="color:black;">**Product Details**</span>:
  - <span style="color:black;">**Product ID**</span>
  - <span style="color:black;">**Product Category**</span>

- <span style="color:red;">**Total Purchase Amount**</span> from last month

**Objective:**

**<span style="color:red;">ABC Private Limited</span>** aims to build a <span style="color:red;">**model**</span> to <span style="color:red;">**predict the purchase amount**</span> of customers against various products. This model will help them to create <span style="color:red;">**personalized offers**</span> for customers against different products.


### Analysis Goals

- Understand the <span style="color:red;">**customer purchase behavior**</span>
- Identify patterns in <span style="color:red;">**purchase amount**</span> based on demographics and product details
- Build a predictive model to forecast <span style="color:red;">**purchase amount**</span>
- Develop <span style="color:red;">**personalized offers**</span> for customers


In [2]:
# Import our 1st DF

df_train = pd.read_csv('../Data/Black Friday/train.csv')
df_train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [3]:
# Import our 2nd DF

df_test = pd.read_csv('../Data/Black Friday/test.csv')
df_test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [4]:
df = pd.merge(df_test, df_train, how='left')

In [5]:
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0,


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233599 entries, 0 to 233598
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     233599 non-null  int64  
 1   Product_ID                  233599 non-null  object 
 2   Gender                      233599 non-null  object 
 3   Age                         233599 non-null  object 
 4   Occupation                  233599 non-null  int64  
 5   City_Category               233599 non-null  object 
 6   Stay_In_Current_City_Years  233599 non-null  object 
 7   Marital_Status              233599 non-null  int64  
 8   Product_Category_1          233599 non-null  int64  
 9   Product_Category_2          161255 non-null  float64
 10  Product_Category_3          71037 non-null   float64
 11  Purchase                    0 non-null       float64
dtypes: float64(3), int64(4), object(5)
memory usage: 21.4+ MB


In [7]:
df.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,233599.0,233599.0,233599.0,233599.0,161255.0,71037.0,0.0
mean,1003029.0,8.085407,0.41007,5.276542,9.849586,12.669454,
std,1726.505,6.521146,0.491847,3.73638,5.094943,4.125944,
min,1000001.0,0.0,0.0,1.0,2.0,3.0,
25%,1001527.0,2.0,0.0,1.0,5.0,9.0,
50%,1003070.0,7.0,0.0,5.0,9.0,14.0,
75%,1004477.0,14.0,1.0,8.0,15.0,16.0,
max,1006040.0,20.0,1.0,18.0,18.0,18.0,


In [8]:
# Drop user `User_ID` it's of no use

df.drop(['User_ID'], axis=1, inplace=True)

In [9]:
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00128942,M,46-50,7,B,2,1,1,11.0,,
1,P00113442,M,26-35,17,C,0,0,3,5.0,,
2,P00288442,F,36-45,1,B,4+,1,5,14.0,,
3,P00145342,F,36-45,1,B,4+,1,4,9.0,,
4,P00053842,F,26-35,1,C,1,0,4,5.0,12.0,


In [10]:
# Here Male, Female are in (Alphabets) we have to covert them into numbers.

In [11]:
# HAndling categorical feature Gender

df['Gender'] = df['Gender'].map({'F':0,'M':1})
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00128942,1,46-50,7,B,2,1,1,11.0,,
1,P00113442,1,26-35,17,C,0,0,3,5.0,,
2,P00288442,0,36-45,1,B,4+,1,5,14.0,,
3,P00145342,0,36-45,1,B,4+,1,4,9.0,,
4,P00053842,0,26-35,1,C,1,0,4,5.0,12.0,


In [12]:
df['Age'].unique()

array(['46-50', '26-35', '36-45', '18-25', '51-55', '55+', '0-17'],
      dtype=object)

In [13]:
# Coverted Age to numerical
# 1st Technique

df['Age']=df['Age'].map({'0-17':1,'18-25':2,'26-35':3,'36-45':4,'46-50':5,'51-55':6,'55+':7})

In [14]:
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00128942,1,5,7,B,2,1,1,11.0,,
1,P00113442,1,3,17,C,0,0,3,5.0,,
2,P00288442,0,4,1,B,4+,1,5,14.0,,
3,P00145342,0,4,1,B,4+,1,4,9.0,,
4,P00053842,0,3,1,C,1,0,4,5.0,12.0,


In [15]:
##second technqiue

from sklearn import preprocessing
 
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
 
# Encode labels in column 'species'.
df['Age']= label_encoder.fit_transform(df['Age'])
 
df['Age'].unique()

array([4, 2, 3, 1, 5, 6, 0])

In [16]:
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00128942,1,4,7,B,2,1,1,11.0,,
1,P00113442,1,2,17,C,0,0,3,5.0,,
2,P00288442,0,3,1,B,4+,1,5,14.0,,
3,P00145342,0,3,1,B,4+,1,4,9.0,,
4,P00053842,0,2,1,C,1,0,4,5.0,12.0,


In [19]:
df_city = pd.get_dummies(df['City_Category'], drop_first=True)

In [20]:
df_city.head()

Unnamed: 0,B,C
0,True,False
1,False,True
2,True,False
3,True,False
4,False,True
