## Data Cleaning

### Importing Libraries

In [149]:
import warnings
# Suppress FutureWarning messages so they do not clutter the cell outputs below.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [151]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pylab as plt
from sklearn.preprocessing import StandardScaler
import joblib

### Importing Data

In [190]:
# Load the raw customer data; the first CSV column becomes the row index.
df = pd.read_csv(
    '../data/customer_segmentation_data.csv',
    index_col=0,
)

### Data Cleaning

#### Missing Data

In [192]:
# Number of missing values in each column (`isna` is the canonical name of `isnull`).
df.isna().sum()

age                     0
gender                  0
income                  0
spending_score          0
membership_years        0
purchase_frequency      0
preferred_category      0
last_purchase_amount    0
dtype: int64

In [171]:
# As found in the exploratory data analysis, the dataset has no missing values.

#### Duplicated Data

In [194]:
# Count rows whose column values repeat an earlier row (first occurrence is not flagged).
df.duplicated(keep='first').sum()

0

In [175]:
# There are no duplicated customer records (rows are compared on their column values; the `id` index is not part of the comparison).

#### Scaling Data

In [196]:
# Standardise the numeric features with StandardScaler and store the results
# in new 'Scaled<name>' columns next to the originals.
numeric_cols = [
    'age',
    'income',
    'spending_score',
    'purchase_frequency',
    'last_purchase_amount',
    'membership_years',
]
# Derive the output names from the inputs so both lists always stay aligned.
scaled_cols = ['Scaled' + col for col in numeric_cols]

scaler = StandardScaler()
df[scaled_cols] = scaler.fit_transform(df[numeric_cols])

In [198]:
# Remove the raw numeric columns now that their 'Scaled*' counterparts exist.
# The result is reassigned instead of using inplace=True, so the frame is not
# mutated behind the back of anything that still references the old object.
df = df.drop(columns=['age', 'income', 'spending_score', 'purchase_frequency',
                      'last_purchase_amount', 'membership_years'])

In [200]:
# Preview the first rows: only the categorical and 'Scaled*' columns should remain.
df.head(n=5)

Unnamed: 0_level_0,gender,preferred_category,Scaledage,Scaledincome,Scaledspending_score,Scaledpurchase_frequency,Scaledlast_purchase_amount,Scaledmembership_years
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Female,Groceries,-0.384644,0.316868,1.358468,-0.182348,-1.28154,-0.86501
2,Female,Sports,-1.515362,-0.282016,0.321865,1.082005,-1.523763,-1.215358
3,Female,Clothing,1.078639,1.112778,-0.714738,0.09862,-0.230005,-1.215358
4,Other,Home & Garden,-0.251618,-1.210096,0.805613,-1.516943,1.69008,1.23708
5,Female,Electronics,1.411203,1.523374,-1.025718,-0.112106,-0.491443,-0.86501


In [202]:
# Persist the fitted scaler so the same transformation can be reapplied later.
# joblib selects gzip compression from the '.gz' file extension.
scaler_path = '../assets/scaler.gz'
joblib.dump(scaler, scaler_path)

['../assets/scaler.gz']

#### Converting Categorical Variables

In [204]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each variable, so that level is represented by all of its indicators being
# False (in the preview below: gender 'Female' and category 'Clothing').
categorical_cols = ['gender', 'preferred_category']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [206]:
# Preview the encoded frame: the categorical columns are now boolean indicators.
df.head(n=5)

Unnamed: 0_level_0,Scaledage,Scaledincome,Scaledspending_score,Scaledpurchase_frequency,Scaledlast_purchase_amount,Scaledmembership_years,gender_Male,gender_Other,preferred_category_Electronics,preferred_category_Groceries,preferred_category_Home & Garden,preferred_category_Sports
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,-0.384644,0.316868,1.358468,-0.182348,-1.28154,-0.86501,False,False,False,True,False,False
2,-1.515362,-0.282016,0.321865,1.082005,-1.523763,-1.215358,False,False,False,False,False,True
3,1.078639,1.112778,-0.714738,0.09862,-0.230005,-1.215358,False,False,False,False,False,False
4,-0.251618,-1.210096,0.805613,-1.516943,1.69008,1.23708,False,True,False,False,True,False
5,1.411203,1.523374,-1.025718,-0.112106,-0.491443,-0.86501,False,False,True,False,False,False


### Saving Cleaned Data

In [208]:
# Write the cleaned feature table to disk.
# NOTE(review): index=False leaves the `id` index out of the file - confirm that
# downstream steps do not need it to map rows back to customers.
df.to_csv(
    '../assets/cleaned_data.csv',
    index=False,
)