## Data Cleaning

### Importing Libraries

In [5]:
import warnings
# Silence FutureWarning messages for the rest of the notebook session.
warnings.simplefilter('ignore', FutureWarning)

In [147]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pylab as plt
from sklearn.preprocessing import MinMaxScaler 
import joblib

### Importing Data

In [101]:
# Load the raw customer segmentation data into a DataFrame.
raw_data_path = '../data/customer_segmentation_data.csv'
df = pd.read_csv(raw_data_path)

### Data Cleaning

#### Drop ID Column

In [103]:
# Remove the 'id' column from the frame in place.
df.drop(columns=['id'], inplace=True)

#### Missing Data

In [105]:
# Count the missing values in each column.
df.isna().sum()

age                     0
gender                  0
income                  0
spending_score          0
membership_years        0
purchase_frequency      0
preferred_category      0
last_purchase_amount    0
dtype: int64

In [107]:
# As discovered in the exploratory data analysis, there is no missing data.

#### Duplicated Data

In [110]:
# Count the rows that exactly repeat an earlier row.
df.duplicated(keep='first').sum()

0

In [112]:
# There are no duplicated customer records.

#### Scaling Data

In [115]:
# Rescale the continuous features with a min-max scaler and store the
# results in new columns named 'Scaled<original name>'.
raw_features = [
    'age',
    'income',
    'spending_score',
    'purchase_frequency',
    'last_purchase_amount',
]
scaled_features = ['Scaled' + name for name in raw_features]

scaler = MinMaxScaler()
df[scaled_features] = scaler.fit_transform(df[raw_features])

In [117]:
# The unscaled originals are no longer needed once the 'Scaled*' columns exist.
df.drop(
    columns=[
        'age',
        'income',
        'spending_score',
        'purchase_frequency',
        'last_purchase_amount',
    ],
    inplace=True,
)

In [119]:
# Preview the first five rows to check the scaled columns.
df.head(n=5)

Unnamed: 0,gender,membership_years,preferred_category,Scaledage,Scaledincome,Scaledspending_score,Scaledpurchase_frequency,Scaledlast_purchase_amount
0,Female,3,Groceries,0.392157,0.577966,0.89899,0.469388,0.104241
1,Female,2,Sports,0.058824,0.407172,0.59596,0.836735,0.03187
2,Female,2,Clothing,0.823529,0.80495,0.292929,0.55102,0.41842
3,Other,9,Home & Garden,0.431373,0.142495,0.737374,0.081633,0.992106
4,Female,3,Electronics,0.921569,0.922047,0.20202,0.489796,0.340308


In [145]:
# Persist the fitted scaler to disk.
scaler_path = '../assets/scaler.gz'
joblib.dump(scaler, scaler_path)

['../assets/scaler.gz']

#### Converting Categorical Variables

In [121]:
def cat_var(data_frame, column):
    """One-hot encode one or more columns, dropping the first level of each.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Source frame. It is not modified; a new frame is returned.
    column : label or list of labels
        The column, or columns, to replace with dummy indicators.

    Returns
    -------
    pandas.DataFrame
        The columns that were not encoded, in their original order,
        followed by the dummy columns. Each dummy column is named
        ``<column>_<level>``.
    """
    # Accept a single label as well as a list of labels.
    if pd.api.types.is_list_like(column):
        columns = list(column)
    else:
        columns = [column]

    # Passing ``columns=`` makes pandas encode every requested column,
    # including numeric ones, and always prefixes the dummy names with the
    # source column name. Without it, numeric columns in a list are left
    # unencoded and a single Series yields bare level values as names.
    return pd.get_dummies(data_frame, columns=columns, drop_first=True)

In [123]:
# One-hot encode the gender and preferred_category columns.
df = cat_var(data_frame=df, column=['gender', 'preferred_category'])

In [125]:
# Preview the first five rows to check the new dummy columns.
df.head(n=5)

Unnamed: 0,membership_years,Scaledage,Scaledincome,Scaledspending_score,Scaledpurchase_frequency,Scaledlast_purchase_amount,gender_Male,gender_Other,preferred_category_Electronics,preferred_category_Groceries,preferred_category_Home & Garden,preferred_category_Sports
0,3,0.392157,0.577966,0.89899,0.469388,0.104241,False,False,False,True,False,False
1,2,0.058824,0.407172,0.59596,0.836735,0.03187,False,False,False,False,False,True
2,2,0.823529,0.80495,0.292929,0.55102,0.41842,False,False,False,False,False,False
3,9,0.431373,0.142495,0.737374,0.081633,0.992106,False,True,False,False,True,False
4,3,0.921569,0.922047,0.20202,0.489796,0.340308,False,False,True,False,False,False


In [131]:
# One-hot encode membership_years, dropping its first level.
# NOTE(review): encoding a Series gives unprefixed dummy columns labelled by
# the level values themselves (integers here) - confirm downstream code
# expects those labels.
sub_dummies = pd.get_dummies(df['membership_years'], drop_first=True)

In [135]:
# Swap the original membership_years column for its dummy columns.
remaining = df.drop(columns='membership_years')
df = pd.concat([remaining, sub_dummies], axis=1)

In [137]:
# Preview the first five rows of the fully encoded frame.
df.head(n=5)

Unnamed: 0,Scaledage,Scaledincome,Scaledspending_score,Scaledpurchase_frequency,Scaledlast_purchase_amount,gender_Male,gender_Other,preferred_category_Electronics,preferred_category_Groceries,preferred_category_Home & Garden,preferred_category_Sports,2,3,4,5,6,7,8,9,10
0,0.392157,0.577966,0.89899,0.469388,0.104241,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False
1,0.058824,0.407172,0.59596,0.836735,0.03187,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False
2,0.823529,0.80495,0.292929,0.55102,0.41842,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3,0.431373,0.142495,0.737374,0.081633,0.992106,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False
4,0.921569,0.922047,0.20202,0.489796,0.340308,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False


### Saving Cleaned Data

In [139]:
# Write the cleaned frame to CSV without the index column.
cleaned_data_path = '../assets/cleaned_data.csv'
df.to_csv(cleaned_data_path, index=False)