In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant
import joblib

In [2]:
# load dataset
df = pd.read_csv("../data/raw/Mall_Customers.csv")

In [3]:
df.columns

Index(['CustomerID', 'Genre', 'Age', 'Annual Income (k$)',
       'Spending Score (1-100)'],
      dtype='object')

In [4]:
# rename columns
df = df.rename(columns={'Genre':'Sex', 'Annual Income (k$)':'Annual_Income', 'Spending Score (1-100)':'Spending_Score'})

In [5]:
df['Annual_Income'] = df['Annual_Income'] * 1000
df

Unnamed: 0,CustomerID,Sex,Age,Annual_Income,Spending_Score
0,1,Male,19,15000,39
1,2,Male,21,15000,81
2,3,Female,20,16000,6
3,4,Female,23,16000,77
4,5,Female,31,17000,40
...,...,...,...,...,...
184,185,Female,41,99000,39
185,186,Male,30,99000,97
186,187,Female,54,101000,24
187,188,Male,28,101000,68


In [6]:
# keep required columns
req_cols = ['Sex', 'Age', 'Annual_Income', 'Spending_Score'] 
df=df[req_cols]
df

Unnamed: 0,Sex,Age,Annual_Income,Spending_Score
0,Male,19,15000,39
1,Male,21,15000,81
2,Female,20,16000,6
3,Female,23,16000,77
4,Female,31,17000,40
...,...,...,...,...
184,Female,41,99000,39
185,Male,30,99000,97
186,Female,54,101000,24
187,Male,28,101000,68


In [7]:
# encode categorical feature (Sex)
le = LabelEncoder()
df.loc[:, 'Sex'] = le.fit_transform(df['Sex'].astype(str))
df

Unnamed: 0,Sex,Age,Annual_Income,Spending_Score
0,1,19,15000,39
1,1,21,15000,81
2,0,20,16000,6
3,0,23,16000,77
4,0,31,17000,40
...,...,...,...,...
184,0,41,99000,39
185,1,30,99000,97
186,0,54,101000,24
187,1,28,101000,68


In [8]:
# Convert the Columns to Float Before Scaling
cols = ['Age', 'Annual_Income', 'Spending_Score']
df.loc[:, cols] = df[cols].astype(float)

In [9]:
print(df.columns.dtype)

object


In [10]:
# scale age Annual_income, Spending_score
scaler = StandardScaler()
df.loc[:,['Age','Annual_Income', 'Spending_Score']] = scaler.fit_transform(df[['Age','Annual_Income', 'Spending_Score']])
df


 -5.62951647e-01 -1.19418484e+00 -2.82403562e-01 -1.12404782e+00
  1.75157006e+00 -6.33088668e-01  1.96198112e+00 -2.82403562e-01
  1.33074793e+00 -1.05391080e+00 -1.42129519e-01 -1.19418484e+00
 -2.82403562e-01 -1.33445888e+00  9.09925800e-01 -2.82403562e-01
 -2.82403562e-01 -9.83773775e-01  4.89103672e-01 -5.62951647e-01
  1.05019984e+00 -7.03225690e-01  4.18966651e-01 -2.82403562e-01
  6.82815445e-02 -1.12404782e+00  1.47102197e+00 -1.26432186e+00
  9.80062821e-01 -1.47473292e+00  6.99514736e-01 -1.26432186e+00
  2.08555587e-01 -6.33088668e-01 -2.12266541e-01 -1.33445888e+00
  1.82170708e+00 -1.05391080e+00  6.29377715e-01 -5.62951647e-01
  6.99514736e-01 -1.05391080e+00  7.69651758e-01 -8.43499732e-01
 -7.03225690e-01 -5.62951647e-01  6.99514736e-01 -4.22677605e-01
 -5.62951647e-01  1.40088495e+00  7.69651758e-01  5.59240694e-01
  8.39788779e-01  2.10225516e+00 -8.43499732e-01  9.80062821e-01
  2.17239218e+00 -1.40459590e+00  1.96198112e+00  1.05019984e+00
  1.68143303e+00 -1.47473

Unnamed: 0,Sex,Age,Annual_Income,Spending_Score
0,1,-1.404596,-1.863536,-0.436681
1,1,-1.264322,-1.863536,1.217912
2,0,-1.334459,-1.819377,-1.736718
3,0,-1.124048,-1.819377,1.060332
4,0,-0.562952,-1.775219,-0.397286
...,...,...,...,...
184,0,0.138419,1.845779,-0.436681
185,1,-0.633089,1.845779,1.848233
186,0,1.050200,1.934096,-1.027607
187,1,-0.773363,1.934096,0.705776


In [11]:
df_subset = df.iloc[:, 1:]
df_const = add_constant(df_subset)

In [12]:
# check for multicollinearity
vif_data = pd.DataFrame({
    'Feature': df_const.columns,
    'VIF': [variance_inflation_factor(df_const.values, i)
            for i in range(df_const.shape[1])]
})
print(vif_data.sort_values('VIF', ascending=False))

          Feature       VIF
1             Age  1.126048
3  Spending_Score  1.125651
2   Annual_Income  1.000477
0           const  1.000000


In [13]:
np.savez(
    "../data/processed/processed_data.npz",
    X_train= df.to_numpy(dtype=np.float64),
    feature_names= df.columns.to_numpy()
)

In [14]:
# save scaler object
joblib.dump(scaler, "../data/processed/scaler.joblib")

['../data/processed/scaler.joblib']