In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline   
import seaborn as sns

In [2]:
df = pd.read_csv("Mall_Customers.csv")

In [3]:
df.head()


Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19.0,15.0,39.0
1,2,Male,21.0,15.0,81.0
2,3,Female,20.0,16.0,6.0
3,4,Female,23.0,16.0,77.0
4,5,Female,31.0,17.0,40.0


# Checking for MIssing Value

In [4]:
features_with_na= [features for features in df.columns if df[features].isnull().sum()>1]
print(features_with_na)

['Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']


In [5]:
#to find the percentage of missing value in each feature 
for feature in features_with_na:
    print(feature,np.round(df[feature].isnull().mean(),4), '% missing value')
    

Gender 0.0098 % missing value
Age 0.0195 % missing value
Annual Income (k$) 0.0195 % missing value
Spending Score (1-100) 0.0146 % missing value


In [6]:
# Impute missing values for numerical features
for feature in ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']:
    df[feature].fillna(df[feature].mean(), inplace=True)

# Impute missing values for categorical feature
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)

# Confirm that there are no more missing values
print(df.isnull().sum())

CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64


In [7]:
df.tail()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
200,201,Female,38.850746,123.0,22.0
201,202,Male,39.0,60.870647,56.0
202,203,Female,38.850746,60.870647,50.089109
203,204,Female,38.850746,60.870647,50.089109
204,205,Female,38.850746,60.870647,50.089109


# One Hot Encoding 

In [8]:
df = pd.get_dummies(df, columns=['Gender'], drop_first=True)

In [9]:
df.head()

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Gender_Male
0,1,19.0,15.0,39.0,True
1,2,21.0,15.0,81.0,True
2,3,20.0,16.0,6.0,False
3,4,23.0,16.0,77.0,False
4,5,31.0,17.0,40.0,False


In [10]:
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()

# Normalize numerical variables
df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']] = scaler.fit_transform(df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']])


In [11]:
df.head()

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Gender_Male
0,1,0.019231,0.0,0.387755,True
1,2,0.057692,0.0,0.816327,True
2,3,0.038462,0.008197,0.05102,False
3,4,0.096154,0.008197,0.77551,False
4,5,0.25,0.016393,0.397959,False


In [12]:

target_variable = 'Spending Score (1-100)'

# Extract features (independent variables)
features = df.drop(columns=[target_variable])

# Extract target variable (dependent variable)
target = df[target_variable]


In [13]:
features

Unnamed: 0,CustomerID,Age,Annual Income (k$),Gender_Male
0,1,0.019231,0.000000,True
1,2,0.057692,0.000000,True
2,3,0.038462,0.008197,False
3,4,0.096154,0.008197,False
4,5,0.250000,0.016393,False
...,...,...,...,...
200,201,0.400976,0.885246,False
201,202,0.403846,0.375989,True
202,203,0.400976,0.375989,False
203,204,0.400976,0.375989,False


In [14]:
target

0      0.387755
1      0.816327
2      0.051020
3      0.775510
4      0.397959
         ...   
200    0.214286
201    0.561224
202    0.500909
203    0.500909
204    0.500909
Name: Spending Score (1-100), Length: 205, dtype: float64

In [15]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df.iloc[:,1:],df.iloc[:,0],test_size=0.2,random_state=9)

In [16]:
X_train.head()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Gender_Male
111,0.019231,0.393443,0.540816,False
93,0.423077,0.368852,0.397959,False
148,0.307692,0.516393,0.214286,False
21,0.134615,0.07377,0.734694,True
28,0.423077,0.114754,0.306122,False


In [17]:
X_test.head()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Gender_Male
172,0.346154,0.590164,0.091837,True
89,0.615385,0.352459,0.459184,False
55,0.557692,0.229508,0.408163,True
73,0.807692,0.286885,0.561224,False
158,0.307692,0.516393,0.0,True
