In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the customer-behaviour dataset into a DataFrame.
# NOTE(review): the relative path looks like a mounted Google Drive folder in Colab —
# adjust it when running the notebook elsewhere.
data = pd.read_csv("drive/My Drive/Colab Notebooks/Customer_Behaviour.csv")

In [3]:
# Display the loaded DataFrame.
data

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [4]:
# Summarize each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [47]:
def _add_threshold_flags(features, thresholds):
  """Return a copy of `features` with the three binary indicator columns appended.

  `thresholds` maps 'high_income', 'old_age' and 'young_age' to the cut-off
  values used for the 'High Income', 'Old Age' and 'Young Age' columns.
  """
  salary = features['EstimatedSalary']
  age = features['Age']
  # assign() builds a new frame, so the caller's split is never mutated in place.
  return features.assign(**{
      'High Income': (salary >= thresholds['high_income']).astype(int),
      'Old Age': (age >= thresholds['old_age']).astype(int),
      'Young Age': (age <= thresholds['young_age']).astype(int),
  })


def preprocess_inputs(df, engineer_features=False):
  """Encode, split and standardize the customer-behaviour data.

  Steps:
    1. Drop the 'User ID' column and binary-encode 'Gender' (Female=0, Male=1).
    2. Split into train/test sets (75% train, shuffled, random_state=42).
    3. Optionally add the 'High Income', 'Old Age' and 'Young Age' indicator
       columns. Their quantile thresholds are computed from the training rows
       only, so no information from the test rows leaks into the features.
    4. Standardize every feature with a StandardScaler fitted on the training set.

  Args:
    df: DataFrame with 'User ID', 'Gender', 'Age', 'EstimatedSalary' and
      'Purchased' columns. It is not modified.
    engineer_features: When truthy, add the three indicator columns.

  Returns:
    (X_train, X_test, y_train, y_test): the scaled feature DataFrames (original
    index and column labels preserved) and the matching 'Purchased' targets.
  """
  # drop() returns a new frame, so the caller's DataFrame stays untouched.
  df = df.drop('User ID', axis=1)

  # Binary encode the gender column. Values outside the mapping are passed through
  # unchanged (as Series.replace did), without relying on replace's deprecated
  # implicit downcasting of object columns.
  gender_codes = {"Female": 0, "Male": 1}
  df['Gender'] = df['Gender'].map(lambda value: gender_codes.get(value, value))

  # Separate the target and split BEFORE deriving any data-dependent features.
  y = df['Purchased']
  X = df.drop('Purchased', axis=1)
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, train_size=0.75, shuffle=True, random_state=42)

  # Feature engineering: thresholds come from the training rows only and are then
  # applied unchanged to both splits.
  if engineer_features:
    thresholds = {
        'high_income': X_train['EstimatedSalary'].quantile(0.95),
        'old_age': X_train['Age'].quantile(0.75),
        'young_age': X_train['Age'].quantile(0.25),
    }
    X_train = _add_threshold_flags(X_train, thresholds)
    X_test = _add_threshold_flags(X_test, thresholds)

  # Scale the data with statistics learned from the training set.
  scaler = StandardScaler()
  scaler.fit(X_train)
  X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
  X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
  return X_train, X_test, y_train, y_test

In [35]:
# Baseline split: scaled features without the engineered indicator columns.
X_train, X_test, y_train, y_test = preprocess_inputs(data, engineer_features=False)

In [36]:
# Inspect the scaled training features.
X_train

Unnamed: 0,Gender,Age,EstimatedSalary
247,-0.986754,1.892589,1.521894
110,-0.986754,0.125038,0.032132
16,1.013423,0.910616,-1.311575
66,1.013423,-1.347922,-1.486841
153,-0.986754,-0.169554,-0.581299
...,...,...,...
71,-0.986754,-1.347922,-1.253153
106,-0.986754,-1.151527,-1.019465
270,-0.986754,0.517827,1.843215
348,1.013423,0.125038,0.207398


In [37]:
# Inspect the scaled test features.
X_test

Unnamed: 0,Gender,Age,EstimatedSalary
209,-0.986754,0.812419,-1.399208
280,-0.986754,2.088984,0.528719
33,-0.986754,-0.955132,-0.756565
210,-0.986754,1.008814,0.762408
93,-0.986754,-0.856935,-1.223942
...,...,...,...
314,-0.986754,0.125038,0.265820
373,1.013423,2.088984,1.755582
380,1.013423,0.419630,-0.172345
239,-0.986754,1.499800,2.135325


## Training the Model without Feature Engineering

In [38]:
# Fit a default-configured logistic regression on the baseline training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression().fit(X_train, y_train)

In [39]:
# Score the baseline model on the held-out test split.
model.score(X_test, y_test)

0.88

In [40]:
# Predicted class labels of the baseline model for the test split.
y_pred = model.predict(X_test)

In [41]:
# Show the predicted labels.
y_pred

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0])

## Training the Model with Feature Engineering

In [48]:
# Rebuild the split with the engineered indicator columns enabled.
# NOTE: this rebinds X_train/X_test/y_train/y_test from the baseline cell, so
# re-running the baseline model cells after this one would use these features instead.
X_train, X_test, y_train, y_test = preprocess_inputs(data, engineer_features=True)
X_train

Unnamed: 0,Gender,Age,EstimatedSalary,High Income,Old Age,Young Age
247,-0.986754,1.892589,1.521894,-0.229416,1.763403,-0.546536
110,-0.986754,0.125038,0.032132,-0.229416,-0.567085,-0.546536
16,1.013423,0.910616,-1.311575,-0.229416,1.763403,-0.546536
66,1.013423,-1.347922,-1.486841,-0.229416,-0.567085,1.829707
153,-0.986754,-0.169554,-0.581299,-0.229416,-0.567085,-0.546536
...,...,...,...,...,...,...
71,-0.986754,-1.347922,-1.253153,-0.229416,-0.567085,1.829707
106,-0.986754,-1.151527,-1.019465,-0.229416,-0.567085,1.829707
270,-0.986754,0.517827,1.843215,-0.229416,-0.567085,-0.546536
348,1.013423,0.125038,0.207398,-0.229416,-0.567085,-0.546536


In [49]:
# Fit a second default-configured logistic regression, this time on the
# feature-engineered training split. fit() returns the estimator itself.
model_1 = LogisticRegression().fit(X_train, y_train)

In [50]:
# Score the feature-engineered model on its held-out test split.
model_1.score(X_test, y_test)

0.93