### Logestic Regression Model Building

#### Aim of the Project :
#### To build a Logistic Regression model to predict whether a user will purchase a product (Purchased) based on their demographic information, including gender, age, and estimated salary.

In [1]:
# importing requires liberaries
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('User_Data.csv')
dataset.head(11)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
5,15728773,Male,27,58000,0
6,15598044,Female,27,84000,0
7,15694829,Female,32,150000,1
8,15600575,Male,25,33000,0
9,15727311,Female,35,65000,0


In [3]:
# checking null values 
dataset.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [5]:
dataset.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [6]:
dataset['Purchased'].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

In [7]:
# here you can see most of the customer did't buy our product

In [8]:
# divide data into dependent(y) and independent(x)

x= dataset.iloc[:,[2,3]].values   # add values to make it an array
y= dataset.iloc[:,-1].values

In [9]:
# now splitting of data into training and testing
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=42)

In [10]:
x_train

array([[    57, 122000],
       [    39,  71000],
       [    47,  25000],
       [    24,  19000],
       [    36,  50000],
       [    32, 150000],
       [    48,  29000],
       [    30, 107000],
       [    60,  34000],
       [    38,  61000],
       [    33,  31000],
       [    39,  71000],
       [    55,  39000],
       [    49,  39000],
       [    43, 112000],
       [    27,  20000],
       [    26,  17000],
       [    37,  93000],
       [    42,  54000],
       [    35,  61000],
       [    29,  75000],
       [    38,  80000],
       [    45,  26000],
       [    54, 108000],
       [    46,  23000],
       [    23,  28000],
       [    37,  75000],
       [    42,  65000],
       [    35,  71000],
       [    51, 146000],
       [    39,  96000],
       [    24,  89000],
       [    58,  95000],
       [    25,  22000],
       [    41,  59000],
       [    28,  89000],
       [    42,  80000],
       [    42, 108000],
       [    46,  96000],
       [    47, 113000],


In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
SC = StandardScaler()

In [13]:
x_train = SC.fit_transform(x_train)

In [14]:
x_test = SC.fit_transform(x_test)

In [15]:
x_test

array([[ 0.75872215, -1.41078892],
       [ 1.9109871 ,  0.5590031 ],
       [-0.83672162, -0.75419158],
       [ 0.93599368,  0.79776576],
       [-0.74808586, -1.23171691],
       [-0.65945009, -0.21697557],
       [ 0.84735791,  1.12606443],
       [-0.74808586,  0.40977643],
       [ 0.22690756,  0.17101376],
       [ 0.40417909, -0.12743957],
       [-0.21627127, -0.12743957],
       [ 1.37917251, -1.05264491],
       [-1.27990045, -0.63481024],
       [-1.54580774, -1.38094358],
       [-0.65945009,  0.52915776],
       [-0.21627127,  1.15590977],
       [ 1.29053674, -0.93326358],
       [ 0.75872215,  0.14116843],
       [ 0.13827179, -0.81388224],
       [ 1.6450798 , -0.27666624],
       [-1.36853621, -1.26156225],
       [-0.74808586,  0.32024043],
       [ 0.84735791, -1.38094358],
       [ 1.9109871 ,  0.20085909],
       [-1.63444351, -1.50032492],
       [ 1.20190098, -1.38094358],
       [ 0.40417909,  0.32024043],
       [-0.03899974, -0.48558358],
       [ 1.55644404,

In [16]:
x_train = pd.DataFrame(x_train,columns =['Age','Salary']) # converting into dataframe
x_test = pd.DataFrame(x_test,columns =['Age','Salary'])

In [17]:
x_train, x_test

(          Age    Salary
 0    1.892589  1.521894
 1    0.125038  0.032132
 2    0.910616 -1.311575
 3   -1.347922 -1.486841
 4   -0.169554 -0.581299
 ..        ...       ...
 295 -1.347922 -1.253153
 296 -1.151527 -1.019465
 297  0.517827  1.843215
 298  0.125038  0.207398
 299 -0.562343  0.470297
 
 [300 rows x 2 columns],
          Age    Salary
 0   0.758722 -1.410789
 1   1.910987  0.559003
 2  -0.836722 -0.754192
 3   0.935994  0.797766
 4  -0.748086 -1.231717
 ..       ...       ...
 95  0.138272  0.290395
 96  1.910987  1.812507
 97  0.404179 -0.157285
 98  1.379173  2.200496
 99 -0.304907  1.275291
 
 [100 rows x 2 columns])

In [18]:
# training Logistic Regression model
from sklearn.linear_model import LogisticRegression

In [19]:
LR = LogisticRegression()

In [20]:
LR.fit(x_train,y_train)

In [21]:
y_pred = LR.predict(x_test)

In [22]:
y_pred

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0], dtype=int64)

In [23]:
y_test

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1], dtype=int64)

In [24]:
# Confusion matrix
from sklearn.metrics import confusion_matrix

In [25]:
CM = confusion_matrix(y_pred,y_test)
CM

array([[61, 12],
       [ 2, 25]], dtype=int64)

In [26]:
# performance measure => accuracy
from sklearn.metrics import accuracy_score,classification_report
AS = accuracy_score(y_pred,y_test)
print('The accuracy of our Logistic Regression Model is',AS)

The accuracy of our Logistic Regression Model is 0.86


In [27]:
# Classification Report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.97      0.84      0.90        73
           1       0.68      0.93      0.78        27

    accuracy                           0.86       100
   macro avg       0.82      0.88      0.84       100
weighted avg       0.89      0.86      0.87       100

