# Logistic Regression Model for Classification Problem

### Selling Potential of Properties based on Past Data

### Step - 1: Loading Different Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sn
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set()

### Step - 2: Loading Preprossed Data

In [2]:
df = pd.read_csv('Data for Logistic Regression Model.csv')
df.head()

Unnamed: 0,price,resid_area,air_qual,room_num,age,teachers,poor_prop,n_hos_beds,n_hot_rooms,rainfall,parks,Sold,avg_dist,airport_YES,waterbody_Lake,waterbody_Lake and River,waterbody_River
0,24.0,32.31,0.538,6.575,65.2,24.7,4.98,5.48,11.192,23,0.049347,0,4.0875,1,0,0,1
1,21.6,37.07,0.469,6.421,78.9,22.2,9.14,7.332,12.1728,42,0.046146,1,4.9675,0,1,0,0
2,34.7,37.07,0.469,7.185,61.1,22.2,4.03,7.394,46.19856,38,0.045764,0,4.9675,0,0,0,0
3,33.4,32.18,0.458,6.998,45.8,21.3,2.94,9.268,11.2672,45,0.047151,0,6.065,1,1,0,0
4,36.2,32.18,0.458,7.147,54.2,21.3,5.33,8.824,11.2896,55,0.039474,0,6.0625,0,1,0,0


### Step - 3: Create Logistic Regression Model using Single Predictor

#### Method - 1: Using SKLEARN Library

In [3]:
# Here, Price is an independent variable
X = df[['price']]

In [4]:
# Here, Sold is a dependent variable
y = df['Sold']

In [5]:
# Checking data of an independent variable
X.head()

Unnamed: 0,price
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [6]:
# Checking data of a dependent variable
y.head()

0    0
1    1
2    0
3    0
4    0
Name: Sold, dtype: int64

In [7]:
# Importing Logistic Regression Model from Sklearn Library
from sklearn.linear_model import LogisticRegression

In [8]:
# Defining the variable for Logistic Regression Model
clf_lrs = LogisticRegression()

In [9]:
# Fit the model to constant
clf_lrs.fit(X, y)

#### Equation of Logistic Regression = (e^(B0+B1*x))/(1+e^(B0+B1*x))

In [10]:
# To know the value of Beta - 1 (B1)
clf_lrs.coef_

array([[-0.03571865]])

In [11]:
# To know the value of Beta - 0 (B0)
clf_lrs.intercept_

array([0.61477516])

#### Method - 2: Using STATSMODEL

#### STATSMODEL has Beta - 0 (B0) term equals to Zero by default

In [12]:
# Add BO as a constant term
X_cons = sn.add_constant(X)

In [13]:
# Check the constant values
X_cons.head()

Unnamed: 0,const,price
0,1.0,24.0
1,1.0,21.6
2,1.0,34.7
3,1.0,33.4
4,1.0,36.2


In [14]:
import statsmodels.discrete.discrete_model as sm

In [15]:
# Fit the Model
logit = sm.Logit(y, X_cons).fit()

Optimization terminated successfully.
         Current function value: 0.676690
         Iterations 5


In [16]:
# To see the summary of the model
logit.summary()

0,1,2,3
Dep. Variable:,Sold,No. Observations:,506.0
Model:,Logit,Df Residuals:,504.0
Method:,MLE,Df Model:,1.0
Date:,"Fri, 21 Oct 2022",Pseudo R-squ.:,0.01788
Time:,12:54:54,Log-Likelihood:,-342.41
converged:,True,LL-Null:,-348.64
Covariance Type:,nonrobust,LLR p-value:,0.0004142

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.6149,0.248,2.484,0.013,0.130,1.100
price,-0.0357,0.010,-3.417,0.001,-0.056,-0.015


In [17]:
# From above summary, we have B0 = 0.6149, and B1 = -0.0357
# p-value should be less than 0.05 (threshold) to establish relationship

### Step - 4: Create Logistic Regression Model using Multiple Predictors

#### Method - 1: Using SKLEARN Library

In [18]:
# Here, multiple predictors other than Sold are an independent variables
Xm = df.drop(['Sold'], axis = 1)

In [19]:
# Here, Sold is a dependent variable
ym = df['Sold']

In [20]:
# Checking data of an independent variables
Xm.head()

Unnamed: 0,price,resid_area,air_qual,room_num,age,teachers,poor_prop,n_hos_beds,n_hot_rooms,rainfall,parks,avg_dist,airport_YES,waterbody_Lake,waterbody_Lake and River,waterbody_River
0,24.0,32.31,0.538,6.575,65.2,24.7,4.98,5.48,11.192,23,0.049347,4.0875,1,0,0,1
1,21.6,37.07,0.469,6.421,78.9,22.2,9.14,7.332,12.1728,42,0.046146,4.9675,0,1,0,0
2,34.7,37.07,0.469,7.185,61.1,22.2,4.03,7.394,46.19856,38,0.045764,4.9675,0,0,0,0
3,33.4,32.18,0.458,6.998,45.8,21.3,2.94,9.268,11.2672,45,0.047151,6.065,1,1,0,0
4,36.2,32.18,0.458,7.147,54.2,21.3,5.33,8.824,11.2896,55,0.039474,6.0625,0,1,0,0


In [21]:
# Checking data of a dependent variable
ym.head()

0    0
1    1
2    0
3    0
4    0
Name: Sold, dtype: int64

In [22]:
# Defining the variable for Logistic Regression Model
clf_lrs_m = LogisticRegression()

In [23]:
# Fit the model to constant
clf_lrs_m.fit(Xm, ym)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# To know the value of Beta values other than B0
clf_lrs_m.coef_

array([[-0.24635721, -0.01729754, -0.11019466,  0.86261003, -0.00599865,
         0.22819592, -0.21021844,  0.18006637, -0.09086589, -0.00704699,
        -0.00499206, -0.32611305, -0.10521309, -0.09487418, -0.01512375,
         0.20180478]])

In [25]:
# To know the value of Beta - 0 (B0)
clf_lrs_m.intercept_

array([0.01671291])

#### Method - 2: Using STATSMODEL

In [26]:
# Add BO as a constant term
X_cons_m = sn.add_constant(Xm)

In [27]:
# Check the constant values
X_cons_m.head()

Unnamed: 0,const,price,resid_area,air_qual,room_num,age,teachers,poor_prop,n_hos_beds,n_hot_rooms,rainfall,parks,avg_dist,airport_YES,waterbody_Lake,waterbody_Lake and River,waterbody_River
0,1.0,24.0,32.31,0.538,6.575,65.2,24.7,4.98,5.48,11.192,23,0.049347,4.0875,1,0,0,1
1,1.0,21.6,37.07,0.469,6.421,78.9,22.2,9.14,7.332,12.1728,42,0.046146,4.9675,0,1,0,0
2,1.0,34.7,37.07,0.469,7.185,61.1,22.2,4.03,7.394,46.19856,38,0.045764,4.9675,0,0,0,0
3,1.0,33.4,32.18,0.458,6.998,45.8,21.3,2.94,9.268,11.2672,45,0.047151,6.065,1,1,0,0
4,1.0,36.2,32.18,0.458,7.147,54.2,21.3,5.33,8.824,11.2896,55,0.039474,6.0625,0,1,0,0


In [28]:
# Fit the Model
logit_m = sm.Logit(ym, X_cons_m).fit()

Optimization terminated successfully.
         Current function value: 0.556433
         Iterations 6


In [29]:
# To see the summary of the model
logit_m.summary()

0,1,2,3
Dep. Variable:,Sold,No. Observations:,506.0
Model:,Logit,Df Residuals:,489.0
Method:,MLE,Df Model:,16.0
Date:,"Fri, 21 Oct 2022",Pseudo R-squ.:,0.1924
Time:,12:54:55,Log-Likelihood:,-281.56
converged:,True,LL-Null:,-348.64
Covariance Type:,nonrobust,LLR p-value:,9.93e-21

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.1383,2.649,-0.807,0.420,-7.331,3.054
price,-0.2741,0.033,-8.313,0.000,-0.339,-0.209
resid_area,0.0192,0.027,0.720,0.471,-0.033,0.071
air_qual,-7.4183,2.691,-2.757,0.006,-12.693,-2.144
room_num,1.1067,0.277,4.001,0.000,0.565,1.649
age,-0.0020,0.007,-0.302,0.762,-0.015,0.011
teachers,0.3150,0.064,4.937,0.000,0.190,0.440
poor_prop,-0.2077,0.034,-6.149,0.000,-0.274,-0.141
n_hos_beds,0.1760,0.071,2.467,0.014,0.036,0.316


### Step - 5: Predicting and Creating Confusion Matrix

In [30]:
clf_lrs_m.predict_proba(Xm)

array([[0.12706722, 0.87293278],
       [0.39754993, 0.60245007],
       [0.98071817, 0.01928183],
       ...,
       [0.28594214, 0.71405786],
       [0.28060984, 0.71939016],
       [0.16347215, 0.83652785]])

In [31]:
y_pred = clf_lrs_m.predict(Xm)
y_pred

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,

In [32]:
y_pred_03 = (clf_lrs_m.predict_proba(Xm)[:,1] >= 0.3)
y_pred_03

array([ True,  True, False,  True, False, False,  True, False, False,
        True,  True,  True, False,  True,  True,  True,  True, False,
        True,  True, False,  True, False, False,  True,  True,  True,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False, False,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True, False,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
        True,  True,  True,  True,  True,  True, False, False, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False, False, False,  True,
       False, False,

In [33]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ym, y_pred)

array([[196,  80],
       [ 81, 149]], dtype=int64)

In [34]:
# From above matrix,
# 196 stands for Predicted Value: Sold, and Actual Value: Sold (True Positives)
# 80 stands for Predicted Value: Not sold, and Actual Value: Sold (False Positives)
# 81 stands for Predicted Value: Sold, and Actual Value: Not Sold (False Negatives)
# 149 stands for Predicted Value: Not Sold, and Actual Value: Not Sold (True Negatives)

In [35]:
confusion_matrix(ym, y_pred_03)

array([[121, 155],
       [ 18, 212]], dtype=int64)

In [36]:
# From above matrix,
# 121 stands for Predicted Value: Sold, and Actual Value: Sold (True Positives)
# 155 stands for Predicted Value: Not sold, and Actual Value: Sold (False Positives)
# 18 stands for Predicted Value: Sold, and Actual Value: Not Sold (False Negatives)
# 212 stands for Predicted Value: Not Sold, and Actual Value: Not Sold (True Negatives)

### Step - 6: Performance Metrics

#### Precision = True Positives/(True Positives + False Positives)
#### Recall = True Positives/ (True Positives + False Negatives)

In [37]:
from sklearn.metrics import precision_score, recall_score

In [38]:
precision_score(ym, y_pred)

0.6506550218340611

In [39]:
recall_score(ym, y_pred)

0.6478260869565218

In [40]:
from sklearn.metrics import roc_auc_score

In [41]:
roc_auc_score(ym, y_pred)

0.6789855072463769

In [42]:
precision_score(ym, y_pred_03)

0.5776566757493188

In [43]:
recall_score(ym, y_pred_03)

0.9217391304347826

In [44]:
roc_auc_score(ym, y_pred_03)

0.6800724637681159

### Step - 7: Train-Test-Split

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
X_train, X_test, y_train, y_test = train_test_split(Xm, ym, test_size = 0.2, random_state = 0)

In [47]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(404, 16) (102, 16) (404,) (102,)


In [48]:
clf_LR = LogisticRegression()
clf_LR.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
y_test_pred = clf_LR.predict(X_test)

In [50]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [51]:
confusion_matrix(y_test, y_test_pred)

array([[37, 21],
       [14, 30]], dtype=int64)

In [52]:
# From above matrix,
# 37 stands for Predicted Value: Sold, and Actual Value: Sold (True Positives)
# 21 stands for Predicted Value: Not sold, and Actual Value: Sold (False Positives)
# 14 stands for Predicted Value: Sold, and Actual Value: Not Sold (False Negatives)
# 30 stands for Predicted Value: Not Sold, and Actual Value: Not Sold (True Negatives)

In [53]:
accuracy_score(y_test, y_test_pred)

0.6568627450980392