## Basics of Logistic Regression

#### Import relevant libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

%matplotlib inline

#### Load the data

In [2]:
# Load the single-predictor admissions data (path is relative to this notebook).
df = pd.read_csv('../../Datasets/Admittance.csv')

# Preview the first rows to check the columns and raw label values.
df.head()

Unnamed: 0,SAT,Admitted
0,1363,No
1,1792,Yes
2,1954,Yes
3,1653,No
4,1593,No


In [3]:
# Build an encoded copy so the raw frame `df` stays untouched:
# 'No' -> 0 and 'Yes' -> 1 in the Admitted column.
df_new = df.assign(Admitted=df['Admitted'].map({'No': 0, 'Yes': 1}))
df_new.head(10)

Unnamed: 0,SAT,Admitted
0,1363,0
1,1792,1
2,1954,1
3,1653,0
4,1593,0
5,1755,1
6,1775,1
7,1887,1
8,1893,1
9,1580,0


#### Declare the predictors and target

In [4]:
# Target: the 0/1-encoded admission outcome.
y = df_new['Admitted']
# Single predictor: the SAT score.
x = df_new['SAT']

#### Regression

In [5]:
# Add an intercept column; sm.Logit does not add one on its own.
x_new = sm.add_constant(x)
log_reg = sm.Logit(y, x_new)  # NOTE: fit() caps the optimizer at 35 iterations by default (statsmodels maxiter) - confirm for the installed version

# Fit by maximum likelihood; convergence details are printed below the cell.
res = log_reg.fit()

Optimization terminated successfully.
         Current function value: 0.137766
         Iterations 10


#### Summary

In [6]:
# Coefficient table and fit statistics for the SAT-only model.
res.summary()

0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,166.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 21 Jul 2021",Pseudo R-squ.:,0.7992
Time:,15:03:47,Log-Likelihood:,-23.145
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.805000000000001e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-69.9128,15.737,-4.443,0.000,-100.756,-39.070
SAT,0.0420,0.009,4.454,0.000,0.024,0.060


### With two Predictors

In [7]:
# Load the admissions data that also carries a Gender column.
df_1 = pd.read_csv('../../Datasets/Admittance_with_Gender.csv')

# Preview the first rows to check the columns and raw label values.
df_1.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,No,Male
1,1792,Yes,Female
2,1954,Yes,Female
3,1653,No,Male
4,1593,No,Male


In [8]:
# Encode both categorical columns as 0/1 dummies.
# CAUTION: this overwrites df_1, so re-running the cell without reloading the
# CSV maps the already-encoded 0/1 values to NaN (they are not keys of the maps).
df_1 = df_1.assign(
    Admitted=df_1['Admitted'].map({"No": 0, "Yes": 1}),
    Gender=df_1['Gender'].map({"Male": 0, "Female": 1}),
)

df_1.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0


#### Declare the predictors and target

In [9]:
# Target: the 0/1-encoded admission outcome.
y = df_1['Admitted']
# Two predictors: the Gender dummy (Male=0, Female=1) and the SAT score.
x = df_1[['Gender', 'SAT']]

#### Regression

In [10]:
# Add an intercept column; sm.Logit does not add one on its own.
x_new = sm.add_constant(x)
log_reg = sm.Logit(y, x_new)  # NOTE: fit() caps the optimizer at 35 iterations by default (statsmodels maxiter) - confirm for the installed version

# Fit by maximum likelihood; convergence details are printed below the cell.
res = log_reg.fit()

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


#### Summary

In [11]:
# Coefficient table and fit statistics for the Gender + SAT model.
res.summary()

0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,165.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 21 Jul 2021",Pseudo R-squ.:,0.8249
Time:,15:03:49,Log-Likelihood:,-20.18
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.1180000000000006e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.3489,16.454,-4.154,0.000,-100.598,-36.100
Gender,1.9449,0.846,2.299,0.022,0.287,3.603
SAT,0.0406,0.010,4.129,0.000,0.021,0.060


In [12]:
# Odds ratio for Gender: exponentiate the fitted coefficient itself rather than
# a rounded value copied by hand from the summary table, so the number stays
# in sync with the model if the data or specification changes.
np.exp(res.params['Gender'])

6.992932526814459

**NOTE:** Holding the SAT score fixed, the odds of admission for a female applicant are about 7 times the odds for a male applicant (odds ratio = exp(1.9449) ≈ 6.99).

In [13]:
# Print every float in NumPy arrays with two decimals from here on
# (global setting: it also affects arrays displayed in later cells).
np.set_printoptions(formatter={'float': "{0:0.2f}".format})
# With no arguments, predict() scores the data the model was fitted on.
res.predict()

array([0.00, 1.00, 1.00, 0.23, 0.02, 0.99, 1.00, 1.00, 1.00, 0.01, 1.00,
       1.00, 0.76, 0.00, 0.60, 1.00, 0.11, 0.12, 0.51, 1.00, 1.00, 1.00,
       0.00, 0.01, 0.97, 1.00, 0.48, 0.99, 1.00, 0.99, 0.00, 0.83, 0.25,
       1.00, 1.00, 1.00, 0.31, 1.00, 0.23, 0.00, 0.02, 0.45, 1.00, 0.00,
       0.99, 0.00, 0.99, 0.00, 0.00, 0.01, 0.00, 1.00, 0.92, 0.02, 1.00,
       0.00, 0.37, 0.98, 0.12, 1.00, 0.00, 0.78, 1.00, 1.00, 0.98, 0.00,
       0.00, 0.00, 1.00, 0.00, 0.78, 0.12, 0.00, 0.99, 1.00, 1.00, 0.00,
       0.30, 1.00, 1.00, 0.00, 1.00, 1.00, 0.85, 1.00, 1.00, 0.00, 1.00,
       1.00, 0.89, 0.83, 0.00, 0.98, 0.97, 0.00, 1.00, 1.00, 0.03, 0.99,
       0.96, 1.00, 0.00, 1.00, 0.01, 0.01, 1.00, 1.00, 1.00, 0.00, 0.00,
       0.02, 0.33, 0.00, 1.00, 0.09, 0.00, 0.97, 0.00, 0.75, 1.00, 1.00,
       0.01, 0.01, 0.00, 1.00, 0.00, 0.99, 0.57, 0.54, 0.87, 0.83, 0.00,
       1.00, 0.00, 0.00, 0.00, 1.00, 0.04, 0.00, 0.01, 1.00, 0.99, 0.52,
       1.00, 1.00, 0.05, 0.00, 0.00, 0.00, 0.68, 1.

In [14]:
# Actual 0/1 outcomes, shown as an array for comparison with the predictions above.
df_1['Admitted'].to_numpy()

array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0], dtype=int64)

In [15]:
# Confusion matrix on the training data; the labelled cell that follows
# treats rows as actual classes and columns as predicted classes.
res.pred_table()

array([[69.00, 5.00],
       [4.00, 90.00]])

In [16]:
# Wrap the training confusion matrix in a labelled frame
# (rows = actual class, columns = predicted class).
cm_df = pd.DataFrame(
    res.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,69.0,5.0
Actual 1,4.0,90.0


In [17]:
# Accuracy = correctly classified observations (the diagonal) / all observations.
n_correct = cm_df.iloc[0, 0] + cm_df.iloc[1, 1]
n_total = cm_df.sum().sum()

# ANSI escape codes render the heading in bold.
print('\033[1m' + 'Accuracy' + '\033[0m')
print(n_correct / n_total)

[1mAccuracy[0m
0.9464285714285714


### Testing the model and assessing its accuracy

In [18]:
# Load the held-out test set; it has the same columns as the training file.
test_df = pd.read_csv('../../Datasets/Admittance_with_Gender_Test.csv')
test_df.head()

Unnamed: 0,SAT,Admitted,Gender
0,1323,No,Male
1,1725,Yes,Female
2,1762,Yes,Female
3,1777,Yes,Male
4,1665,No,Male


In [19]:
# Apply the same 0/1 encoding that was used for the training data.
# CAUTION: this overwrites test_df, so re-running the cell without reloading
# the CSV maps the already-encoded 0/1 values to NaN.
test_df = test_df.assign(
    Admitted=test_df['Admitted'].map({"No": 0, "Yes": 1}),
    Gender=test_df['Gender'].map({"Male": 0, "Female": 1}),
)

test_df.head()

Unnamed: 0,SAT,Admitted,Gender
0,1323,0,0
1,1725,1,1
2,1762,1,1
3,1777,1,0
4,1665,0,0


In [29]:
# Split the held-out set into target and predictors, keeping the predictor
# columns in the same order used to fit the model (Gender, SAT).
test_actual = test_df['Admitted']
test_data = test_df[['Gender', 'SAT']]
# has_constant='add' always prepends the intercept column. The default
# ('skip') returns the data unchanged when any column is already constant,
# e.g. a small test set containing a single gender, which would silently
# misalign the design matrix with the fitted coefficients.
x_test = sm.add_constant(test_data, has_constant='add')

In [36]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Build a 2x2 confusion matrix and the accuracy of a binary classifier.

    Parameters
    ----------
    data : array-like
        Inputs passed unchanged to ``model.predict``. They must be laid out
        like the data the model was fitted on (including any constant column).
    actual_values : array-like of 0/1
        True class labels, one per row of ``data``.
    model : object
        Any object with a ``predict(data)`` method returning probabilities
        in the closed interval [0, 1].
    threshold : float, default 0.5
        Probability cut-off; a prediction >= ``threshold`` counts as class 1.
        Must lie strictly between 0 and 1.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of observations whose actual class is
        ``i`` and whose predicted class is ``j``.
    accuracy : float
        ``(cm[0, 0] + cm[1, 1]) / cm.sum()``.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    pred = model.predict(data)
    # Labels are exactly 0 or 1, so their bin edges stay fixed at 0.5;
    # only the probability axis is split at the chosen threshold.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])
    # histogram2d counts (actual, predicted) pairs per cell; [0] keeps the counts.
    cm = np.histogram2d(actual_values, pred, bins=[actual_bins, pred_bins])[0]
    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
    return cm, accuracy

In [38]:
# Evaluate the fitted model on the held-out set.
# `cm` is a (matrix, accuracy) tuple, so cm[0] is the 2x2 count matrix.
cm = confusion_matrix(x_test, test_actual, res)
cm

(array([[5.00, 1.00],
        [1.00, 12.00]]),
 0.8947368421052632)

In [39]:
# Wrap the test-set confusion matrix in a labelled frame
# (rows = actual class, columns = predicted class).
cm_df = pd.DataFrame(
    cm[0],
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5.0,1.0
Actual 1,1.0,12.0
