In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [3]:
raw_data = pd.read_csv('2.01.+Admittance.csv')

In [4]:
raw_data.head()

Unnamed: 0,SAT,Admitted
0,1363,No
1,1792,Yes
2,1954,Yes
3,1653,No
4,1593,No


In [7]:
data = raw_data.copy()
data['Admitted'] = data['Admitted'].map({'No': 0, 'Yes': 1})

In [8]:
data.head()

Unnamed: 0,SAT,Admitted
0,1363,0
1,1792,1
2,1954,1
3,1653,0
4,1593,0


In [9]:
y = data['Admitted']
x1 = data['SAT']
x = sm.add_constant(x1)

In [12]:
reg = sm.Logit(y,x)
result = reg.fit()

Optimization terminated successfully.
         Current function value: 0.137766
         Iterations 10


In [13]:
result.summary()

0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,166.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 18 Oct 2023",Pseudo R-squ.:,0.7992
Time:,14:38:56,Log-Likelihood:,-23.145
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.805000000000001e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-69.9128,15.737,-4.443,0.000,-100.756,-39.070
SAT,0.0420,0.009,4.454,0.000,0.024,0.060


In [14]:
# We have succesfully created our first logistic regression.

In [15]:
# Now we want to create a logistic regression that involves cateorical data.

In [16]:
new_data = pd.read_csv('2.02.+Binary+predictors.csv')

In [38]:
data2 = new_data.copy()

In [39]:
data2.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,No,Male
1,1792,Yes,Female
2,1954,Yes,Female
3,1653,No,Male
4,1593,No,Male


In [40]:
data2['Admitted'] = data2['Admitted'].map({'No':0, 'Yes':1})
data2['Gender'] = data2['Gender'].map({'Male':0, 'Female':1})

In [41]:
data2.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0


In [43]:
y = data2['Admitted']
x2 = data2['Gender']
k = sm.add_constant(x2)

In [49]:
reg2 = sm.Logit(y,k)
result2 = reg2.fit()
result2.summary()

Optimization terminated successfully.
         Current function value: 0.572260
         Iterations 5


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,166.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 18 Oct 2023",Pseudo R-squ.:,0.1659
Time:,15:24:46,Log-Likelihood:,-96.14
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,6.283e-10

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6436,0.222,-2.901,0.004,-1.078,-0.209
Gender,2.0786,0.363,5.727,0.000,1.367,2.790


In [50]:
# We can see that gender is a significant predictor from its p-value and from its coefficient we calculate using the logit 
# formular that female are 8 more times likely to get admitted than male.

In [51]:
# Now we want to explore a logistic regression with 2 variables

In [55]:
y = data2['Admitted']
x3 = data2[['SAT', 'Gender']]
L = sm.add_constant(x3)

In [57]:
# what we have is better model judging from log-likelihood 

In [58]:
# using the method pred_table we can summarize comparising of the predicticted and training data to determine the accuracy of
# the model.

In [59]:
result3.pred_table()

array([[69.,  5.],
       [ 4., 90.]])

In [60]:
# This table is called the confusion matrix, we can use this to calculate the accuracy of the model

In [73]:
def confusion_matrix(data, actual_values,model):
    pred_values = model.predict(data)
    bins=np.array([0,0.5,1])
    cm = np.histogram2d(actual_values, pred_values, bins=bins)[0]
    accuracy = (cm[0,0]+cm[1,1])/cm.sum()
    return cm, accuracy

In [74]:
test = result.predict()
test

array([0.34444444, 0.80769231, 0.80769231, 0.34444444, 0.34444444,
       0.80769231, 0.80769231, 0.80769231, 0.80769231, 0.34444444,
       0.80769231, 0.80769231, 0.80769231, 0.34444444, 0.34444444,
       0.80769231, 0.34444444, 0.34444444, 0.80769231, 0.80769231,
       0.80769231, 0.34444444, 0.80769231, 0.80769231, 0.80769231,
       0.34444444, 0.80769231, 0.34444444, 0.80769231, 0.80769231,
       0.34444444, 0.80769231, 0.34444444, 0.34444444, 0.34444444,
       0.34444444, 0.34444444, 0.80769231, 0.34444444, 0.80769231,
       0.34444444, 0.80769231, 0.34444444, 0.34444444, 0.34444444,
       0.34444444, 0.80769231, 0.34444444, 0.34444444, 0.34444444,
       0.34444444, 0.34444444, 0.80769231, 0.80769231, 0.80769231,
       0.34444444, 0.34444444, 0.80769231, 0.34444444, 0.80769231,
       0.34444444, 0.34444444, 0.80769231, 0.34444444, 0.34444444,
       0.34444444, 0.34444444, 0.34444444, 0.80769231, 0.34444444,
       0.34444444, 0.34444444, 0.80769231, 0.34444444, 0.34444

In [76]:
cm = confusion_matrix(L,test,result3)
cm

(array([[59., 31.],
        [14., 64.]]),
 0.7321428571428571)

In [None]:
# This new table contains the comfusion matrix and the model accuracy