In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import statsmodels.api as sm

In [3]:
# Load the data the model is fitted on; the bare expression renders the frame as the cell output.
raw_data = pd.read_csv('2.02. Binary predictors.csv')
raw_data

Unnamed: 0,SAT,Admitted,Gender
0,1363,No,Male
1,1792,Yes,Female
2,1954,Yes,Female
3,1653,No,Male
4,1593,No,Male
...,...,...,...
163,1722,Yes,Female
164,1750,Yes,Male
165,1555,No,Male
166,1524,No,Male


In [4]:
# Recode both categorical columns as 0/1 dummies on a new frame; raw_data itself is left as loaded.
# Male -> 0 follows the common convention of using male as the reference category.
data = raw_data.assign(
    Admitted=lambda frame: frame['Admitted'].map({'Yes': 1, 'No': 0}),
    Gender=lambda frame: frame['Gender'].map({'Female': 1, 'Male': 0}),
)
data

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0
...,...,...,...
163,1722,1,1
164,1750,1,0
165,1555,0,0
166,1524,0,0


In [5]:
# Dependent variable: the 0/1 admission outcome.
y = data['Admitted']
# Independent variables: SAT score plus the 0/1 gender dummy.
x1 = data[['SAT', 'Gender']]

In [6]:
# Add an explicit intercept column (the `const` row in the summary comes from it).
x =sm.add_constant(x1)
# Specify the logistic regression, then fit it (the summary reports Method: MLE).
regLog = sm.Logit(y, x)
resultsLog = regLog.fit()
# Coefficient table and fit statistics.
resultsLog.summary()

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,165.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 18 Jun 2025",Pseudo R-squ.:,0.8249
Time:,16:43:53,Log-Likelihood:,-20.18
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.1180000000000006e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.3489,16.454,-4.154,0.000,-100.598,-36.100
SAT,0.0406,0.010,4.129,0.000,0.021,0.060
Gender,1.9449,0.846,2.299,0.022,0.287,3.603


In [7]:
# 1.9449 is the Gender coefficient copied from the summary table; exponentiating a
# logit coefficient gives an odds ratio (female vs. male at the same SAT score).
np.exp(1.9449)

6.992932526814459

With this, we can see that, for the same SAT score, the odds of being admitted are about 7 times higher for females than for males (an odds ratio of ≈ 6.99; this is a ratio of odds, not a 7% increase in probability). For the full mathematical reasoning, check Lecture 246; screenshots aren't working right now for some reason.

## Accuracy

In [8]:
# Confusion matrix on the training data: rows are actual classes, columns are predicted classes.
resultsLog.pred_table()

array([[69.,  5.],
       [ 4., 90.]])

In [9]:
# Label the training confusion matrix so it reads as actual (rows) vs. predicted (columns).
# Stored as `train_cm_df` rather than `confusionMatrix`, so this table never shadows the
# `confusionMatrix` helper function, whichever order the cells are run in.
train_cm_df = pd.DataFrame(
    resultsLog.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
train_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,69.0,5.0
Actual 1,4.0,90.0


In 69 cases the model correctly predicted 0 (not admitted), and in 90 cases it correctly predicted 1 (admitted). In the remaining 9 cases (5 + 4) the model was wrong. Hence, our training accuracy is 159/168 ≈ 94.6%.

## Testing the model

In [10]:
# Load the separate test dataset; it has the same columns as the training file.
test = pd.read_csv('2.03. Test dataset.csv')
test

Unnamed: 0,SAT,Admitted,Gender
0,1323,No,Male
1,1725,Yes,Female
2,1762,Yes,Female
3,1777,Yes,Male
4,1665,No,Male
5,1556,Yes,Female
6,1731,Yes,Female
7,1809,Yes,Female
8,1930,Yes,Female
9,1708,Yes,Male


In [11]:
# Apply the same 0/1 encoding that was used for the training data.
# The encoded frame is built separately and validated before `test` is rebound:
# re-running this cell on already-encoded data (or hitting an unexpected label)
# now fails loudly instead of silently turning the columns into NaN.
test_encoded = test.assign(
    Admitted=test['Admitted'].map({'Yes': 1, 'No': 0}),
    Gender=test['Gender'].map({'Female': 1, 'Male': 0}),
)
assert test_encoded[['Admitted', 'Gender']].notna().all().all(), (
    "Unmapped values in 'Admitted'/'Gender' - reload the test CSV before re-running this cell"
)
test = test_encoded
test

Unnamed: 0,SAT,Admitted,Gender
0,1323,0,0
1,1725,1,1
2,1762,1,1
3,1777,1,0
4,1665,0,0
5,1556,1,1
6,1731,1,1
7,1809,1,1
8,1930,1,1
9,1708,1,0


In [12]:
# Show the training design matrix; the test inputs must use the same columns in the same order.
x

Unnamed: 0,const,SAT,Gender
0,1.0,1363,0
1,1.0,1792,1
2,1.0,1954,1
3,1.0,1653,0
4,1.0,1593,0
...,...,...,...
163,1.0,1722,1
164,1.0,1750,0
165,1.0,1555,0
166,1.0,1524,0


In [13]:
# Keep the true labels aside so they can be compared with the model's predictions.
test_actual = test['Admitted']
# has_constant='add' forces the intercept column even when a small test set happens to
# contain a single-valued column (the default, 'skip', would then add no constant at all).
test_data = sm.add_constant(test.drop(columns=['Admitted']), has_constant='add')
# The model was fitted on `x`, so select exactly its columns, in its order.
test_data = test_data[x.columns]
test_data

Unnamed: 0,const,SAT,Gender
0,1.0,1323,0
1,1.0,1725,1
2,1.0,1762,1
3,1.0,1777,0
4,1.0,1665,0
5,1.0,1556,1
6,1.0,1731,1
7,1.0,1809,1
8,1.0,1930,1
9,1.0,1708,0


In [15]:
def confusionMatrix(data, actual_values, model, threshold=0.5):
    """Build a 2x2 confusion matrix and the accuracy of a binary classifier.

    Parameters
    ----------
    data : array-like
        Inputs passed unchanged to ``model.predict``.
    actual_values : array-like
        True class labels, expected to be 0 or 1.
    model : object
        Anything exposing ``predict(data)`` that returns one score in [0, 1]
        per observation.
    threshold : float, default 0.5
        Scores greater than or equal to ``threshold`` count as class 1,
        scores below it as class 0.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of observations whose actual class is ``i``
        and whose predicted class is ``j``.
    accuracy : float
        Share of correct predictions; NaN when no observation was counted.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    pred_values = model.predict(data)
    # Separate bin edges per axis: the actual labels are always split at 0.5,
    # only the predicted scores are split at the requested threshold.
    actual_edges = np.array([0, 0.5, 1])
    pred_edges = np.array([0, threshold, 1])
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_edges, pred_edges])[0]

    total = cm.sum()
    # Avoid a 0/0 runtime warning when nothing was counted.
    accuracy = (cm[0, 0] + cm[1, 1]) / total if total else float('nan')
    return cm, accuracy

In [21]:
# Evaluate the fitted model on the test set; cm2 is a (confusion matrix, accuracy) tuple.
cm2 = confusionMatrix(test_data, test_actual, resultsLog)
cm2

(array([[ 5.,  1.],
        [ 1., 12.]]),
 0.8947368421052632)

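We can see the test accuracy is 89.47% (17 of 19 correct), which is lower than the training accuracy of about 94.6%. Some drop on unseen data is expected, because the model was fitted to the training set. Since the gap is small and the test accuracy is still high, there is no strong sign that the model is overfitted.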

In [20]:
# Wrap the test confusion matrix (first element of cm2) in a labelled frame:
# rows are the actual classes, columns the predicted ones.
cm2_df = pd.DataFrame(
    cm2[0],
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm2_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5.0,1.0
Actual 1,1.0,12.0
