In [1]:
import pandas as pd
import numpy as np
from statsmodels.api import Logit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [2]:
df = pd.read_csv('iris.csv')
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
df.shape

(150, 5)

In [6]:
df['Species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [9]:
df['is_setosa'] = (df['Species'] == 'setosa') + 0

In [10]:
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species,is_setosa
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1


In [11]:
df.tail()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species,is_setosa
145,6.7,3.0,5.2,2.3,virginica,0
146,6.3,2.5,5.0,1.9,virginica,0
147,6.5,3.0,5.2,2.0,virginica,0
148,6.2,3.4,5.4,2.3,virginica,0
149,5.9,3.0,5.1,1.8,virginica,0


In [12]:
df.iloc[:, :2].head()

Unnamed: 0,Sepal.Length,Sepal.Width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6


In [13]:
model = Logit(endog=df['is_setosa'], exog=df.iloc[:,:2]).fit()

Optimization terminated successfully.
         Current function value: 0.036374
         Iterations 11


In [14]:
model.summary()

0,1,2,3
Dep. Variable:,is_setosa,No. Observations:,150.0
Model:,Logit,Df Residuals:,148.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 12 Nov 2025",Pseudo R-squ.:,0.9429
Time:,09:17:29,Log-Likelihood:,-5.456
converged:,True,LL-Null:,-95.477
Covariance Type:,nonrobust,LLR p-value:,4.745e-41

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Sepal.Length,-7.5299,2.252,-3.343,0.001,-11.945,-3.115
Sepal.Width,13.1307,3.987,3.294,0.001,5.317,20.945


In [15]:
model.params

Sepal.Length    -7.529945
Sepal.Width     13.130734
dtype: float64

In [16]:
model.pvalues

Sepal.Length    0.000828
Sepal.Width     0.000989
dtype: float64

In [17]:
pred = model.predict(df.iloc[:3, :2])

In [18]:
pred

0    0.999477
1    0.923824
2    0.998678
dtype: float64

In [20]:
(pred > 0.5) + 0

0    1
1    1
2    1
dtype: int64

In [21]:
model_2 = LogisticRegression(random_state=123)

In [22]:
model_2.fit(X=df.iloc[:, :2], y=df['is_setosa'])

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,123
,solver,'lbfgs'
,max_iter,100


In [24]:
model_2.coef_

array([[-3.38828053,  3.1645602 ]])

In [27]:
model_2.intercept_

array([8.32315487])

In [30]:
model_2.predict_proba(df.iloc[:3, :2])

array([[0.10727482, 0.89272518],
       [0.22894801, 0.77105199],
       [0.0741358 , 0.9258642 ]])

In [31]:
pred = model_2.predict_proba(df.iloc[:3, :2])

In [32]:
pred = pred[:, 1]

In [33]:
pred

array([0.89272518, 0.77105199, 0.9258642 ])

In [36]:
(pred > 0.5) + 0

array([1, 1, 1])

In [40]:
pred = model_2.predict_proba(X=df.iloc[:, :2])
pred = pred[:, 1]
pred[:10]

array([0.89272518, 0.77105199, 0.9258642 , 0.92738525, 0.9412639 ,
       0.91437197, 0.97058999, 0.89484894, 0.93034138, 0.82211118])

In [41]:
from sklearn.metrics import roc_auc_score

In [43]:
roc_auc_score(y_true=df['is_setosa'], y_score=pred)

0.9999999999999999

In [46]:
accuracy_score(y_true=df['is_setosa'], y_pred= ((pred > 0.8) + 0))

0.9466666666666667

# Q1

In [47]:
import pandas as pd

In [48]:
df_q1 = pd.read_csv('diabetes.csv')

In [49]:
df_q1.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [50]:
from sklearn.model_selection import train_test_split

In [51]:
df_q1_sub = df_q1.loc[:, ['BloodPressure', 'Glucose', 'BMI', 'Insulin', 'Outcome']]

In [52]:
df_q1_sub.head()

Unnamed: 0,BloodPressure,Glucose,BMI,Insulin,Outcome
0,72,148,33.6,0,1
1,66,85,26.6,0,0
2,64,183,23.3,0,1
3,66,89,28.1,94,0
4,40,137,43.1,168,1


In [53]:
df_q1_train, df_q1_test = train_test_split(df_q1_sub, train_size=0.8, random_state=123)

In [54]:
from statsmodels.api import Logit

In [55]:
model_q1 = Logit(endog=df_q1_train['Outcome'], exog=df_q1_train.iloc[:, :4]).fit()

Optimization terminated successfully.
         Current function value: 0.626579
         Iterations 5


In [56]:
model_q1.summary()

0,1,2,3
Dep. Variable:,Outcome,No. Observations:,614.0
Model:,Logit,Df Residuals:,610.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 12 Nov 2025",Pseudo R-squ.:,0.02458
Time:,09:55:18,Log-Likelihood:,-384.72
converged:,True,LL-Null:,-394.41
Covariance Type:,nonrobust,LLR p-value:,0.0002275

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
BloodPressure,-0.0265,0.005,-5.569,0.000,-0.036,-0.017
Glucose,0.0114,0.003,4.206,0.000,0.006,0.017
BMI,-0.0041,0.011,-0.376,0.707,-0.025,0.017
Insulin,0.0008,0.001,1.057,0.290,-0.001,0.002


In [64]:
pred_q1 = model_q1.predict(exog=df_q1_test.iloc[:,:4])
pred_q1 = (pred_q1 > 0.5) + 0
pred_q1

236    0
395    1
36     0
210    0
483    0
      ..
650    0
579    1
119    0
593    0
310    0
Length: 154, dtype: int64

In [60]:
from sklearn.metrics import accuracy_score

In [69]:
round(accuracy_score(y_true=df_q1_test['Outcome'], y_pred=pred_q1), 2)

0.7

# Q2

In [71]:
import pandas as pd

In [72]:
df_q2 = pd.read_csv('diabetes.csv')
df_q2.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [73]:
df_q2_sub = df_q2.loc[:, ['Glucose', 'BMI', 'Age', 'Outcome']]

In [74]:
df_q2_sub.head()

Unnamed: 0,Glucose,BMI,Age,Outcome
0,148,33.6,50,1
1,85,26.6,31,0
2,183,23.3,32,1
3,89,28.1,21,0
4,137,43.1,33,1


In [75]:
from statsmodels.api import Logit

In [77]:
model = Logit(endog=df_q2_sub['Outcome'], exog=df_q2_sub.iloc[:, :3]).fit()

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


In [80]:
model.summary()

0,1,2,3
Dep. Variable:,Outcome,No. Observations:,768.0
Model:,Logit,Df Residuals:,765.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 12 Nov 2025",Pseudo R-squ.:,-0.01465
Time:,10:27:22,Log-Likelihood:,-504.02
converged:,True,LL-Null:,-496.74
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Glucose,0.0094,0.002,4.011,0.000,0.005,0.014
BMI,-0.0356,0.008,-4.447,0.000,-0.051,-0.020
Age,-0.0129,0.006,-2.056,0.040,-0.025,-0.001


In [78]:
pred = model.predict(exog=df_q2_sub.iloc[:, :3])

In [79]:
pred

0      0.387961
1      0.365506
2      0.615678
3      0.392087
4      0.336654
         ...   
763    0.261357
764    0.373590
765    0.453351
766    0.377879
767    0.375465
Length: 768, dtype: float64

In [81]:
model.params

Glucose    0.009368
BMI       -0.035639
Age       -0.012898
dtype: float64

In [82]:
import numpy as np

In [85]:
np.exp(model.params).round(2)

Glucose    1.01
BMI        0.96
Age        0.99
dtype: float64

In [86]:
model.pvalues

Glucose    0.000060
BMI        0.000009
Age        0.039770
dtype: float64

# Q3

In [87]:
import pandas as pd

In [88]:
df_q3 = pd.read_csv('diabetes.csv')
df_q3.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [89]:
df_q3_sub = df_q3.loc[:, ['Glucose', 'BMI', 'Age', 'Outcome']]

In [90]:
from statsmodels.api import Logit

In [96]:
model_q3 = Logit(endog=df_q3_sub['Outcome'], exog=df_q3_sub.iloc[:, :3]).fit()

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


In [98]:
pred_q3 = model_q3.predict(exog=df_q3_sub.iloc[:, :3])
pred_q3.head()

0    0.387961
1    0.365506
2    0.615678
3    0.392087
4    0.336654
dtype: float64

In [93]:
from sklearn.metrics import roc_auc_score

In [101]:
round(roc_auc_score(y_true=df_q3_sub['Outcome'], y_score=pred_q3), 2)

0.54