In [1]:

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from sklearn.metrics import(confusion_matrix, 
accuracy_score, precision_score, recall_score,f1_score,roc_auc_score)

In [2]:
np.random.seed(42)
n = 500

In [3]:
data = pd.DataFrame({
    "attendance_rate":np.random.normal(75,10,n).clip(40,100),
    "avg_marks":np.random.normal(80,40,n).clip(30,100),
    "assignments_submitted":np.random.normal(80,15,n).clip(20,100),
    "disciplinary_cases":np.random.poisson(1.2,n)
})

In [4]:
data

Unnamed: 0,attendance_rate,avg_marks,assignments_submitted,disciplinary_cases
0,79.967142,100.000000,100.000000,0
1,73.617357,100.000000,93.869505,2
2,81.476885,30.000000,80.894456,0
3,90.230299,100.000000,70.295948,1
4,72.658466,53.974297,90.473350,2
...,...,...,...,...
495,80.389100,68.755988,100.000000,2
496,64.627538,100.000000,100.000000,0
497,73.096613,100.000000,98.125493,0
498,66.243817,57.152840,95.360938,2


In [5]:
data["dropout"] =(
    (data["attendance_rate"]<60) |
    (data["avg_marks"]<50) |
    (data["disciplinary_cases"]>3)
).astype(int)

In [6]:
data

Unnamed: 0,attendance_rate,avg_marks,assignments_submitted,disciplinary_cases,dropout
0,79.967142,100.000000,100.000000,0,0
1,73.617357,100.000000,93.869505,2,0
2,81.476885,30.000000,80.894456,0,1
3,90.230299,100.000000,70.295948,1,0
4,72.658466,53.974297,90.473350,2,0
...,...,...,...,...,...
495,80.389100,68.755988,100.000000,2,0
496,64.627538,100.000000,100.000000,0,0
497,73.096613,100.000000,98.125493,0,0
498,66.243817,57.152840,95.360938,2,0


In [7]:
X = data.drop("dropout", axis=1)

In [None]:
X

Unnamed: 0,attendance_rate,avg_marks,assignments_submitted,disciplinary_cases
0,79.967142,100.000000,100.000000,0
1,73.617357,100.000000,93.869505,2
2,81.476885,30.000000,80.894456,0
3,90.230299,100.000000,70.295948,1
4,72.658466,53.974297,90.473350,2
...,...,...,...,...
495,80.389100,68.755988,100.000000,2
496,64.627538,100.000000,100.000000,0
497,73.096613,100.000000,98.125493,0
498,66.243817,57.152840,95.360938,2


In [9]:
y = data["dropout"]

In [10]:
y

0      0
1      0
2      1
3      0
4      0
      ..
495    0
496    0
497    0
498    0
499    0
Name: dropout, Length: 500, dtype: int64

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [12]:
X_train

Unnamed: 0,attendance_rate,avg_marks,assignments_submitted,disciplinary_cases
227,64.222552,93.903268,77.794970,4
417,76.156746,87.464365,84.028884,0
203,85.538021,30.000000,88.149470,1
126,65.094637,40.415807,84.539532,2
329,81.283455,68.728616,88.998931,1
...,...,...,...,...
106,93.861859,49.048432,89.191609,1
270,89.412733,100.000000,96.369652,1
348,67.815558,54.490401,59.496820,2
435,75.740948,59.917831,84.238699,0


In [13]:
X_test

Unnamed: 0,attendance_rate,avg_marks,assignments_submitted,disciplinary_cases
361,90.327389,30.000000,88.689372,1
73,90.646437,71.235979,60.755117,2
374,96.531825,78.674921,100.000000,1
155,67.856486,82.368737,95.147256,0
104,73.387143,96.537396,96.622742,0
...,...,...,...,...
220,98.146586,92.312071,62.186024,0
176,75.130019,35.576966,54.605648,2
320,75.969960,59.308462,80.849749,0
153,77.322537,35.738964,60.153653,1


In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [15]:
X_train_scaled

array([[-1.0983454 ,  0.75933133, -0.19133222,  2.40723973],
       [ 0.11235364,  0.49741138,  0.25852556, -1.06474065],
       [ 1.06406428, -1.84010839,  0.55587931, -0.19674555],
       ...,
       [-0.7338425 , -0.84389455, -1.51178115,  0.67124954],
       [ 0.07017174, -0.62311902,  0.27366644, -1.06474065],
       [-0.35267257,  1.00733252,  1.41104875,  1.53924463]],
      shape=(375, 4))

In [16]:
models = {
    "Logistic Regression":LogisticRegression(max_iter=1000),
    "Decision Tree":DecisionTreeClassifier(max_depth=5),
    "Random Forest":RandomForestClassifier(n_estimators=100),
    "SVM":SVC(probability=True)
}

In [17]:
result=[]

for name, model in models.items():
    if name in["Logistic Regression","SVM"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_prob = model.predict_proba(X_test_scaled)[:1]
    else:
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:1]



In [18]:
cm =  confusion_matrix(y_test,y_pred)

In [19]:
result.append({
    "Model": name,
    "Accuracy":accuracy_score(y_test,y_pred),
    "Precision":precision_score(y_test,y_pred),
    "Recall":recall_score(y_test,y_pred),
    "FI-Score":f1_score(y_test,y_pred),
    "ROC-AUC":roc_auc_score(y_test,y_pred),
    "Confusion Matrix":cm
})

In [20]:
result_df = pd.DataFrame(result)
print(result_df)

  Model  Accuracy  Precision    Recall  FI-Score   ROC-AUC    Confusion Matrix
0   SVM     0.944        1.0  0.810811  0.895522  0.905405  [[88, 0], [7, 30]]


In [21]:
for r in result:
    print(f"\n{r['Model']}Confusion Matrix:")
    print(r["Confusion Matrix"])


SVMConfusion Matrix:
[[88  0]
 [ 7 30]]
