In [25]:
import numpy as np 
import pandas as pd 
 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC




from sklearn.metrics import(confusion_matrix, accuracy_score,precision_score, recall_score, f1_score, roc_auc_score )


In [26]:
np.random.seed(42)
n = 500


In [27]:
data = pd.DataFrame({
    "attendance_rate":np.random.normal(75,10,n).clip(40,100),
    "avg_marks":np.random.normal(80,40,n).clip(30,100),
    "assignment_submitted":np.random.normal(80,15,n).clip(20,100),
    "disciplinary_cases":np.random.poisson(1.2,n)
})

In [28]:
data

Unnamed: 0,attendance_rate,avg_marks,assignment_submitted,disciplinary_cases
0,79.967142,100.000000,100.000000,0
1,73.617357,100.000000,93.869505,2
2,81.476885,30.000000,80.894456,0
3,90.230299,100.000000,70.295948,1
4,72.658466,53.974297,90.473350,2
...,...,...,...,...
495,80.389100,68.755988,100.000000,2
496,64.627538,100.000000,100.000000,0
497,73.096613,100.000000,98.125493,0
498,66.243817,57.152840,95.360938,2


In [29]:
data["dropout"] = (
    (data["attendance_rate"]<60) |
    (data["avg_marks"]<50) |
    (data["disciplinary_cases"]>3)
)
data

Unnamed: 0,attendance_rate,avg_marks,assignment_submitted,disciplinary_cases,dropout
0,79.967142,100.000000,100.000000,0,False
1,73.617357,100.000000,93.869505,2,False
2,81.476885,30.000000,80.894456,0,True
3,90.230299,100.000000,70.295948,1,False
4,72.658466,53.974297,90.473350,2,False
...,...,...,...,...,...
495,80.389100,68.755988,100.000000,2,False
496,64.627538,100.000000,100.000000,0,False
497,73.096613,100.000000,98.125493,0,False
498,66.243817,57.152840,95.360938,2,False


In [30]:
x = data.drop("dropout",axis = 1)
data

Unnamed: 0,attendance_rate,avg_marks,assignment_submitted,disciplinary_cases,dropout
0,79.967142,100.000000,100.000000,0,False
1,73.617357,100.000000,93.869505,2,False
2,81.476885,30.000000,80.894456,0,True
3,90.230299,100.000000,70.295948,1,False
4,72.658466,53.974297,90.473350,2,False
...,...,...,...,...,...
495,80.389100,68.755988,100.000000,2,False
496,64.627538,100.000000,100.000000,0,False
497,73.096613,100.000000,98.125493,0,False
498,66.243817,57.152840,95.360938,2,False


In [31]:
y = data["dropout"]
data

Unnamed: 0,attendance_rate,avg_marks,assignment_submitted,disciplinary_cases,dropout
0,79.967142,100.000000,100.000000,0,False
1,73.617357,100.000000,93.869505,2,False
2,81.476885,30.000000,80.894456,0,True
3,90.230299,100.000000,70.295948,1,False
4,72.658466,53.974297,90.473350,2,False
...,...,...,...,...,...
495,80.389100,68.755988,100.000000,2,False
496,64.627538,100.000000,100.000000,0,False
497,73.096613,100.000000,98.125493,0,False
498,66.243817,57.152840,95.360938,2,False


In [32]:
x_train=x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=41)
x_train

Unnamed: 0,attendance_rate,avg_marks,assignment_submitted,disciplinary_cases
296,83.995999,78.417794,93.786147,1
467,82.575077,100.000000,85.920073,4
336,66.745028,100.000000,70.798960,1
322,66.817793,100.000000,78.942518,2
143,76.846339,30.000000,81.883647,1
...,...,...,...,...
80,72.803281,93.539856,74.770218,1
482,58.935537,41.320954,100.000000,2
396,57.868655,73.315277,59.793106,6
419,75.675185,87.763600,87.617536,1


In [33]:
x_test =x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=41)
x_test

Unnamed: 0,attendance_rate,avg_marks,assignment_submitted,disciplinary_cases
196,66.161426,93.045321,79.705433,1
280,76.135173,87.108040,82.003114,2
388,84.504238,100.000000,80.480062,1
379,66.917017,100.000000,100.000000,0
335,73.529426,89.368589,74.943710,1
...,...,...,...,...
232,75.455718,96.330110,78.675769,2
291,70.069991,88.307507,100.000000,0
137,71.779385,82.932719,61.585883,1
56,66.607825,60.495751,79.616689,0


In [34]:
scalar = StandardScaler()
x_train_scaled = scalar.fit_transform(x_train)
x_test_scaled = scalar.fit_transform(x_test)
x

Unnamed: 0,attendance_rate,avg_marks,assignment_submitted,disciplinary_cases
0,79.967142,100.000000,100.000000,0
1,73.617357,100.000000,93.869505,2
2,81.476885,30.000000,80.894456,0
3,90.230299,100.000000,70.295948,1
4,72.658466,53.974297,90.473350,2
...,...,...,...,...
495,80.389100,68.755988,100.000000,2
496,64.627538,100.000000,100.000000,0
497,73.096613,100.000000,98.125493,0
498,66.243817,57.152840,95.360938,2


In [35]:
x_train_scaled

array([[ 0.90114639,  0.19497553,  0.93614465, -0.15358103],
       [ 0.75347482,  1.05660719,  0.36504504,  2.54608553],
       [-0.89168841,  1.05660719, -0.73279123, -0.15358103],
       ...,
       [-1.81417975, -0.0087335 , -1.53184791,  4.34586323],
       [ 0.03639245,  0.56809041,  0.48828576, -0.15358103],
       [-0.56782165,  0.18534467,  1.28377439, -1.05346988]],
      shape=(375, 4))

In [36]:
models = {
    "Logistic Regression":LogisticRegression(max_iter=100),
    "Decision Tree":DecisionTreeClassifier(max_depth=5),
    "Random Forest":RandomForestClassifier(n_estimators=100),
    "Support Vector Machine":SVC(probability=True)
}


In [37]:
result=[]
for name,model in models.items():
  if name in ["Logistic Regression","Support Vector Machine"]:
    model.fit(x_train_scaled,y_train)
    y_pred=model.predict(x_test_scaled)
    y_prob=model.predict_proba(x_test_scaled)[:,1]
  else:
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    y_prob=model.predict_proba(x_test)[:,1]
result

[]

In [38]:
cm = confusion_matrix(y_test,y_pred)

In [39]:
result.append({
    "Model":name,
    "Accuracy":accuracy_score(y_test,y_pred),
    "Precision":precision_score(y_test,y_pred),
    "Recall":recall_score(y_test,y_pred),
    "F1 Score":f1_score(y_test,y_pred),
    "ROC AUC Score":roc_auc_score(y_test,y_prob),
    "Confusion Matrix":cm
})
result

[{'Model': 'Support Vector Machine',
  'Accuracy': 0.96,
  'Precision': 0.9142857142857143,
  'Recall': 0.9411764705882353,
  'F1 Score': 0.927536231884058,
  'ROC AUC Score': 0.9873949579831932,
  'Confusion Matrix': array([[88,  3],
         [ 2, 32]])}]

In [40]:
result_df=pd.DataFrame(result)
result_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Confusion Matrix
0,Support Vector Machine,0.96,0.914286,0.941176,0.927536,0.987395,"[[88, 3], [2, 32]]"


In [41]:
for r in result:

  print(f"\nModel: {r['Model']}Confusion matrix")
  print(r["Confusion Matrix"])
  print


Model: Support Vector MachineConfusion matrix
[[88  3]
 [ 2 32]]
