In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


**Synthetic Data**
1. Creation and Read
2. Adding Noise
3. Augmentation
4. Validation
5. Cleaning
6. Visualization
7. ***Modeling*** *is a process of converting a real time problem into a mathematical function that can learn patterns from data and make predictions on unseen data.*
8. ***Evaluation***

**Modeling pipeline**
1. Problem Definition
2. Feature Selection
3. Test-Train Split
4. Algorithm Selection
5. Model Training
6. Evaluation and Comparison
7. Final Model Selection

In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from sklearn.metrics import (accuracy_score, confusion_matrix,precision_score,recall_score,f1_score,roc_auc_score)


In [4]:
np.random.seed(41)
n=500


In [7]:
data=pd.DataFrame({
    'attendance_rate':np.random.normal(75,10,n).clip(40,100),
    'avg_marks':np.random.normal(80,40,n).clip(30,100),
    'assignments_submitted':np.random.normal(80,15,n).clip(20,100),
    'disciplinary_cases':np.random.poisson(1.2,n)
})
data

Unnamed: 0,attendance_rate,avg_marks,assignments_submitted,disciplinary_cases
0,81.833453,100.000000,64.976496,1
1,67.050270,49.217549,80.603346,2
2,81.210512,30.000000,79.004471,0
3,56.736692,30.000000,58.519475,3
4,68.994885,87.557292,84.267259,1
...,...,...,...,...
495,87.859943,100.000000,76.798137,0
496,69.401768,96.543050,73.362608,0
497,80.578119,100.000000,100.000000,1
498,79.392265,73.217405,93.460671,0


In [10]:
data["droput"]=(
    (data["attendance_rate"]<60)
    |(data["avg_marks"]<50)
    |(data["disciplinary_cases"]>3)
)
data

Unnamed: 0,attendance_rate,avg_marks,assignments_submitted,disciplinary_cases,droput
0,81.833453,100.000000,64.976496,1,False
1,67.050270,49.217549,80.603346,2,True
2,81.210512,30.000000,79.004471,0,True
3,56.736692,30.000000,58.519475,3,True
4,68.994885,87.557292,84.267259,1,False
...,...,...,...,...,...
495,87.859943,100.000000,76.798137,0,False
496,69.401768,96.543050,73.362608,0,False
497,80.578119,100.000000,100.000000,1,False
498,79.392265,73.217405,93.460671,0,False


In [23]:
x = data.drop("droput", axis=1)

x

Unnamed: 0,attendance_rate,avg_marks,assignments_submitted,disciplinary_cases
0,81.833453,100.000000,64.976496,1
1,67.050270,49.217549,80.603346,2
2,81.210512,30.000000,79.004471,0
3,56.736692,30.000000,58.519475,3
4,68.994885,87.557292,84.267259,1
...,...,...,...,...
495,87.859943,100.000000,76.798137,0
496,69.401768,96.543050,73.362608,0
497,80.578119,100.000000,100.000000,1
498,79.392265,73.217405,93.460671,0


In [21]:
y = data["droput"]
y

Unnamed: 0,droput
0,False
1,True
2,True
3,True
4,False
...,...
495,False
496,False
497,False
498,False


In [25]:
x_train=x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=41)
x_train

Unnamed: 0,attendance_rate,avg_marks,assignments_submitted,disciplinary_cases
296,77.055351,100.000000,87.154402,0
467,66.991958,100.000000,72.744736,1
336,83.635222,42.478097,79.650257,2
322,74.991320,60.482474,96.164173,0
143,66.287998,100.000000,71.530705,1
...,...,...,...,...
80,64.444508,84.182329,64.171751,1
482,92.006631,30.000000,81.547649,2
396,70.636240,78.594202,87.616249,0
419,77.180831,61.330493,60.122189,2


In [26]:
x_test =x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=41)
x_test

Unnamed: 0,attendance_rate,avg_marks,assignments_submitted,disciplinary_cases
196,67.252289,72.865738,71.878550,1
280,75.044495,94.442138,69.050930,0
388,68.257224,100.000000,82.230951,2
379,74.469597,75.424774,93.534572,2
335,71.902836,100.000000,86.540073,2
...,...,...,...,...
232,66.922134,44.358374,63.195780,2
291,76.785410,51.575228,69.050533,1
137,95.875263,92.593512,62.756576,0
56,69.795007,60.810845,83.623449,4


In [30]:
scalar = StandardScaler()
x_train_scaled = scalar.fit_transform(x_train)
x_test_scaled = scalar.fit_transform(x_test)
x

Unnamed: 0,attendance_rate,avg_marks,assignments_submitted,disciplinary_cases
0,81.833453,100.000000,64.976496,1
1,67.050270,49.217549,80.603346,2
2,81.210512,30.000000,79.004471,0
3,56.736692,30.000000,58.519475,3
4,68.994885,87.557292,84.267259,1
...,...,...,...,...
495,87.859943,100.000000,76.798137,0
496,69.401768,96.543050,73.362608,0
497,80.578119,100.000000,100.000000,1
498,79.392265,73.217405,93.460671,0


In [31]:
x_train_scaled

array([[ 0.18321483,  0.97077278,  0.66407911, -1.11613843],
       [-0.80053383,  0.97077278, -0.43001772, -0.14502031],
       [ 0.82643121, -1.28444867,  0.0943045 ,  0.82609781],
       ...,
       [-0.44428638,  0.13153047,  0.69914621, -1.11613843],
       [ 0.19548115, -0.54531584, -1.3884223 ,  0.82609781],
       [-0.101074  , -1.2658873 , -0.42360224, -1.11613843]])

In [33]:
models = {
    "Logistic Regression":LogisticRegression(max_iter=100),
    "Decision Tree":DecisionTreeClassifier(max_depth=5),
    "Random Forest":RandomForestClassifier(n_estimators=100),
    "Support Vector Machine":SVC(probability=True)
}


In [35]:
result=[]
for name,model in models.items():
  if name in ["Logistic Regression","Support Vector Machine"]:
    model.fit(x_train_scaled,y_train)
    y_pred=model.predict(x_test_scaled)
    y_prob=model.predict_proba(x_test_scaled)[:,1]
  else:
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    y_prob=model.predict_proba(x_test)[:,1]
result

[]

In [37]:
cm = confusion_matrix(y_test,y_pred)


In [38]:
result.append({
    "Model":name,
    "Accuracy":accuracy_score(y_test,y_pred),
    "Precision":precision_score(y_test,y_pred),
    "Recall":recall_score(y_test,y_pred),
    "F1 Score":f1_score(y_test,y_pred),
    "ROC AUC Score":roc_auc_score(y_test,y_prob),
    "Confusion Matrix":cm
})
result


[{'Model': 'Support Vector Machine',
  'Accuracy': 0.952,
  'Precision': 0.8529411764705882,
  'Recall': 0.9666666666666667,
  'F1 Score': 0.90625,
  'ROC AUC Score': np.float64(0.9943859649122808),
  'Confusion Matrix': array([[90,  5],
         [ 1, 29]])}]

In [39]:
result_df=pd.DataFrame(result)
result_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Confusion Matrix
0,Support Vector Machine,0.952,0.852941,0.966667,0.90625,0.994386,"[[90, 5], [1, 29]]"


In [46]:
for r in result:

  print(f"\nModel: {r['Model']}Confusion matrix")
  print(r["Confusion Matrix"])
  print


Model: Support Vector MachineConfusion matrix
[[90  5]
 [ 1 29]]
