In [1]:
!pip install numpy pandas scikit-learn xgboost

Collecting numpy
  Obtaining dependency information for numpy from https://files.pythonhosted.org/packages/8b/09/4ffb4d6cfe7ca6707336187951992bd8a8b9142cf345d87ab858d2d7636a/numpy-2.2.5-cp312-cp312-win_amd64.whl.metadata
  Downloading numpy-2.2.5-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------------ ------------------------- 20.5/60.8 kB 640.0 kB/s eta 0:00:01
     ------------------------- ------------ 41.0/60.8 kB 653.6 kB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 648.8 kB/s eta 0:00:00
Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonho


[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import warnings

try:
    df = pd.read_csv('/content/personalized_learning_dataset.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: personalized_learning_dataset.csv not found.")
    exit()

df = df.drop('Student_ID', axis=1)

if 'Final_Exam_Score' in df.columns:
    df = df.drop('Final_Exam_Score', axis=1)
    print("Dropped 'Final_Exam_Score' to prevent data leakage.")
else:
    print("'Final_Exam_Score' column not found.")

df['Dropout_Likelihood'] = df['Dropout_Likelihood'].map({'No': 0, 'Yes': 1})

X = df.drop('Dropout_Likelihood', axis=1)
y = df['Dropout_Likelihood']

catFeatures = X.select_dtypes(include=['object', 'category']).columns.tolist()
numFeatures = X.select_dtypes(include=np.number).columns.tolist()

print(f"\nNumerical features: {numFeatures}")
print(f"Categorical features: {catFeatures}")

XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
print(f"\nData split: {len(XTrain)} training samples, {len(XTest)} testing samples.")
print(f"Dropout rate in training set: {yTrain.mean():.2%}")
print(f"Dropout rate in testing set: {yTest.mean():.2%}")

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numFeatures),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), catFeatures)
    ],
    remainder='passthrough'
)

models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000, solver='liblinear'),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
}

results = {}

print("\n--- Training and Evaluating Models ---")

for name, model in models.items():
    print(f"\nTraining {name}...")
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    pipeline.fit(XTrain, yTrain)
    yPred = pipeline.predict(XTest)
    yProba = pipeline.predict_proba(XTest)[:, 1]

    acc = accuracy_score(yTest, yPred)
    prec = precision_score(yTest, yPred)
    rec = recall_score(yTest, yPred)
    f1 = f1_score(yTest, yPred)
    auc = roc_auc_score(yTest, yProba)

    results[name] = {
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-Score': f1,
        'AUC-ROC': auc
    }

    print(f"{name} Evaluation:")
    print(f"  Accuracy:  {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall:    {rec:.4f}")
    print(f"  F1-Score:  {f1:.4f}")
    print(f"  AUC-ROC:   {auc:.4f}")

print("\n--- Model Comparison ---")
resultsDf = pd.DataFrame(results).T
print(resultsDf)

bestAccModel = resultsDf['Accuracy'].idxmax()
bestF1Model = resultsDf['F1-Score'].idxmax()
bestAucModel = resultsDf['AUC-ROC'].idxmax()

print(f"\nBest Model based on Accuracy:  {bestAccModel} (Accuracy: {resultsDf.loc[bestAccModel, 'Accuracy']:.4f})")
print(f"Best Model based on F1-Score:  {bestF1Model} (F1-Score: {resultsDf.loc[bestF1Model, 'F1-Score']:.4f})")
print(f"Best Model based on AUC-ROC:   {bestAucModel} (AUC-ROC: {resultsDf.loc[bestAucModel, 'AUC-ROC']:.4f})")

print("\nRecommendation:")
if bestF1Model == bestAucModel:
    print(f"{bestF1Model} appears to be the best overall.")
elif bestAccModel in [bestF1Model, bestAucModel]:
    print(f"{bestAccModel} performed best on accuracy, but consider {bestF1Model} or {bestAucModel} for balanced performance.")
else:
    print(f"{bestAccModel} leads in accuracy, {bestF1Model} in F1, and {bestAucModel} in AUC. Prefer {bestF1Model} or {bestAucModel} for balanced results.")

print("\nNote: Hyperparameter tuning might further improve model performance.")


Error: personalized_learning_dataset.csv not found.


NameError: name 'df' is not defined

: 

In [None]:
# Install matplotlib if not already installed
%pip install matplotlib

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Assuming 'pipeline' is your trained pipeline
y_pred = pipeline.predict(XTest)

cm = confusion_matrix(yTest, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=pipeline.classes_)
disp.plot()
plt.title('Confusion Matrix')
plt.show()

Collecting matplotlib
  Using cached matplotlib-3.10.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.2-cp312-cp312-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Using cached fonttools-4.57.0-cp312-cp312-win_amd64.whl.metadata (104 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.8-cp312-cp312-win_amd64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib)
  Using cached pillow-11.2.1-cp312-cp312-win_amd64.whl.metadata (9.1 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Using cached pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Using cached matplotlib-3.10.1-cp312-cp312-win_amd64.whl (8.1 MB)
Using cached contourpy-1.3.2-cp312-cp312-win_amd64.whl (223 kB)
Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Using cached fonttools

KeyboardInterrupt: 

In [None]:
import pandas as pd

# Assuming 'df' is your DataFrame and 'Dropout_Likelihood' is your target variable column
class_counts = df['Dropout_Likelihood'].value_counts()

print(class_counts)

Dropout_Likelihood
0    8043
1    1957
Name: count, dtype: int64
