In [8]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [9]:
# Read the semicolon-separated dataset and preview its first rows
df = pd.read_csv("data.csv", sep=";")
print(df.head(5))

   Marital status  Application mode  Application order  Course  \
0               1                17                  5     171   
1               1                15                  1    9254   
2               1                 1                  5    9070   
3               1                17                  2    9773   
4               2                39                  1    8014   

   Daytime/evening attendance\t  Previous qualification  \
0                             1                       1   
1                             1                       1   
2                             1                       1   
3                             1                       1   
4                             0                       1   

   Previous qualification (grade)  Nacionality  Mother's qualification  \
0                           122.0            1                      19   
1                           160.0            1                       1   
2                         

In [10]:
# Preprocessing: keep only the rows in which every column has a value
df = df.dropna(axis="index", how="any")



In [15]:
# Restore the previously saved encoders from disk
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
label_encoders = joblib.load(filename="label_encoders.pkl")

In [16]:
# Encode categorical columns with the previously fitted encoders.
# Object columns that have no saved encoder are left untouched.
for col in df.select_dtypes(include=["object"]).columns:
    le = label_encoders.get(col)
    if le is not None:
        df[col] = le.transform(df[col])  # Reuse the saved mapping; do not refit

# Separate features and target variable
X = df.drop(columns=["Target"])  # Features
y = df["Target"]  # Target variable

# Split BEFORE scaling so the held-out rows never influence the scaler.
# The row partition depends only on the number of rows, test_size and
# random_state, so it is the same partition a scale-then-split would produce.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same transform to
# the test rows. Fitting on the full dataset would leak test-set mean/variance
# into preprocessing and into the scaler that is later persisted.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the train-fitted scaler; kept so that any
# code relying on the name `X_scaled` keeps working.
X_scaled = scaler.transform(X)

In [17]:
# Fit a 100-tree random forest on the training split (seeded for repeatability)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the fitted model on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print(classification_report(y_true=y_test, y_pred=y_pred))

# Persist the fitted model and the scaler so they can be reloaded later
joblib.dump(value=model, filename="model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")

print("Model training complete and saved successfully!")

Model Accuracy: 0.76
              precision    recall  f1-score   support

     Dropout       0.84      0.77      0.81       316
    Enrolled       0.49      0.29      0.37       151
    Graduate       0.76      0.92      0.83       418

    accuracy                           0.76       885
   macro avg       0.70      0.66      0.67       885
weighted avg       0.74      0.76      0.74       885

Model training complete and saved successfully!
