In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import json
from typing import Dict, Any, List
from google.colab import files

In [None]:
# Ask for a file through Colab's upload widget; the result is used below as a
# mapping keyed by the uploaded file names.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty or cancelled upload only shows up as a bare
    # IndexError on the first-key lookup below.
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")

# Only the first uploaded file is read.
file_name = list(uploaded.keys())[0]
df = pd.read_csv(file_name)
df.head()

Saving xAPI-Edu-Data.csv to xAPI-Edu-Data.csv


Unnamed: 0,StageID,GradeID,SectionID,Topic,Semester,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,StudentAbsenceDays,Class
0,lowerlevel,G-04,A,IT,F,15,16,2,20,Yes,Under-7,M
1,lowerlevel,G-04,A,IT,F,20,20,3,25,Yes,Under-7,M
2,lowerlevel,G-04,A,IT,F,10,7,0,30,No,Above-7,L
3,lowerlevel,G-04,A,IT,F,30,25,5,35,No,Above-7,L
4,lowerlevel,G-04,A,IT,F,40,50,12,50,No,Above-7,M


In [None]:
# Define the input (feature) columns and the output (target) column.
# NOTE(review): 'ParentAnsweringSurvey' appears in the loaded data but is not
# listed here - confirm the omission is intended.
label_cols = [
    'StageID',
    'GradeID',
    'SectionID',
    'Topic',
    'Semester',
    'raisedhands',
    'VisITedResources',
    'Discussion',
    'AnnouncementsView',
    'StudentAbsenceDays',
]
target_col = "Class"

# Work on copies so that encoding x / y later leaves df itself unchanged.
x = df.loc[:, label_cols].copy()
y = df.loc[:, target_col].copy()

In [None]:
# Integer-encode every non-numeric feature column so the model only receives
# numeric input.
for col in x.columns:
    # Test for "not numeric" rather than dtype == 'object': a text column held
    # in a string or category dtype would fail the 'object' comparison and be
    # left unencoded.
    if not pd.api.types.is_numeric_dtype(x[col]):
        x[col] = LabelEncoder().fit_transform(x[col])

# Encode the target labels as integers as well.
y = LabelEncoder().fit_transform(y)

In [None]:
# Hold out 25% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [None]:
# Number of distinct encoded target labels.
num_classes = np.unique(y).size

# Multi-class XGBoost classifier; it is only configured here, not yet fitted.
model = XGBClassifier(
    objective="multi:softprob",
    num_class=num_classes,
    eval_metric="mlogloss",
    n_estimators=200,
    max_depth=11,
    learning_rate=0.1,
    random_state=42,
    enable_categorical=False,
)

In [None]:
# Train the classifier on the training split (the fitted model is the cell output).
model.fit(X=x_train, y=y_train)

In [None]:
# Score the held-out rows and report the accuracy as a percentage.
y_pred = model.predict(x_test)
model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("The model Accuracy= ", 100 * model_accuracy)

The model Accuracy=  79.16666666666666


In [None]:
# Fit one LabelEncoder per non-numeric feature, plus one for the target, on the
# original (un-encoded) df. These are kept so that raw input values can be
# encoded, and predictions decoded, at inference time.
encoders = {}
categorical_feature_cols = []
numeric_feature_cols = []

for col in label_cols:
    # Classify by "numeric or not" rather than dtype == 'object': a text column
    # held in a string or category dtype would fail the 'object' comparison and
    # be wrongly treated as numeric.
    if pd.api.types.is_numeric_dtype(df[col]):
        numeric_feature_cols.append(col)
    else:
        le = LabelEncoder()
        le.fit(df[col])
        encoders[col] = le
        categorical_feature_cols.append(col)

# The target encoder is stored under the target column's own name.
target_le = LabelEncoder()
target_le.fit(df[target_col])
encoders[target_col] = target_le

# Bundle the already-trained `model` (it must have been fitted by an earlier
# cell) with the metadata needed to run predictions on raw records.
model_dict = {
    "model": model,  # the trained model, not a new instance
    "encoders": encoders,
    "target": target_col,
    "features": label_cols,  # all features used for training, in training order
    "categorical_features": categorical_feature_cols,
    "numeric_features": numeric_feature_cols
}

def _prediction_error(message: str) -> Dict[str, Any]:
    """Build the error payload returned by predict_student_performance."""
    return {"status": "error", "message": message}


def predict_student_performance(model_dict: Dict[str, Any], input_data: Dict[str, Any]) -> Dict[str, Any]:
    """Predict the performance class for a single student record.

    Args:
        model_dict: Bundle with the keys "model" (an object exposing
            ``predict`` and ``predict_proba``), "encoders" (fitted encoders
            keyed by column name, including the target column), "target",
            "features", "categorical_features" and, optionally,
            "numeric_features".
        input_data: Mapping of feature name to raw (un-encoded) value for one
            student. Every name in "features" must be present.

    Returns:
        On success ``{"status": "success", "predicted_class": <label>,
        "probabilities": {<label>: <float>, ...}}``; otherwise
        ``{"status": "error", "message": <str>}``. Failures are reported
        through the returned dict; exceptions are not propagated.
    """
    try:
        # 1. Every training feature must be supplied.
        required_features = model_dict["features"]
        missing_features = [f for f in required_features if f not in input_data]

        if missing_features:
            return _prediction_error(
                f"Missing required features in input data: {', '.join(missing_features)}. Please provide values for all features: {', '.join(required_features)}"
            )

        # 2. Build a one-row frame from the raw record.
        input_df = pd.DataFrame([input_data])

        # Encode categorical features with the stored encoders.
        encoders = model_dict["encoders"]
        for col in model_dict["categorical_features"]:
            if col not in input_df.columns or col not in encoders:
                continue
            le = encoders[col]
            value = input_df[col].iloc[0]
            # Compare against a plain list so unusual values (None, NaN, ...)
            # simply count as "unknown".
            known_categories = list(le.classes_)
            if value not in known_categories:
                # str() each category: join() would raise on non-string classes.
                return _prediction_error(
                    f"Unknown category '{value}' for feature '{col}'. Known categories are: {', '.join(map(str, known_categories))}"
                )
            input_df[col] = le.transform(input_df[col])

        # Coerce numeric features so that e.g. "70" is accepted, and a value
        # that cannot be parsed is reported by name instead of surfacing as an
        # opaque model error.
        for col in model_dict.get("numeric_features", []):
            if col not in input_df.columns:
                continue
            try:
                input_df[col] = pd.to_numeric(input_df[col])
            except (ValueError, TypeError):
                return _prediction_error(
                    f"Feature '{col}' must be numeric, got {input_df[col].iloc[0]!r}."
                )

        # Keep only the training features, in training order.
        processed_input = input_df[required_features]

        # 3. Predict probabilities and class.
        model_obj = model_dict["model"]
        probs_array = model_obj.predict_proba(processed_input)[0]
        pred_class_encoded = model_obj.predict(processed_input)[0]

        # 4. Decode the predicted class and label the probabilities.
        target_encoder = model_dict["encoders"][model_dict["target"]]
        decoded_predicted_class = target_encoder.inverse_transform([pred_class_encoded])[0]

        # Probabilities are matched to class names by position, so the two
        # sequences must have the same length.
        class_names = target_encoder.classes_
        if len(probs_array) != len(class_names):
            return _prediction_error(
                f"Model returned {len(probs_array)} probabilities but the target encoder knows {len(class_names)} classes."
            )
        probabilities = {name: float(prob) for name, prob in zip(class_names, probs_array)}

        return {
            "status": "success",
            "predicted_class": decoded_predicted_class,
            "probabilities": probabilities
        }

    except Exception as e:
        # Last-resort guard: callers always get a result dict back.
        return _prediction_error(f"An unexpected error occurred: {str(e)}")

# Example usage: one hand-written student record. The categorical values must
# be categories that occur in the original DataFrame, otherwise the call
# returns an "Unknown category" error.
example = dict(
    StageID="HighSchool",
    GradeID="G-12",
    SectionID="A",
    Topic="Math",
    Semester="F",
    raisedhands=70,
    VisITedResources=70,
    Discussion=30,
    AnnouncementsView=60,
    StudentAbsenceDays="Under-7",
)

print("\nPrediction Example:")
print(predict_student_performance(model_dict, example))


Prediction Example:
{'status': 'success', 'predicted_class': 'H', 'probabilities': {'H': 0.9281163811683655, 'L': 0.001644236035645008, 'M': 0.07023946195840836}}
