<a href="https://colab.research.google.com/github/ElayatNisrine/Front-End-Checklist/blob/master/student%20performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas numpy scikit-learn openpyxl tensorflow matplotlib seaborn

In [None]:
!pip install --upgrade scikit-learn

In [54]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, MultiHeadAttention, LayerNormalization
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import shap
from tensorflow.keras.models import load_model




In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load dataset from Google Drive
import pandas as pd
df = pd.read_csv("/content/drive/My Drive/Datasets/final dataset.csv")
print(df["Course Progress"].dtype)
print(df.head(10))

In [None]:
# 📌 Convert time-based features (HH:MM:SS) into total minutes
def convert_time_to_minutes(time_str):
    if isinstance(time_str, str):
        h, m, s = map(int, time_str.split(':'))
        return h * 60 + m + s / 60
    return 0  # Handle missing values

df["Course Time Spent"] = df["Course Time Spent"].apply(convert_time_to_minutes)
#df["Lesson Time Spent"] = df["Lesson Time Spent"].apply(convert_time_to_minutes)

# 📌 Convert Progress from % to Numeric (remove % and convert to float)
df["Course Progress"] = df["Course Progress"].str.replace("%", "").astype(float) / 100
#df["Lesson Progress"] = df["Lesson Progress"].str.replace("%", "").astype(float) / 100  # Scale to 0-1

# 📌 Convert test scores (e.g., "122/400") into numeric values
def extract_score(score_str):
    if isinstance(score_str, str):
        return int(score_str.split('/')[0])  # Extract only the actual score
    return 0  # Handle missing values

df["First Test Score"] = df["First Test Score"].apply(extract_score)
df["Last Test Score"] = df["Last Test Score"].apply(extract_score)
 # Adjusted for consistency
print(df.head(10))

In [51]:

# 📌 Count the number of activities per student-language pair
df["Activity Count"] = 1  # Each row represents an activity

# 📌 Aggregate data per student-language (NOT per course)
student_df = df.groupby(["Email", "Language of Study"]).agg({
    "Course Time Spent": "sum",  # Total learning time in minutes
    "Course Progress": "mean",   # Average progress across courses
    "First Test Score": "first", # Keep the first test score (Initial Level)
    "Last Test Score": "first", # Keep the final test score (Target Variable)
    "Activity Count": "count"    # Count total activities per student-language
}).reset_index()
# Count number of rows per student-language before aggregation
activity_check = df.groupby(["Email", "Language of Study"]).size().reset_index(name="Actual Activity Count")

# Display the first few rows
print(activity_check.head(10))
# 📌 Encode "Language of Study" (Convert categorical values to numeric)
encoder = LabelEncoder()
student_df["Language of Study"] = encoder.fit_transform(student_df["Language of Study"])

# 📌 Select features for deep learning
features = ["Language of Study", "Course Time Spent", "Course Progress", "First Test Score"]
target = ["Last Test Score"]

# 📌 Normalize data
scaler = MinMaxScaler()
student_df[features] = scaler.fit_transform(student_df[features])
student_df[target] = scaler.fit_transform(student_df[target])

# 📌 Reshape for LSTM (samples, time steps, features)
X = student_df[features].values.reshape(student_df.shape[0], 1, len(features))
y = student_df[target].values

# 📌 Split dataset into train (80%) and test (20%)
split = int(0.8 * len(student_df))
X_train, X_test, y_train, y_test = X[:split], X[split:], y[:split], y[split:]


print(student_df.head(10))


                                 Email   Language of Study  \
0          abbadi.maroua@etu.uae.ac.ma  English (American)   
1          abbadi.maroua@etu.uae.ac.ma              French   
2    abdallahzayd.drissi@etu.uae.ac.ma   English (British)   
3    abdallahzayd.drissi@etu.uae.ac.ma              French   
4  abdelali.hourmatallah@etu.uae.ac.ma  English (American)   
5  abdelali.hourmatallah@etu.uae.ac.ma              French   
6     abdelhakim.aourfat@etu.uae.ac.ma  English (American)   
7     abdelhakim.aourfat@etu.uae.ac.ma              French   
8    abdeljalil.serrakhi@etu.uae.ac.ma  English (American)   
9    abdeljalil.serrakhi@etu.uae.ac.ma              French   

   Actual Activity Count  
0                     50  
1                     17  
2                     25  
3                      1  
4                     65  
5                     68  
6                     45  
7                     14  
8                     97  
9                     36  
                    

In [55]:
# 📌 Define LSTM model
lstm_model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(1, len(features))),
    LSTM(64, return_sequences=False),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1)  # Output: Final Test Score Prediction
])

lstm_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# 📌 Train LSTM model and record time
start_time = time.time()
lstm_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)
# Save the trained model (optional)
lstm_model.save("trained_lstm_model.h5")
lstm_time = time.time() - start_time

# 📌 Predict using LSTM
y_pred_lstm = lstm_model.predict(X_test)

# 📌 Convert predictions back to original scale
y_pred_lstm = scaler.inverse_transform(y_pred_lstm)
y_actual = scaler.inverse_transform(y_test)

# 📌 Evaluate LSTM model
lstm_mae = mean_absolute_error(y_actual, y_pred_lstm)
lstm_mse = mean_squared_error(y_actual, y_pred_lstm)  # This returns MSE
lstm_rmse = np.sqrt(lstm_mse)
print(f"LSTM Model - MAE: {lstm_mae:.2f}, RMSE: {lstm_rmse:.2f}, Training Time: {lstm_time:.2f} seconds")


# 📌 Disable eager execution to avoid SHAP gradient errors
tf.compat.v1.disable_eager_execution()

# 📌 Load the trained model
lstm_model = load_model("trained_lstm_model.h5")

# 📌 Create SHAP DeepExplainer
explainer = shap.DeepExplainer(lstm_model, X_train)

# 📌 Compute SHAP values
shap_values = explainer.shap_values(X_test)

# 📌 Plot SHAP summary
shap.summary_plot(shap_values, X_test)

LookupError: gradient registry has no entry for: shap_DivNoNan