In [None]:
import pandas as pd

# Raw CSV is read straight from the project's GitHub repository (the URL tracks the `main` branch).
url = "https://raw.githubusercontent.com/Aniket-bit7/learning-analytics-ml-system/main/data/students_data.csv"

# Load the dataset into a DataFrame.
df = pd.read_csv(url)

# Preview the first rows.
df.head()

In [None]:
# List the distinct labels present in the Final_Result column.
df["Final_Result"].unique()

In [None]:
# Report the (rows, columns) dimensions of the loaded frame.
print("Shape:", df.shape)

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# Student_ID is an identifier with no predictive meaning and may introduce noise,
# so it is excluded from modeling.
# errors="ignore" keeps this cell safe to re-run: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns=["Student_ID"], errors="ignore")

In [None]:
# Count missing values per column -->
df.isnull().sum()

In [None]:
# Count fully duplicated rows (repeats of an earlier row) -->
df.duplicated().sum()

In [None]:
# Summary statistics produced by describe() with its default settings.
df.describe()

In [None]:
# visualization part -->
import matplotlib.pyplot as plt
import seaborn as sns

# One histogram per column that df.hist can plot, laid out on a single figure
hist_axes = df.hist(figsize=(12, 8))
plt.gcf().tight_layout()  # tidy the spacing of the figure df.hist just created
plt.show()

In [None]:
# Bar chart of how many rows fall into each Final_Result class
result_ax = sns.countplot(data=df, x="Final_Result")
result_ax.set_title("Final Result Distribution")
plt.show()

# Exact counts behind the chart
df["Final_Result"].value_counts()

In [None]:
# Spread of Attendance within each Final_Result class
sns.boxplot(data=df, y="Attendance", x="Final_Result")
plt.show()

# Handling Missing Values


In [None]:
# Missing-value counts before imputation -->
print(df.isnull().sum())

# Mean-impute the three columns handled here.
# The result is assigned back to the column rather than calling
# `df[col].fillna(..., inplace=True)`: that form operates on an intermediate
# Series, emits a FutureWarning on pandas 2.x, and silently leaves `df`
# unchanged under Copy-on-Write (the default from pandas 3.0).
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split further down — consider imputing from the training split only.
for col in ["Quiz2", "Time_Spent", "Attendance"]:
    df[col] = df[col].fillna(df[col].mean())

# Missing-value counts after imputation -->
print(df.isnull().sum())

# Outlier Handling

In [None]:
# IQR rule: values more than 1.5 * IQR outside the quartiles are treated as outliers
time_spent = df["Time_Spent"]
Q1, Q3 = time_spent.quantile(0.25), time_spent.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Cap (rather than drop) the values that fall outside the bounds
df["Time_Spent"] = time_spent.clip(lower=lower_bound, upper=upper_bound)

In [None]:
# Re-check the Time_Spent distribution now that extreme values have been capped
time_ax = sns.boxplot(x=df["Time_Spent"])
time_ax.set_title("Time_Spent After Outlier Handling")
plt.show()

# Encoding and visualization

In [None]:
# Encode the 'Final_Result' column to numerical values (Pass -> 1, Fail -> 0).
# Guarded so that re-running this cell does not map the already-encoded 0/1
# values a second time, which would turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(df["Final_Result"]):
    encoded_result = df["Final_Result"].map({"Pass": 1, "Fail": 0})
    # Any label outside the mapping would silently become NaN; fail loudly instead.
    if encoded_result.isna().any():
        unexpected = df.loc[encoded_result.isna(), "Final_Result"].unique()
        raise ValueError(f"Unexpected Final_Result labels: {unexpected!r}")
    df["Final_Result"] = encoded_result.astype(int)

# correlation heatmap over the numeric columns -->
plt.figure(figsize=(8, 6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm")
plt.show()

# Feature Engineering

In [None]:
# Quiz-based features. The Average_Quiz_Score and Quiz_Std cells previously
# appeared twice (copy-pasted); each feature is now computed exactly once.

# Total Quiz Score: sum of the three quiz columns -->
df["Total_Quiz_Score"] = df["Quiz1"] + df["Quiz2"] + df["Quiz3"]

# Average Quiz Score: total divided by the number of quizzes -->
df["Average_Quiz_Score"] = df["Total_Quiz_Score"] / 3

# Quiz_Consistency: row-wise standard deviation across the three quizzes
# (pandas default, i.e. sample standard deviation) -->
df["Quiz_Std"] = df[["Quiz1", "Quiz2", "Quiz3"]].std(axis=1)

In [None]:
# Engagement_Index: weighted blend of Time_Spent (0.5), Assignments (0.3)
# and Attendance (0.2) -->
time_component = 0.5 * df["Time_Spent"]
assignment_component = 0.3 * df["Assignments"]
attendance_component = 0.2 * df["Attendance"]
df["Engagement_Index"] = time_component + assignment_component + attendance_component

In [None]:
# Preview the engineered columns on the first few rows instead of dumping the whole frame.
df.head()

In [None]:
# Quiz_Percentage: total quiz score expressed as a percentage of 300
# (presumably three quizzes scored out of 100 each — TODO confirm) -->
max_total_quiz_score = 300
df["Quiz_Percentage"] = df["Total_Quiz_Score"] / max_total_quiz_score * 100

In [None]:
# Effort_Performance_Ratio: total quiz score per unit of (Time_Spent + 1) -->
# The +1 presumably guards against division by zero when Time_Spent is 0.
time_spent_plus_one = df["Time_Spent"] + 1
df["Effort_Performance_Ratio"] = df["Total_Quiz_Score"].div(time_spent_plus_one)

# Split Dataset

In [None]:
# Target vector: the Final_Result column
y = df["Final_Result"]
# Feature matrix: every remaining column
X = df.drop("Final_Result", axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions aligned across the splits, and the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Compare class proportions in the full data and in each split
for heading, labels in (
    ("Original distribution:\n", y),
    ("\nTraining distribution:\n", y_train),
    ("\nTest distribution:\n", y_test),
):
    print(heading, labels.value_counts(normalize=True))

# Scale Features

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance; it is fitted on the training split in the following cell.
scaler = StandardScaler()


In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# MODEL TRAINING


In [None]:
# Baseline classifier: logistic regression with a fixed seed, trained on the scaled training split
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X=X_train_scaled, y=y_train)

In [None]:
# Hard class predictions on the scaled test split -->
y_pred = log_reg.predict(X_test_scaled)

# Column 1 of predict_proba is the probability of the second class in
# log_reg.classes_ (label 1 when the target is encoded as 0/1) -->
class_probabilities = log_reg.predict_proba(X_test_scaled)
y_prob = class_probabilities[:, 1]

In [None]:
# Evaluation Metrics -->
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score
)

# Compute each metric first, then report them in a fixed order
baseline_cm = confusion_matrix(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
baseline_auc = roc_auc_score(y_test, y_prob)

print("Confusion Matrix:\n", baseline_cm)
print("\nClassification Report:\n", baseline_report)
print("ROC-AUC Score:", baseline_auc)

In [None]:
# Second model: same setup as the baseline, but class_weight="balanced" weights
# classes inversely to their frequency to counter class imbalance -->
log_reg_balanced = LogisticRegression(random_state=42, class_weight="balanced")

log_reg_balanced.fit(X=X_train_scaled, y=y_train)

In [None]:
# Evaluate the class-weighted model on the same scaled test split -->
y_pred_bal = log_reg_balanced.predict(X_test_scaled)
balanced_probabilities = log_reg_balanced.predict_proba(X_test_scaled)
y_prob_bal = balanced_probabilities[:, 1]

balanced_cm = confusion_matrix(y_test, y_pred_bal)
balanced_report = classification_report(y_test, y_pred_bal)
balanced_auc = roc_auc_score(y_test, y_prob_bal)

print("Confusion Matrix (Balanced):\n", balanced_cm)
print("\nClassification Report (Balanced):\n", balanced_report)
print("ROC-AUC Score (Balanced):", balanced_auc)

In [None]:
# Carry the class-weighted logistic regression forward as the final model.
final_model = log_reg_balanced