In [None]:
# Imports and settings
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Use seaborn's "whitegrid" style for the plots drawn in the cells below.
sns.set(style="whitegrid")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Student Exam Prediction
This notebook trains a **Random Forest** classifier to predict the exam `result` (0 = fail, 1 = pass) using `hours_studied` and `subject`.
Steps: load data, inspect & clean, encode categorical features, split into train/test, train the model, and evaluate performance with metrics and plots.

In [None]:
# Load dataset and show a sample.
# When `__file__` is defined (i.e. the code runs as a script) the CSV is
# resolved next to that file; otherwise (e.g. in a notebook kernel) it is
# resolved relative to the current working directory.
if '__file__' in globals():
    DATA_PATH = Path(__file__).parent / "dataset" / "students.csv"
else:
    DATA_PATH = Path("dataset") / "students.csv"
print("Loading:", DATA_PATH)

# Read the CSV into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(DATA_PATH)
print("Dataset shape:", df.shape)

# Last expression of the cell: rendered as a preview of the first rows.
df.head()

Dataset shape: (58, 4)
   student_name  hours_studied  result subject
0   Emma Wilson              1       0    Math
1  Noah Johnson              2       0    Math
2  Olivia Brown              3       0    Math
3    Liam Smith              4       0    Math
4     Ava Jones              5       1    Math


In [None]:
# Data-quality checks and basic EDA
null_counts = df.isnull().sum()          # per-column count of missing values
print("Missing values:\n", null_counts)

dup_mask = df.duplicated()               # True for rows repeating an earlier row
print("Duplicates:", dup_mask.sum())

# Drop repeated rows (keeping the first occurrence) and renumber the index
if dup_mask.any():
    df = df.drop_duplicates().reset_index(drop=True)

print('\nSummary statistics:')
summary_stats = df.describe()
print(summary_stats)

# Bar chart of row counts per subject, ordered from most to least frequent
subject_order = df['subject'].value_counts().index
plt.figure(figsize=(8,4))
sns.countplot(data=df, x='subject', order=subject_order)
plt.title('Count by subject')
plt.show()

No null values or duplicate rows were found.


Unnamed: 0,hours_studied,result
count,58.0,58.0
mean,5.913793,0.62069
std,2.780158,0.489453
min,1.0,0.0
25%,4.0,0.0
50%,6.0,1.0
75%,8.0,1.0
max,12.0,1.0


In [None]:
# Preprocessing: build the feature matrix X and the target vector y
feature_cols = ['hours_studied', 'subject']
X = pd.get_dummies(
    df[feature_cols].copy(),   # work on a copy so df itself is left untouched
    columns=['subject'],       # one-hot encode the categorical column
    drop_first=True,           # drop one dummy level to avoid multicollinearity
)
y = df['result']

# Show the encoded column names, then preview the first rows of X
print("Feature columns:\n", list(X.columns))
X.head()

subject
Math         15
Physics       9
Chemistry     7
Biology       7
History       7
Geography     7
English       6
Name: count, dtype: int64

In [None]:
# Train/test split: hold out 20% of the rows for testing. Stratifying on y
# keeps the class balance the same in both splits; random_state fixes the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the Random Forest on the training split; this fitted `clf` is the model
# used by the evaluation and saving cells.
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

# 5-fold cross-validated accuracy, computed on the training split only so the
# held-out test set stays unseen. cross_val_score fits clones of the
# estimator, so the `clf` fitted above is not modified by this call.
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
# "\u00b1" is the plus-minus sign, written as an escape so it cannot be
# re-garbled by an encoding mismatch (the previous literal was mojibake).
print(f"CV accuracy (5-fold) mean: {cv_scores.mean():.3f} \u00b1 {cv_scores.std():.3f}")

[ 0  5 34 13 44 52 37 25 46 12 48  3 36 31  8 17  6  4 41 19 30 49 50 54
 15  9 27 26 16 24 33 55 40 11 32 56 43 29 53  1 21  2 45 39 35 23 47 10
 22 18 57 20  7 42 14 28 51 38]


In [None]:
# Evaluation on the held-out test set
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
report_text = classification_report(y_test, y_pred)
print(report_text)

# Confusion matrix heatmap (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
class_ticks = [0,1]
plt.figure(figsize=(5,4))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,   # write the count inside each cell
    fmt='d',      # format the counts as integers
    cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

# Feature importances of the fitted forest, largest first
importances = pd.Series(clf.feature_importances_, index=X.columns)
feat_imp = importances.sort_values(ascending=False)
plt.figure(figsize=(6,4))
sns.barplot(x=feat_imp.values, y=feat_imp.index)
plt.title('Feature importances')
plt.show()

In [None]:
# Save trained model (optional)
import joblib

# `model_path` is the output directory; create it if it does not exist yet.
model_path = Path("models")
model_path.mkdir(exist_ok=True)

# Build the destination file path once and reuse it for the dump and the message.
model_file = model_path / "student_exam_rf.pkl"
joblib.dump(clf, model_file)
print("Saved model to", model_file)

student_name     Ella Walker
hours_studied              4
result                     0
subject            Geography
Name: 23, dtype: object