# 🧠 Student Risk Prediction + Tableau Export Notebook

In [None]:

# Student Risk Prediction + Tableau Export
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# --- Data preparation -------------------------------------------------------
# Read the raw exam records from disk.
df = pd.read_csv("../data/exams.csv")

# Add a synthetic attendance column: uniform draws in [0.6, 1.0), rounded to
# two decimals. The seed is fixed so the simulated values repeat across runs.
np.random.seed(42)
simulated_attendance = np.random.uniform(0.6, 1.0, size=len(df))
df["attendance_rate"] = simulated_attendance.round(2)

# Replace the categorical text columns below with numeric codes.
# NOTE: with its default settings OrdinalEncoder numbers the categories in
# sorted order of their labels, which is not necessarily a meaningful ranking
# (e.g. for education level).
ordinal_cols = [
    "parental level of education",
    "lunch",
    "test preparation course",
]
ord_map = OrdinalEncoder()
df[ordinal_cols] = ord_map.fit_transform(df[ordinal_cols])

# Expand gender and race/ethnicity into indicator columns, then swap the
# original text columns for the indicators (appended at the end of the frame).
gender_oh = pd.get_dummies(df["gender"], prefix="gender")
race_oh = pd.get_dummies(df["race/ethnicity"], prefix="race")
df = pd.concat(
    [df.drop(columns=["gender", "race/ethnicity"]), gender_oh, race_oh],
    axis=1,
)

# Label a student as at risk (1) when at least one of the three exam scores
# is below 60, otherwise 0.
df["at_risk"] = (
    df[["math score", "reading score", "writing score"]]
    .lt(60)
    .any(axis=1)
    .astype(int)
)

# Write the prepared table out for use in Tableau (row index omitted).
df.to_csv("../tableau_dataset.csv", index=False)
print("✅ Exported tableau_dataset.csv")

# Optional: Train/test split for modeling (you can comment out if not needed)
# `at_risk` is computed directly from the three exam scores, so those columns
# must not be used as features: keeping them leaks the label into the model
# and makes the reported accuracy/F1 meaningless.
score_cols = ["math score", "reading score", "writing score"]
X = df.drop(columns=["at_risk", *score_cols])
y = df["at_risk"]

# Stratify on the label so both splits keep the same at-risk proportion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Scale and model
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics influence training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Seed the forest so the metrics below are reproducible between runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Evaluate on the held-out split; F1 is computed for the positive class (1).
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"✅ Accuracy: {acc:.2f}, F1 Score: {f1:.2f}")
