# Final Project Team 4 – Student Performance Dataset

## 1. Data Cleaning
- Handle missing values
- Encode categorical variables
- Drop irrelevant columns

In [1]:
# --- Data Cleaning for Student Performance (UCI) ---
#
# Pipeline: load the two raw course CSVs, tag each with its course, stack
# them, coerce the grade columns to numeric, drop rows without a final grade
# (G3), drop exact duplicate rows, encode yes/no columns as 1/0, and write the
# result to data/student_clean.csv.

import warnings
from pathlib import Path

import pandas as pd

# Folder convention: keep raw files under ./data/raw
DATA_DIR = Path("data")
RAW_DIR = DATA_DIR / "raw"
# Creates the folder layout if it is missing; the CSVs themselves must already
# be present, otherwise read_csv below raises FileNotFoundError.
RAW_DIR.mkdir(parents=True, exist_ok=True)

# 1) Load raw CSVs (UCI files use semicolon separator)
mat = pd.read_csv(RAW_DIR / "student-mat.csv", sep=";")
por = pd.read_csv(RAW_DIR / "student-por.csv", sep=";")

# Optional: tag course to know origin
mat["course"] = "math"
por["course"] = "portuguese"

# 2) Save per-course copies as comma-separated files.
# NOTE: these are written after the `course` tag is added, so they are the raw
# data plus one extra column, not byte-identical copies of the files in
# data/raw.
mat.to_csv(DATA_DIR / "rawdata_student_mat.csv", index=False)
por.to_csv(DATA_DIR / "rawdata_student_por.csv", index=False)

# 3) Merge (stack) the two datasets
merged = pd.concat([mat, por], axis=0, ignore_index=True)

# 4) Basic type fixes: non-numeric grade entries become NaN (errors="coerce")
grade_cols = ["G1", "G2", "G3"]
for c in grade_cols:
    merged[c] = pd.to_numeric(merged[c], errors="coerce")

# 5) Handle missing values (simple policy: drop rows with no target G3).
# Only G3 is enforced here; G1/G2 values that failed the numeric coercion
# above stay in the data as NaN.
merged = merged.dropna(subset=["G3"])

# 6) Remove exact duplicate rows.
# Rows coming from different course files always differ in `course`, so this
# only removes duplicates within a single course file.
merged = merged.drop_duplicates()

# 7) (Optional) Normalize yes/no columns to 1/0 for modeling
yes_no_codes = {"yes": 1, "no": 0}
yn_cols = ["schoolsup", "famsup", "paid", "activities", "nursery", "higher", "internet", "romantic"]
for c in yn_cols:
    if c not in merged.columns:
        continue
    encoded = merged[c].map(yes_no_codes)
    # Series.map turns every value missing from the mapping into NaN. Surface
    # that instead of silently writing new missing values into the cleaned
    # file after the NA-handling step has already run.
    lost = encoded.isna() & merged[c].notna()
    if lost.any():
        unexpected = sorted(map(str, merged.loc[lost, c].unique()))
        warnings.warn(
            f"Column {c!r}: {int(lost.sum())} value(s) other than 'yes'/'no' "
            f"were encoded as NaN: {unexpected}"
        )
    merged[c] = encoded

# 8) Save cleaned dataset for EDA & modeling
CLEAN_PATH = DATA_DIR / "student_clean.csv"
merged.to_csv(CLEAN_PATH, index=False)

print(f"Cleaned dataset saved to: {CLEAN_PATH.resolve()}")
print(merged.shape)
merged.head()


Cleaned dataset saved to: /Users/gisselletosta/Final-Project-Team-4-AAI500/data/student_clean.csv
(1044, 34)


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,course
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,6,5,6,6,math
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,4,5,5,6,math
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,10,7,8,10,math
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,2,15,14,15,math
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,4,6,10,10,math


count    1044.000000
mean       11.341954
std         3.864796
min         0.000000
25%        10.000000
50%        11.000000
75%        14.000000
max        20.000000
Name: G3, dtype: float64

## 2. Exploratory Data Analysis (EDA)
- Summary statistics
- Visualizations (histograms, boxplots, scatterplots, heatmaps)

## 3. Models
- Baseline model
- Candidate models (logistic regression, random forest, SVM, etc.)
- Hyperparameter tuning

## 4. Results & Analysis
- Model performance metrics
- Comparisons
- Interpretation

## 5. Conclusion & Next Steps
- Key takeaways
- Limitations
- Recommendations