# 🎓 Capstone Project: Predicting Student Performance Using Machine Learning

---

**Submitted for:** CACIITG Summer Analytics 2025

**Objective:** Predict whether a student will pass or fail based on various demographic and academic features.

In [None]:
# 📦 Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# 📥 Step 2: Load Dataset
# Read the raw CSV into `df` and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='StudentsPerformance.csv')
df.head(n=5)

##  Step 3: Exploratory Data Analysis (EDA)

In [None]:
# Print column dtypes and non-null counts, then display the number of
# missing values per column as the cell output.
df.info()
df.isna().sum()

In [None]:
# Histogram of the math scores with a KDE curve overlaid
fig, ax = plt.subplots()
sns.histplot(data=df, x='math score', kde=True, ax=ax)
ax.set_title('Distribution of Math Scores')
plt.show()

##  Step 4: Data Preprocessing
- Convert the average of the math, reading and writing scores into a binary label: **Pass (>=40)** or **Fail (<40)**

In [None]:
# Create the mean score column ('average') and the binary pass/fail label ('result')
PASS_MARK = 40  # minimum average score that counts as a pass

df['average'] = (df['math score'] + df['reading score'] + df['writing score']) / 3

# Vectorized comparison instead of a row-wise lambda: 1 = pass, 0 = fail.
# A missing average compares as False and is therefore labelled 0, as before.
df['result'] = (df['average'] >= PASS_MARK).astype('int64')

In [None]:
# Encode categorical columns
label_cols = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']

# Keep one fitted encoder per column. Re-fitting a single shared encoder on
# every column would discard all mappings except the last one, making it
# impossible to decode values or encode new data consistently.
encoders = {}
for col in label_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Backward-compatible alias: `le` previously ended up fitted on the last column.
le = encoders[label_cols[-1]]

# NOTE(review): integer codes imply an ordering between categories, which
# linear models and SVMs will take literally; consider one-hot encoding for
# the nominal columns.

In [None]:
# Select features and target
# The raw scores and 'average' are dropped because 'result' is derived from them.
X = df.drop(['math score', 'reading score', 'writing score', 'average', 'result'], axis=1)
y = df['result']

# Train-test split FIRST, so the held-out rows never influence preprocessing.
# Stratify on y to keep the pass/fail ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: fit the scaler on the training rows only, then apply the
# same transformation to the test rows (avoids test-set leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix scaled with the train-fitted scaler, kept so that any
# code relying on `X_scaled` continues to work.
X_scaled = scaler.transform(X)

## Step 5: Model Building & Evaluation

In [None]:
# Logistic Regression: fit on the training split, then score on the test split
lr = LogisticRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)

lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)

In [None]:
# Random Forest
# Fix the seed: the forest's bootstrap sampling and feature selection are
# random, so without it the reported accuracy changes from run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

In [None]:
# SVM: train a support vector classifier and report test-set metrics
svm = SVC()
svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)

print("SVM Accuracy:", svm_accuracy)
print(classification_report(y_test, y_pred_svm))

##  Conclusion:
- **Random Forest** gave the best accuracy on this dataset.
- Features like **test preparation course** and **parental education** were significant.
- This model can be used to guide academic interventions.

---
**Thanks to CACIITG for this enriching learning journey!** 🎉