<a href="https://colab.research.google.com/github/2303A51621/2303A51621-batch-22/blob/main/shashank%20code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# edu_inequality_pipeline.py
# Comprehensive pipeline for “Educational Inequality” analysis + ML
# - EDA: correlation heatmap, funding distribution by school_type
# - ML: RandomForest to predict dropout risk (low/medium/high)
# - Outputs: fig1.png, fig2.png, fig3.png + metrics/report CSVs

import os
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from sklearn.ensemble import RandomForestClassifier

# -------------------------------------------------------
# 0) Paths (edit if running locally)
# -------------------------------------------------------
ZIP_PATH = "/content/archive (1).zip"  # change or skip if already extracted
EXTRACT_DIR = "extracted_dataset"
DATA_PATH = os.path.join(EXTRACT_DIR, "education_inequality_data.csv")

FIG1 = "fig1.png"  # correlation heatmap
FIG2 = "fig2.png"  # funding distribution by school type (boxplot)
FIG3 = "fig3.png"  # model performance bar chart

REPORT_TXT = "classification_report.txt"
CM_CSV = "confusion_matrix.csv"
FEATIMP_CSV = "feature_importances.csv"

# -------------------------------------------------------
# 1) Extract (if needed) and load dataset
# -------------------------------------------------------
if not os.path.exists(DATA_PATH):
    if os.path.exists(ZIP_PATH):
        os.makedirs(EXTRACT_DIR, exist_ok=True)
        with zipfile.ZipFile(ZIP_PATH, "r") as z:
            z.extractall(EXTRACT_DIR)
    else:
        raise FileNotFoundError(
            f"Dataset not found at {DATA_PATH} and ZIP not present at {ZIP_PATH}."
        )

df = pd.read_csv(DATA_PATH)

# -------------------------------------------------------
# 2) EDA: Correlation heatmap of numeric features
# -------------------------------------------------------
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
corr = df[numeric_cols].corr()

plt.figure(figsize=(10, 6))
plt.imshow(corr, interpolation="nearest")  # no explicit colormap per rules
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.title("Correlation Heatmap of Numeric Features")
plt.colorbar(label="Correlation")
plt.tight_layout()
plt.savefig(FIG1, dpi=300, bbox_inches="tight")
plt.close()

# -------------------------------------------------------
# 3) EDA: Funding distribution by school type (boxplot)
# -------------------------------------------------------
if "school_type" in df.columns and "funding_per_student_usd" in df.columns:
    grouped = [
        df.loc[df["school_type"] == cat, "funding_per_student_usd"].dropna().values
        for cat in df["school_type"].dropna().unique()
    ]
    labels = df["school_type"].dropna().unique().tolist()

    plt.figure(figsize=(8, 5))
    plt.boxplot(grouped, labels=labels, showmeans=True)
    plt.title("Funding per Student across School Types")
    plt.ylabel("Funding per Student (USD)")
    plt.xlabel("School Type")
    plt.tight_layout()
    plt.savefig(FIG2, dpi=300, bbox_inches="tight")
    plt.close()

# -------------------------------------------------------
# 4) Create target labels: dropout risk categories (quantile-based)
# -------------------------------------------------------
drop_col = "dropout_rate_percent"
if drop_col not in df.columns:
    raise KeyError("Expected column 'dropout_rate_percent' not found in dataset.")

df = df.dropna(subset=[drop_col]).copy()
df["dropout_risk_cat"] = pd.qcut(df[drop_col], q=3, labels=["low", "medium", "high"])

# -------------------------------------------------------
# 5) Feature engineering: numeric + categorical
# -------------------------------------------------------
exclude_cols = ["id", "school_name", "state", "dropout_rate_percent", "dropout_risk_cat"]
feature_cols = [c for c in df.columns if c not in exclude_cols]

X = df[feature_cols].copy()
y = df["dropout_risk_cat"].copy()

numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# -------------------------------------------------------
# 6) Split data
# -------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# -------------------------------------------------------
# 7) Model: Random Forest
# -------------------------------------------------------
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

pipe = Pipeline(steps=[("prep", preprocess), ("rf", rf)])
pipe.fit(X_train, y_train)

# -------------------------------------------------------
# 8) Evaluation
# -------------------------------------------------------
y_pred = pipe.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average="macro", zero_division=0)
rec = recall_score(y_test, y_pred, average="macro", zero_division=0)
f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)

print("Accuracy:", acc)
print("Precision (macro):", prec)
print("Recall (macro):", rec)
print("F1 (macro):", f1)
print()
print(classification_report(y_test, y_pred, zero_division=0))

# Save classification report
with open(REPORT_TXT, "w") as f:
    f.write(classification_report(y_test, y_pred, zero_division=0))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=["low", "medium", "high"])
cm_df = pd.DataFrame(
    cm,
    index=["true_low", "true_medium", "true_high"],
    columns=["pred_low", "pred_medium", "pred_high"]
)
cm_df.to_csv(CM_CSV, index=True)

# -------------------------------------------------------
# 9) Plot metrics bar chart (fig3)
# -------------------------------------------------------
metrics = {
    "accuracy": acc,
    "precision_macro": prec,
    "recall_macro": rec,
    "f1_macro": f1
}
plt.figure(figsize=(6, 4))
plt.bar(list(metrics.keys()), list(metrics.values()))
plt.ylabel("Score")
plt.title("Random Forest Performance (Test Set)")
plt.xticks(rotation=30)
plt.tight_layout()
plt.savefig(FIG3, dpi=300, bbox_inches="tight")
plt.close()

# -------------------------------------------------------
# 10) Feature importances export (CSV)
# -------------------------------------------------------
# Retrieve generated feature names from preprocessors
ohe = pipe.named_steps["prep"].named_transformers_["cat"]
cat_names = list(ohe.get_feature_names_out(categorical_features)) if categorical_features else []
all_feature_names = numeric_features + cat_names

importances = pipe.named_steps["rf"].feature_importances_
feat_imp_df = pd.DataFrame({"feature": all_feature_names, "importance": importances})
feat_imp_df = feat_imp_df.sort_values("importance", ascending=False)
feat_imp_df.to_csv(FEATIMP_CSV, index=False)

print("\nSaved:")
print("-", FIG1)
print("-", FIG2)
print("-", FIG3)
print("-", REPORT_TXT)
print("-", CM_CSV)
print("-", FEATIMP_CSV)


  plt.boxplot(grouped, labels=labels, showmeans=True)


Accuracy: 0.295
Precision (macro): 0.29515745692216283
Recall (macro): 0.2950399517563696
F1 (macro): 0.2950878880703442

              precision    recall  f1-score   support

        high       0.28      0.28      0.28        67
         low       0.30      0.30      0.30        67
      medium       0.30      0.30      0.30        66

    accuracy                           0.29       200
   macro avg       0.30      0.30      0.30       200
weighted avg       0.30      0.29      0.30       200


Saved:
- fig1.png
- fig2.png
- fig3.png
- classification_report.txt
- confusion_matrix.csv
- feature_importances.csv
