# Week 3 – Exploratory Data Analysis (EDA)
Dataset: **Student Performance (messy custom CSV)**

This notebook demonstrates data **loading**, **cleaning**, and **EDA** with visualizations.
It also computes basic statistics such as variance and correlation on the numeric columns (using pandas, which is built on **NumPy**).


In [None]:

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Project locations: raw CSV lives under ./data, figures are written to ./plots
BASE = Path(".")
DATA = BASE.joinpath("data", "students_raw.csv")
PLOTS = BASE.joinpath("plots")

# Create the plots folder up front so later savefig calls have a target
PLOTS.mkdir(parents=True, exist_ok=True)

print("Data path:", DATA.resolve())


In [None]:

# Read the raw CSV into a DataFrame and report its (rows, columns) shape
df = pd.read_csv(filepath_or_buffer=DATA)
raw_shape = df.shape
print("Shape:", raw_shape)

# Last expression of the cell: preview the first rows
df.head()


In [None]:

# Basic info: dtype of every column, then the count of missing cells per column
column_dtypes = df.dtypes
missing_per_column = df.isna().sum()

print("\nColumns & dtypes:\n", column_dtypes)
print("\nMissing values per column:\n", missing_per_column)


## Cleaning
- Standardize categorical values (Gender, City, Passed)
- Remove duplicates
- Fill missing values (median for numeric, mode for categorical)
- Derive `AverageScore` and fix missing `Passed`

In [None]:

def standardize_categories(df):
    """Return a copy of *df* with Gender, City and Passed values normalized.

    - Gender: whitespace is stripped and m/male/f/female (any casing) become
      "Male"/"Female"; unrecognized values are kept (stripped) as-is.
    - City: stripped, title-cased, and "Bangalore" is renamed to "Bengaluru".
    - Passed: stripped, title-cased, and Y/Yes/N/No become "Yes"/"No";
      unrecognized values are kept in title case.

    Missing values are left missing in every column. Columns that are not
    present are skipped. The input frame is not modified.
    """
    df = df.copy()

    def std_str(x):
        if pd.isna(x):
            return x
        return str(x).strip()

    def title_case(x):
        # Done element-wise instead of through the .str accessor, which raises
        # AttributeError on a column that is entirely missing (float dtype).
        if pd.isna(x):
            return x
        return std_str(x).title()

    if "Gender" in df.columns:
        # Keys are casefolded so the lookup is case-insensitive
        # ("m", "MALE", "Female", ... all resolve).
        gender_map = {"m": "Male", "male": "Male", "f": "Female", "female": "Female"}

        def std_gender(x):
            if pd.isna(x):
                return x
            cleaned = std_str(x)
            return gender_map.get(cleaned.casefold(), cleaned)

        df["Gender"] = df["Gender"].map(std_gender)

    if "City" in df.columns:
        df["City"] = df["City"].map(title_case).replace({"Bangalore": "Bengaluru"})

    if "Passed" in df.columns:
        # Values are already title-cased when looked up, so "y"/"YES"/"n" match.
        passed_map = {"Y": "Yes", "Yes": "Yes", "No": "No", "N": "No"}
        df["Passed"] = df["Passed"].map(title_case).map(lambda x: passed_map.get(x, x))
    return df

def fill_missing(df):
    """Return a copy of *df* with missing values imputed column by column.

    Numeric columns receive their median. Every other column receives its
    most frequent value, or the literal "Unknown" when the column has no
    non-missing value to take a mode from. Columns without gaps are untouched.
    """
    result = df.copy()
    numeric_names = set(result.select_dtypes(include=[np.number]).columns)

    for name in result.columns:
        column = result[name]
        if not column.isna().any():
            continue

        if name in numeric_names:
            replacement = column.median()
        else:
            modes = column.mode(dropna=True)
            # mode() is empty when every entry is missing
            replacement = modes.iloc[0] if len(modes) > 0 else "Unknown"

        result[name] = column.fillna(replacement)

    return result

def derive_features(df):
    """Return a copy of *df* with AverageScore added and Passed gaps filled.

    AverageScore is the row-wise mean of whichever of MathScore, ReadingScore
    and WritingScore are present. Rows whose Passed value is missing are set
    to "Yes" when AverageScore >= 40 and "No" otherwise. Both steps are
    skipped when the columns they need are absent. The input is not modified.
    """
    df = df.copy()
    score_cols = [c for c in ["MathScore", "ReadingScore", "WritingScore"] if c in df.columns]
    if score_cols:
        df["AverageScore"] = df[score_cols].mean(axis=1)

    # Requires AverageScore: without any score column there is nothing to
    # derive Passed from, so the gaps are left as they are.
    if "Passed" in df.columns and "AverageScore" in df.columns:
        missing = df["Passed"].isna()
        if missing.any():
            # Evaluate the threshold only on the rows being filled so the
            # assigned values line up one-to-one with the selected rows.
            df.loc[missing, "Passed"] = np.where(
                df.loc[missing, "AverageScore"] >= 40, "Yes", "No"
            )
    return df

# Apply cleaning
# Standardize first, then drop duplicates: rows that differ only in
# whitespace/casing of a category only become identical after standardization.
df_std = df.pipe(standardize_categories).drop_duplicates()

df_clean = (df_std
            .pipe(fill_missing)
            .pipe(derive_features)
           )

# fill_missing mode-fills "Passed" before derive_features runs, so the
# score-based fallback inside derive_features never sees a gap. Re-derive the
# entries that were missing before imputation from AverageScore, using the
# same >= 40 threshold as derive_features.
if "Passed" in df_std.columns and "AverageScore" in df_clean.columns:
    passed_gaps = df_std["Passed"].isna()
    if passed_gaps.any():
        df_clean.loc[passed_gaps, "Passed"] = np.where(
            df_clean.loc[passed_gaps, "AverageScore"] >= 40, "Yes", "No"
        )

print("Cleaned shape:", df_clean.shape)
df_clean.head()


In [None]:

# Save cleaned dataset next to the raw file, reusing the DATA location from
# the setup cell instead of a second hard-coded relative path
CLEAN = DATA.with_name("students_clean.csv")
df_clean.to_csv(CLEAN, index=False)
print("Saved to", CLEAN)


## Exploratory Analysis

In [None]:

# Descriptive statistics as produced by DataFrame.describe() with default arguments
numeric_summary = df_clean.describe()
numeric_summary


In [None]:

# Top five most frequent values of every non-numeric column
num_cols = list(df_clean.select_dtypes(include=[np.number]).columns)
cat_cols = list(df_clean.select_dtypes(exclude=[np.number]).columns)

for col in cat_cols:
    top_values = df_clean[col].value_counts().head(5)
    print(f"\nColumn: {col}")
    print(top_values)


In [None]:

# Variance and correlation of the numeric columns (pandas var/corr;
# np.number is only used to select the columns)
numeric_df = df_clean.select_dtypes(include=[np.number])

sample_variance = numeric_df.var(ddof=1)
print("Variance (ddof=1):")
print(sample_variance)

correlations = numeric_df.corr()
print("\nCorrelation matrix:")
print(correlations)


## Visualizations

In [None]:

# Histogram of Age
fig, ax = plt.subplots()
df_clean["Age"].plot(kind="hist", bins=10, edgecolor="black", ax=ax)
ax.set_title("Histogram of Age")
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
# Write into the PLOTS directory created in the setup cell rather than a
# separately hard-coded "plots/..." string
fig.savefig(PLOTS / "hist_age.png", bbox_inches="tight")
plt.show()


In [None]:

# Bar chart of Gender
fig, ax = plt.subplots()
df_clean["Gender"].value_counts().plot(kind="bar", ax=ax)
ax.set_title("Gender Distribution")
ax.set_xlabel("Gender")
ax.set_ylabel("Count")
# Write into the PLOTS directory created in the setup cell rather than a
# separately hard-coded "plots/..." string
fig.savefig(PLOTS / "bar_gender.png", bbox_inches="tight")
plt.show()


In [None]:

# Scatter: StudyHours vs AverageScore
fig, ax = plt.subplots()
ax.scatter(df_clean["StudyHours"], df_clean["AverageScore"])
ax.set_title("Study Hours vs Average Score")
ax.set_xlabel("Study Hours")
ax.set_ylabel("Average Score")
# Write into the PLOTS directory created in the setup cell rather than a
# separately hard-coded "plots/..." string
fig.savefig(PLOTS / "scatter_study_vs_avg.png", bbox_inches="tight")
plt.show()
