# 🔍 Univariate Exploratory Data Analysis

**Author:** [Your name]

**Date:** January 5, 2026

**Objective:** Analyze individual variables (distributions, outliers, patterns).

In [None]:
# Import libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Plot styling: seaborn 'whitegrid' theme plus a default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the CSV into a DataFrame and report its (rows, columns) shape
df = pd.read_csv('../data/StudentsPerformance.csv')
print(f'Dataset shape: {df.shape}')

## 1. Categorical Variables Analysis

In [None]:
# Columns treated as categorical in this section
categorical_cols = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

print('=== CATEGORICAL VARIABLES ===')
for col in categorical_cols:
    # value_counts() sorts by descending frequency, so position 0 is the
    # largest category; compute it once and reuse it for every line below.
    freq = df[col].value_counts()
    top_count = freq.iloc[0]
    top_share = top_count / len(df) * 100

    print(f'\n{col.upper()}:')
    print(freq)
    print(f'Most common: {df[col].mode()[0]} ({top_count} students, {top_share:.1f}%)')

In [None]:
# Visualize categorical distributions: one bar chart per column in
# `categorical_cols`, each bar annotated with its count and share of rows.
# The grid is sized from the number of columns instead of assuming five.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(categorical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(6 * n_grid_cols, 5 * n_grid_rows),
                         squeeze=False)
axes = axes.flatten()
total_rows = len(df)

for ax, col in zip(axes, categorical_cols):
    counts = df[col].value_counts()
    positions = range(len(counts))

    ax.bar(positions, counts.values, color='steelblue', alpha=0.7)
    ax.set_xticks(positions)
    ax.set_xticklabels(counts.index, rotation=45, ha='right')
    ax.set_title(f'Distribution of {col}')
    ax.set_ylabel('Count')
    ax.grid(axis='y', alpha=0.3)
    # Leave headroom above the tallest bar so its two-line label stays
    # inside the axes instead of running into the title.
    ax.margins(y=0.2)

    # Add value labels. The gap above each bar is given in points rather than
    # data units, so it looks the same whatever the dataset size.
    for j, v in enumerate(counts.values):
        ax.annotate(f'{v}\n({v/total_rows*100:.1f}%)',
                    xy=(j, v), xytext=(0, 3), textcoords='offset points',
                    ha='center', va='bottom', fontsize=9)

# Remove every grid cell that did not receive a chart (not just index 5).
for unused_ax in axes[len(categorical_cols):]:
    fig.delaxes(unused_ax)

# savefig does not create missing directories; make sure the target exists.
Path('../reports').mkdir(parents=True, exist_ok=True)
plt.tight_layout()
plt.savefig('../reports/categorical_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

## 2. Numerical Variables (Scores) Analysis

In [None]:
# Columns holding the three scores analysed below
score_cols = ['math score', 'reading score', 'writing score']

print('=== SCORE STATISTICS ===')
print(df[score_cols].describe())

print('\n=== DETAILED STATISTICS ===')
for col in score_cols:
    scores = df[col]
    lowest, highest = scores.min(), scores.max()

    print(f'\n{col.upper()}:')
    print(f'  Mean: {scores.mean():.2f}')
    print(f'  Median: {scores.median():.2f}')
    print(f'  Std Dev: {scores.std():.2f}')
    print(f'  Min: {lowest}')
    print(f'  Max: {highest}')
    print(f'  Range: {highest - lowest}')

In [None]:
# Score distributions with mean/median lines: one histogram per column in
# `score_cols`. The number of panels follows len(score_cols).
fig, axes = plt.subplots(1, len(score_cols),
                         figsize=(6 * len(score_cols), 5), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, score_cols):
    # Drop missing values up front: hist() cannot bin NaN. mean()/median()
    # already skip NaN, so the reference lines are unaffected.
    scores = df[col].dropna()
    mean_score = scores.mean()
    median_score = scores.median()

    ax.hist(scores, bins=20, edgecolor='black', alpha=0.7, color='skyblue')
    ax.axvline(mean_score, color='red', linestyle='--', linewidth=2,
               label=f'Mean: {mean_score:.1f}')
    ax.axvline(median_score, color='green', linestyle='--', linewidth=2,
               label=f'Median: {median_score:.1f}')
    ax.set_title(f'{col.title()} Distribution', fontsize=14, fontweight='bold')
    ax.set_xlabel('Score', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.legend(fontsize=10)
    ax.grid(axis='y', alpha=0.3)

# savefig does not create missing directories; make sure the target exists.
Path('../reports').mkdir(parents=True, exist_ok=True)
plt.tight_layout()
plt.savefig('../reports/score_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Outlier Detection

In [None]:
# Boxplots for outlier visualization: one vertical boxplot per column in
# `score_cols`. The number of panels follows len(score_cols).
fig, axes = plt.subplots(1, len(score_cols),
                         figsize=(6 * len(score_cols), 5), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, score_cols):
    # Vertical orientation is the boxplot default, so the `vert=True` argument
    # (deprecated in recent Matplotlib in favour of `orientation`) is omitted.
    # NaN values are dropped so a missing score cannot blank out the box.
    bp = ax.boxplot(df[col].dropna(), patch_artist=True)
    bp['boxes'][0].set_facecolor('lightblue')
    ax.set_title(f'{col.title()} - Boxplot', fontsize=14, fontweight='bold')
    ax.set_ylabel('Score', fontsize=12)
    ax.grid(axis='y', alpha=0.3)

# savefig does not create missing directories; make sure the target exists.
Path('../reports').mkdir(parents=True, exist_ok=True)
plt.tight_layout()
plt.savefig('../reports/score_boxplots.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# IQR method for outlier detection: a row is flagged when its score lies
# more than 1.5 * IQR below the first quartile or above the third quartile.
print('=== OUTLIER DETECTION (IQR METHOD) ===')
for col in score_cols:
    scores = df[col]
    Q1, Q3 = scores.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Strict comparisons: values exactly on a bound are not outliers.
    is_outlier = scores.lt(lower_bound) | scores.gt(upper_bound)
    outliers = df.loc[is_outlier]
    n_outliers = len(outliers)

    report_lines = [
        f'\n{col.upper()}:',
        f'  Q1: {Q1:.2f}',
        f'  Q3: {Q3:.2f}',
        f'  IQR: {IQR:.2f}',
        f'  Lower bound: {lower_bound:.2f}',
        f'  Upper bound: {upper_bound:.2f}',
        f'  Number of outliers: {n_outliers} ({n_outliers/len(df)*100:.2f}%)',
    ]
    print('\n'.join(report_lines))

## 📌 Key Findings

### Categorical Variables:
- **Gender**: Nearly balanced distribution
- **Race/Ethnicity**: 5 groups (A to E) with varying representation
- **Parental Education**: Ranges from 'some high school' to 'master's degree'
- **Lunch**: ~35% receive free/reduced lunch (socio-economic indicator)
- **Test Prep**: ~65% did NOT complete test preparation course

### Numerical Variables (Scores):
- All three scores follow **approximately normal distributions**
- **Mean scores**: Math ~66, Reading ~69, Writing ~68
- Math scores tend to be **slightly lower** than reading/writing
- **Minimal outliers** detected (< 2% for each score)
- Full range from 0 to 100 observed
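- No missing values or data quality issues are apparent from the summary statistics above (this notebook does not run an explicit null check such as `df.isna().sum()`)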

### Implications for Modeling:
- Clean dataset, ready for modeling
- May need to encode categorical variables
- Consider investigating why math scores are lower
- Test prep completion could be a significant predictor