# 04 — EDA & Distributions
**Data Analysis Portfolio**

Topics: univariate analysis, histogram+KDE, boxplot, bivariate, correlation heatmap, skewness, normality test

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Seed NumPy's global RNG so the np.random draws that build the dataset are
# repeatable when the cells are run in order from a fresh kernel.
np.random.seed(42)
# Apply seaborn's 'whitegrid' style and 'muted' palette to the plots that follow.
sns.set_theme(style='whitegrid', palette='muted')
# Simple confirmation that the setup cell finished.
print("Ready.")

## 1. Dataset

In [None]:
# Build a synthetic employee table with `n` rows.
# Each column is drawn from NumPy's global RNG, so the draws are kept in a
# fixed order (age first, education last): reordering them would change the
# generated values for a given seed.
n = 500
columns = {}
# Normal(35, 8) ages, limited to 18..65 and stored as whole numbers.
columns['age'] = np.random.normal(35, 8, n).clip(18, 65).round().astype(int)
# Log-normal salaries, rounded and limited to 25,000..200,000.
columns['salary'] = np.random.lognormal(10.8, 0.4, n).round(0).clip(25000, 200000)
# Integer years of experience in 0..29 (randint's upper bound is exclusive).
columns['experience'] = np.random.randint(0, 30, n)
# Categorical columns, each sampled with explicit category probabilities.
columns['department'] = np.random.choice(
    ['IT', 'HR', 'Finance', 'Marketing', 'Operations'], n, p=[.30, .15, .20, .20, .15])
columns['gender'] = np.random.choice(['Male', 'Female'], n, p=[.55, .45])
columns['rating'] = np.random.choice([1, 2, 3, 4, 5], n, p=[.05, .10, .25, .40, .20])
columns['education'] = np.random.choice(
    ["Bachelor's", "Master's", "PhD"], n, p=[.55, .35, .10])
df = pd.DataFrame(columns)
# Summary statistics of the numeric columns, rounded to one decimal.
print(df.describe().round(1))

## 2. Univariate — Numerical

In [None]:
# One column of panels per numeric feature: a density histogram with a KDE
# curve on the top row and a horizontal boxplot on the bottom row.
num_cols = ['age','salary','experience']
fig, axes = plt.subplots(2, 3, figsize=(15, 9))
fig.suptitle('Univariate Analysis — Numerical', fontsize=14, fontweight='bold')
# axes.T yields (top, bottom) axis pairs, one pair per grid column.
for col, (hist_ax, box_ax) in zip(num_cols, axes.T):
    values = df[col]
    # density=True puts the histogram on the same scale as the KDE curve.
    hist_ax.hist(values, bins=30, color='steelblue', edgecolor='white', density=True, alpha=0.7)
    values.plot.kde(ax=hist_ax, color='red', linewidth=2)
    hist_ax.set_title(f'{col} — Histogram + KDE')
    box_ax.boxplot(values, vert=False, patch_artist=True, boxprops=dict(facecolor='lightblue'))
    box_ax.set_title(f'{col} — Boxplot')
plt.tight_layout()
plt.savefig('/home/claude/data_analysis_portfolio/notebooks/04_univariate_num.png', dpi=100)
plt.show()

In [None]:
# Print skewness and kurtosis (as computed by pandas) for each numeric column,
# tagging the skew direction using a ±0.5 threshold.
print("Skewness & Kurtosis:")
for col in ['age','salary','experience']:
    series = df[col]
    skewness, kurt = series.skew(), series.kurtosis()
    if skewness > 0.5:
        label = "Right-skewed"
    elif skewness < -0.5:
        label = "Left-skewed"
    else:
        label = "Symmetric"
    print(f"  {col:12}: skew={skewness:+.3f} ({label}), kurt={kurt:+.3f}")

## 3. Univariate — Categorical

In [None]:
# One bar chart of category frequencies per categorical column, each in its
# own colour, with the count written above every bar.
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
fig.suptitle('Univariate Analysis — Categorical', fontsize=14, fontweight='bold')
cat_cols = ['department','gender','education']
bar_colors = ['coral','steelblue','mediumseagreen']
for ax, col, clr in zip(axes, cat_cols, bar_colors):
    counts = df[col].value_counts()
    ax.bar(counts.index, counts.values, color=clr, edgecolor='white')
    ax.set_title(col)
    # Label each bar by its position in the bar order, 3 units above its top.
    for bar_pos, count in enumerate(counts.values):
        ax.text(bar_pos, count+3, str(count), ha='center', fontsize=9)
plt.tight_layout()
plt.savefig('/home/claude/data_analysis_portfolio/notebooks/04_univariate_cat.png', dpi=100)
plt.show()

## 4. Bivariate Analysis

In [None]:
# 2x2 figure relating salary to department, gender and experience, plus the
# distribution of ratings within each education level.
fig, axes = plt.subplots(2, 2, figsize=(13, 10))
fig.suptitle('Bivariate Analysis', fontsize=14, fontweight='bold')

# Salary by department — boxplots, ordered by descending median salary
dept_order = df.groupby('department')['salary'].median().sort_values(ascending=False).index.tolist()
for j, dept in enumerate(dept_order):
    axes[0,0].boxplot(df.loc[df['department']==dept, 'salary'], positions=[j], widths=0.6,
                      patch_artist=True, boxprops=dict(facecolor='lightcoral'))
axes[0,0].set_xticks(range(len(dept_order)))
axes[0,0].set_xticklabels(dept_order, rotation=20)
axes[0,0].set_title('Salary by Department')

# Salary by gender — overlaid density histograms.
# Both groups share one set of bin edges computed from the full salary column;
# with a per-group `bins=25` each histogram would get its own edges and the
# overlaid bars would not line up for comparison.
salary_bins = np.histogram_bin_edges(df['salary'], bins=25)
for gender, color in zip(['Male','Female'],['steelblue','salmon']):
    axes[0,1].hist(df.loc[df['gender']==gender, 'salary'], bins=salary_bins,
                   alpha=0.6, label=gender, color=color, density=True)
axes[0,1].set_title('Salary by Gender')
axes[0,1].legend()

# Experience vs Salary — scatter with a least-squares regression line
axes[1,0].scatter(df['experience'], df['salary'], alpha=0.3, color='teal', s=15)
fit = stats.linregress(df['experience'], df['salary'])
# Draw the fitted line over the observed experience range only, rather than a
# hard-coded interval that has to be kept in sync with the dataset cell.
x = np.linspace(df['experience'].min(), df['experience'].max(), 100)
axes[1,0].plot(x, fit.slope*x+fit.intercept, 'red', linewidth=2, label=f'r={fit.rvalue:.2f}')
axes[1,0].set_title('Experience vs Salary')
axes[1,0].legend()

# Rating by Education — grouped bars of rating counts per education level
# (named `rating_counts` rather than `re` to avoid shadowing the stdlib module name)
rating_counts = df.groupby(['education','rating']).size().unstack(fill_value=0)
rating_counts.plot(kind='bar', ax=axes[1,1], colormap='Set2', edgecolor='white')
axes[1,1].set_title('Rating by Education')
axes[1,1].tick_params(axis='x', rotation=15)

plt.tight_layout()
plt.savefig('/home/claude/data_analysis_portfolio/notebooks/04_bivariate.png', dpi=100)
plt.show()

## 5. Correlation Heatmap

In [None]:
# Pairwise correlations (DataFrame.corr with its default method) between the
# numeric columns, drawn as an annotated heatmap on a fixed -1..1 colour scale.
numeric_features = ['age','salary','experience','rating']
corr = df[numeric_features].corr()
fig, ax = plt.subplots(figsize=(7,5))
sns.heatmap(corr, ax=ax, annot=True, fmt='.2f', cmap='coolwarm',
            linewidths=0.5, square=True, vmin=-1, vmax=1)
ax.set_title('Correlation Matrix', fontsize=13, fontweight='bold')
fig.tight_layout()
fig.savefig('/home/claude/data_analysis_portfolio/notebooks/04_correlation.png', dpi=100)
plt.show()

## 6. Normality Test

In [None]:
from scipy.stats import shapiro
# Run Shapiro-Wilk on a fixed 200-row subsample of each numeric column and
# report the W statistic, the p-value, and a verdict at the 0.05 level.
print("Shapiro-Wilk Normality Test:")
for col in ['age','salary','experience']:
    # random_state pins the subsample so the reported values are repeatable.
    subsample = df[col].sample(200, random_state=42)
    stat, p = shapiro(subsample)
    verdict = 'Normal' if p>0.05 else 'Not Normal'
    print(f"  {col:12}: W={stat:.4f}, p={p:.4f} → {verdict}")