# Exploratory Data Analysis

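This notebook explores the salary dataset: it loads the data from PostgreSQL, reviews summary statistics, checks for missing values, and visualizes the distribution of total compensation. It also generates an automated Sweetviz report and ends with an example SHAP summary plot for model interpretability.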

In [2]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'pandas'

In [None]:
# One-time seeding step: push the raw salary CSV into PostgreSQL.
# NOTE(review): labelled "run once" -- presumably re-running loads the file again;
# confirm the loader's behaviour before including this cell in a Run All.
from src.data_ingestion.database_operations import load_csv_to_postgres

RAW_CSV_PATH = "data/raw/Software_Salaries.csv"
load_csv_to_postgres(RAW_CSV_PATH)

In [None]:
# Pull the dataset out of PostgreSQL into a DataFrame for the rest of the notebook
from src.config.config import DatabaseConfig
from src.data_ingestion.data_loader import DataLoader

# Build the connection settings first, then hand them to the loader
db_config = DatabaseConfig()
loader = DataLoader(db_config)
df = loader.load_data()

# Preview the first rows to sanity-check the load
df.head()

In [None]:
# Descriptive statistics for every column (include="all" covers non-numeric columns too)
summary_stats = df.describe(include="all")
summary_stats

In [None]:
# Number of missing entries in each column
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Distribution plot for total compensation (histogram with a KDE overlay).
# An explicit Figure/Axes pair is used instead of pyplot's implicit "current axes",
# so the title and labels always land on this plot even if other figures are open.
fig, ax = plt.subplots()

# Missing values are dropped up front so only observed compensation is plotted
sns.histplot(df["total_compensation"].dropna(), kde=True, ax=ax)

# Replace the raw column name with a readable axis label; "Count" matches
# histplot's default statistic.
ax.set(
    title="Distribution of Total Compensation",
    xlabel="Total compensation",
    ylabel="Count",
)
plt.show()

In [None]:
# Automated EDA: let Sweetviz profile the whole frame and render the result to an HTML file
import sweetviz as sv

SWEETVIZ_REPORT_PATH = "sweetviz_report.html"

report = sv.analyze(df)
report.show_html(SWEETVIZ_REPORT_PATH)

In [None]:
# SHAP summary plot for model interpretability (example, after model training)
import shap
import xgboost as xgb

TARGET_COLUMN = "total_compensation"
RANDOM_SEED = 42  # fixed so the demonstration model is reproducible across runs

# Example: fit a simple model for demonstration.
# Rows without a target are removed first: XGBoost refuses to train on NaN labels.
model_df = df.dropna(subset=[TARGET_COLUMN])
y = model_df[TARGET_COLUMN]

# Keep numeric features only: XGBoost rejects object/string columns.
# NOTE(review): categorical columns would need explicit encoding before they
# can be included here -- out of scope for this quick demonstration.
X = model_df.drop(columns=[TARGET_COLUMN]).select_dtypes(include="number")

model = xgb.XGBRegressor(random_state=RANDOM_SEED).fit(X, y)

# Explain the fitted model on the same feature matrix it was trained on
explainer = shap.Explainer(model, X)
shap_values = explainer(X)
shap.summary_plot(shap_values, X)