# Exploratory Data Analysis (EDA)
This notebook is self-contained: if `data.csv` is not found in the working directory, a small sample `data.csv` is created automatically.

In [None]:
# =======================
# 1. Import Libraries
# =======================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Apply the "whitegrid" theme to all plots drawn below. set_theme() is the
# preferred interface; sns.set() is only an alias for it that seaborn's
# documentation says may be removed in the future.
sns.set_theme(style="whitegrid")

In [None]:
# =======================
# 2. Load Dataset (with fallback)
# =======================
# First run only: seed a small sample file so the read below always has
# something to load. One Age value is left empty on purpose so the sample
# contains a missing value.
if not os.path.exists("data.csv"):
    data = dict(
        Name=["Abeer", "Qadeer", "Umer", "Sehrish", "Khani", "Qadeer"],
        Age=[25, 30, 28, 26, 35, None],
        Salary=[50000, 60000, 55000, 65000, 70000, 60000],
        Performance=[80, 85, 75, 90, 95, 88],
    )
    pd.DataFrame(data).to_csv("data.csv", index=False)
    print("Sample data.csv created!")

# Always load from disk, whether the file already existed or was just written.
df = pd.read_csv("data.csv")
df.head()

In [None]:
# =======================
# 3. First Look at Data
# =======================
# Structural overview: dimensions, column names, per-column info and
# summary statistics.
print("Shape of dataset:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nBasic Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nSummary Statistics:")
# include="all" summarizes non-numeric columns as well as numeric ones.
print(df.describe(include="all"))

In [None]:
# =======================
# 4. Missing Values Check
# =======================
# Count of missing entries per column. The header and the Series are printed
# separately: passing both to one print() call inserts the default " "
# separator after the newline, which shifts the first row out of alignment.
print("\nMissing Values:")
print(df.isnull().sum())

In [None]:
# =======================
# 5. Univariate Analysis
# =======================
# Histograms of every numeric column. DataFrame.hist raises ValueError when
# there is nothing it can plot, so check first using the same column
# selection that DataFrame.hist applies (numeric and datetime columns,
# excluding timedeltas).
hist_cols = df.select_dtypes(
    include=[np.number, "datetime64", "datetimetz"], exclude="timedelta"
).columns
if len(hist_cols) > 0:
    df.hist(figsize=(10, 8), bins=20, edgecolor="black")
    plt.suptitle("Histograms of Numerical Features")
    plt.show()
else:
    print("No numerical columns for histograms.")

# One count plot per text (object-dtype) column, each in its own figure.
for col in df.select_dtypes(include=['object']).columns:
    plt.figure(figsize=(6, 4))
    # NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is
    # given without `hue`; the suggested replacement (hue=col, legend=False)
    # needs seaborn >= 0.13, so the call is left unchanged here.
    sns.countplot(data=df, x=col, palette="Set2")
    plt.title(f"Countplot of {col}")
    # Rotate the category labels so longer values do not overlap.
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# =======================
# 6. Bivariate Analysis
# =======================
# Numeric columns only; used by the heatmap and the box plots below.
numeric_df = df.select_dtypes(include=[np.number])

# Correlation heatmap. With no numeric columns the correlation matrix is
# empty and sns.heatmap fails on it, so skip the plot in that case.
if numeric_df.shape[1] > 0:
    plt.figure(figsize=(8, 6))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()
else:
    print("No numerical columns for correlation heatmap.")

# Age vs Salary scatter plot, coloured by Performance when that column exists.
if "Age" in df.columns and "Salary" in df.columns:
    plt.figure(figsize=(6, 4))
    hue_col = "Performance" if "Performance" in df.columns else None
    # Only pass a palette when there is a hue variable to map it to;
    # otherwise seaborn ignores the palette and warns about it.
    sns.scatterplot(
        data=df,
        x="Age",
        y="Salary",
        hue=hue_col,
        palette="Set1" if hue_col is not None else None,
    )
    plt.title("Age vs Salary")
    plt.show()

# One box plot per numeric column. A single box only ever used the first
# colour of the "Set3" palette, so pass that colour directly: `palette`
# without `hue` is deprecated in seaborn 0.13.
box_color = sns.color_palette("Set3")[0]
for col in numeric_df.columns:
    plt.figure(figsize=(6, 4))
    sns.boxplot(data=df, y=col, color=box_color)
    plt.title(f"Boxplot of {col}")
    plt.show()

In [None]:
# =======================
# 7. Pairplot
# =======================
# Pairwise plots of the numeric columns, with KDE curves on the diagonal.
# At least two numeric columns are required; otherwise just say so.
num_cols = df.select_dtypes(include=[np.number]).columns
if len(num_cols) < 2:
    print("Not enough numerical columns for pairplot.")
else:
    numeric_view = df[num_cols]
    sns.pairplot(numeric_view, diag_kind="kde")
    # y=1.02 puts the title just above the top edge of the figure.
    plt.suptitle("Pairplot of Numerical Variables", y=1.02)
    plt.show()