# Data Exploration and Analysis

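This notebook walks through the initial data exploration steps: loading and inspecting the data, checking data quality, and analyzing individual features, the target variable, and relationships between features.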

In [None]:
import sys
from pathlib import Path

# Add the project root (the parent of the current working directory) to
# sys.path; presumably this is what lets the project-local `src` and
# `config` imports below resolve. The membership check keeps sys.path from
# accumulating duplicate entries when this cell is re-run.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_processing.data_loader import load_data
from config import DATA_DIR, TARGET_COLUMN

## 1. Load and Inspect Data

In [None]:
# TODO: Replace with your data file
# Resolve the input file under the configured data directory and read it
# through the project's loader.
data_path = DATA_DIR / "your_data.csv"
df = load_data(data_path)

# Report the dataset's dimensions, then preview the first rows; the
# trailing expression is what the notebook renders as the cell output.
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
df.head()

## 2. Data Quality Check

In [None]:
# Column-level overview of the dataset
df.info()

# Count of missing entries per column
print("\nMissing values:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Summary statistics as reported by describe()
print("\nNumerical columns statistics:")
summary_stats = df.describe()
print(summary_stats)

## 3. Feature Analysis

In [None]:
# TODO: Add visualizations for important features
# Example: Distribution of numerical features
# Draws one histogram per numerical column on a 2x2 grid, using at most as
# many columns as the grid has cells.
numerical_cols = df.select_dtypes(include=[np.number]).columns
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.ravel()

# Pair each plotted column with its own axis.
plotted_cols = numerical_cols[:len(axes)]
for ax, col in zip(axes, plotted_cols):
    sns.histplot(data=df, x=col, ax=ax)

# With fewer numerical columns than grid cells, hide the leftover axes so
# the figure does not show empty frames.
for ax in axes[len(plotted_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 4. Target Variable Analysis

In [None]:
# TODO: Analyze target variable distribution and relationships
# Summarize and plot the target column when it is present in the dataset.
if TARGET_COLUMN in df.columns:
    # normalize=True reports each target value as a proportion, not a count
    print("Target variable distribution:")
    print(df[TARGET_COLUMN].value_counts(normalize=True))

    # Visualize target distribution
    plt.figure(figsize=(8, 6))
    sns.countplot(data=df, x=TARGET_COLUMN)
    plt.title("Target Variable Distribution")
    plt.show()
else:
    # Surface a misconfigured target instead of silently producing no output.
    print(
        f"Target column {TARGET_COLUMN!r} not found in dataset; "
        "skipping target analysis."
    )

## 5. Feature Relationships

In [None]:
# TODO: Analyze relationships between features
# Example: Correlation matrix for numerical features
# NOTE: relies on `numerical_cols` defined in the Feature Analysis cell.
correlation_matrix = df[numerical_cols].corr()

if correlation_matrix.empty:
    # Without numerical columns the matrix is empty and there is nothing
    # for a heatmap to draw, so report it instead of attempting the plot.
    print("No numerical columns available; skipping correlation heatmap.")
else:
    plt.figure(figsize=(10, 8))
    # center=0 anchors the diverging colormap at zero correlation
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title("Feature Correlations")
    plt.show()