# Data Analysis and Prediction Notebook

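This notebook walks through the project's data analysis and prediction workflow end to end: generating sample data, exploring and visualizing it, and training regression and classification models.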

## Table of Contents
1. [Data Loading and Generation](#data-loading)
2. [Exploratory Data Analysis](#eda)
3. [Data Visualization](#visualization)
4. [Machine Learning Predictions](#ml)
5. [Results and Insights](#results)

## 1. Data Loading and Generation <a id="data-loading"></a>

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session so cell output stays
# uncluttered. NOTE(review): this also hides warnings raised by the libraries
# and project modules used below.
warnings.filterwarnings('ignore')

# Import our custom modules
import sys
# The project packages are imported from ../src, so that directory must be on
# the module search path before the `from ... import` lines below run.
# NOTE(review): the path is relative — presumably the notebook is launched
# from its own directory; confirm.
sys.path.append('../src')

from analysis.data_analyzer import DataAnalyzer
from models.predictor import Predictor
from visualization.plotter import Plotter
from utils.data_generator import generate_sample_data

# Reaching this line means none of the imports above raised.
print("✅ All libraries imported successfully!")

In [None]:
# Generate sample data
# Requests 1000 samples from the project's generator and keeps the result in
# `data`, which the later analysis, plotting and modelling cells read.
# NOTE(review): the CSV path is presumably where the generated rows are
# written — confirm in utils/data_generator.py.
data = generate_sample_data('../data/sample_data.csv', n_samples=1000)
print(f"Dataset shape: {data.shape}")
# Last expression in the cell, so the notebook renders the returned preview.
data.head()

## 2. Exploratory Data Analysis <a id="eda"></a>

In [None]:
# Initialize data analyzer
# `analyzer` is built around the generated dataset and is reused by the
# correlation and data-quality cells that follow.
analyzer = DataAnalyzer(data)

# Get basic statistics
# NOTE(review): the result is kept in `stats` but not read again in the cells
# shown here; presumably basic_stats() also prints its summary — confirm.
stats = analyzer.basic_stats()

In [None]:
# Correlation analysis
# Runs the analyzer's correlation step on the dataset it was constructed with.
# NOTE(review): presumably returns a correlation matrix over the numeric
# columns — confirm in DataAnalyzer. `corr_matrix` is not read again in the
# cells shown here.
corr_matrix = analyzer.correlation_analysis()

In [None]:
# Data quality report
# Runs the analyzer's data-quality step on the dataset it was constructed with.
# NOTE(review): the report's contents are defined in DataAnalyzer — confirm
# there. `quality_report` is not read again in the cells shown here.
quality_report = analyzer.data_quality_report()

## 3. Data Visualization <a id="visualization"></a>

In [None]:
# Initialize plotter
# One Plotter instance is shared by all plotting cells below.
plotter = Plotter()

# Plot income distribution
# 'income' is also the target column of the regression model trained later.
plotter.plot_distribution(data, 'income')

In [None]:
# Correlation heatmap
# The whole frame is passed in. NOTE(review): presumably the plotter restricts
# itself to numeric columns — confirm, since `data` also carries a
# 'department' column that other cells use as a colour grouping.
plotter.plot_correlation_heatmap(data)

In [None]:
# Scatter plot: age vs income
# 'age' and 'income' are the two plotted columns (presumably x then y —
# confirm in Plotter); color_col selects 'department' as the colour column.
plotter.plot_scatter(data, 'age', 'income', color_col='department')

In [None]:
# Multiple distributions
numeric_cols = ['age', 'income', 'education_years', 'satisfaction_score']
plotter.plot_multiple_distributions(data, numeric_cols)

## 4. Machine Learning Predictions <a id="ml"></a>

In [None]:
# Regression setup: 'income' is the target, predicted from the three
# age/education/experience columns.
target_column = 'income'
feature_columns = ['age', 'education_years', 'experience_years']

# `predictor` is reused by the feature-importance, prediction-plot and
# cross-validation cells; `performance` is read in the summary cell via
# performance['r2_score'].
predictor = Predictor()
performance = predictor.train_model(
    data,
    feature_columns,
    target_column,
    model_type='random_forest',
)

In [None]:
# Feature importance
# Reads importances from the regression predictor trained above.
importance_df = predictor.get_feature_importance()
# The result is checked against None before plotting. NOTE(review):
# presumably None means the fitted model exposes no importances — confirm in
# Predictor.
if importance_df is not None:
    plotter.plot_feature_importance(importance_df)

In [None]:
# Plot predictions
# Passes the regression predictor's stored y_test and y_pred to the plotter.
# NOTE(review): presumably both attributes are set by train_model() from its
# own hold-out split — confirm; run the training cell first.
plotter.plot_predictions(predictor.y_test, predictor.y_pred)

In [None]:
# Cross-validation
# Requests 5 folds on the regression predictor. NOTE(review): no data is
# passed here, so cross_validate() presumably reuses what train_model()
# stored — confirm. `cv_scores` is not read again in the cells shown here.
cv_scores = predictor.cross_validate(cv_folds=5)

### Classification Example

In [None]:
# Classification example - predict performance rating
# A separate Predictor is used so the regression model held by `predictor`
# is left untouched.
predictor_cls = Predictor()

# Define features and target for classification
feature_columns_cls = ['age', 'education_years', 'experience_years', 'satisfaction_score']
target_column_cls = 'performance_rating'

# Drop only the rows that are missing a value in a column the classifier
# actually uses. A bare dropna() would also discard rows whose only gaps are
# in unrelated columns, needlessly shrinking the training set.
training_data_cls = data.dropna(subset=feature_columns_cls + [target_column_cls])

# Train classification model
# `performance_cls` is read in the summary cell via performance_cls['accuracy'].
performance_cls = predictor_cls.train_model(
    training_data_cls,
    feature_columns_cls,
    target_column_cls,
    model_type='random_forest'
)

## 5. Results and Insights <a id="results"></a>

In [None]:
# Recap of the run: dataset size, total missing cells, and the headline metric
# from each of the two models trained above.
row_count, column_count = data.shape
missing_total = data.isnull().sum().sum()  # per-column counts, then summed

print("🎉 Analysis Summary")
print("=" * 50)
print(f"📊 Dataset: {row_count} rows, {column_count} columns")
print(f"🔍 Missing values: {missing_total}")
print(f"📈 Regression R² Score: {performance['r2_score']:.4f}")
print(f"🎯 Classification Accuracy: {performance_cls['accuracy']:.4f}")
print("\n✅ All visualizations saved to outputs folder")

## Interactive Visualizations

In [None]:
# Create interactive plots
# Interactive counterpart of the age-vs-income scatter above, again using
# 'department' as the colour column.
# NOTE(review): `fig` is only assigned, and an assignment is not rendered as
# cell output. Confirm that create_interactive_plot() displays the figure
# itself; otherwise end the cell with `fig`.
fig = plotter.create_interactive_plot(
    data, 
    'age', 
    'income', 
    color_col='department',
    plot_type='scatter'
)