# Data Exploration and Analysis

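This notebook walks through a basic data exploration and analysis workflow on a small synthetic dataset: generating the data, inspecting and visualizing it, cleaning it with `DataProcessor`, and training an `ExampleModel` to make a prediction.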

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_processing.processor import DataProcessor
from src.models.example_model import ExampleModel

%matplotlib inline
sns.set_style('whitegrid')

## 1. Load and Explore Data

In [None]:
# Build a small synthetic dataset: three standard-normal feature columns plus
# a random 0/1 target. The global NumPy seed is fixed first so that every
# Restart & Run All produces exactly the same frame.
np.random.seed(42)

n_rows = 100
feature_names = ['feature1', 'feature2', 'feature3']

# Draw the columns in a fixed order (features first, target last) so the
# seeded random stream is consumed the same way on every run.
columns = {name: np.random.randn(n_rows) for name in feature_names}
columns['target'] = np.random.randint(0, 2, n_rows)
data = pd.DataFrame(columns)

print("Data shape:", data.shape)
data.head()

In [None]:
# Summary statistics for the columns of `data`; the resulting table is the
# cell's last expression, so it is rendered as the cell output.
data.describe()

## 2. Data Visualization

In [None]:
# One histogram per feature, laid out side by side, to compare the shape of
# each feature's distribution at a glance.
hist_columns = ['feature1', 'feature2', 'feature3']
fig, axes = plt.subplots(1, len(hist_columns), figsize=(15, 4))

# Pair each panel with its column instead of indexing into `axes`.
for ax, column in zip(axes, hist_columns):
    ax.hist(data[column], bins=20, edgecolor='black')
    ax.set(
        title=f'Distribution of {column}',
        xlabel='Value',
        ylabel='Frequency',
    )

# Keep titles and axis labels of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between every column of `data` (the three features
# and the target), drawn as an annotated heatmap with the colour scale
# centred on zero.
corr_matrix = data.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

## 3. Data Processing

In [None]:
# Initialize processor
# The frame is attached by assigning the `data` attribute directly rather than
# through a constructor argument or a loader method.
processor = DataProcessor()
# NOTE(review): this hands over the same object as `data` (no copy). If
# clean_data() modifies its input in place, what earlier cells displayed for
# `data` goes stale — confirm against DataProcessor.
processor.data = data

# Clean data
# What "cleaning" does is defined by DataProcessor.clean_data(), which is not
# shown in this notebook; comparing the shape printed here with the
# "Data shape" output from section 1 shows whether rows or columns were removed.
cleaned_data = processor.clean_data()
print("Cleaned data shape:", cleaned_data.shape)

In [None]:
# Ask the processor for its statistics and print the three entries this
# notebook reports on: shape, column names, and missing-value counts.
stats = processor.get_statistics()

# Assemble the report first, then emit it in one go; the printed text is the
# same four lines as printing each entry separately.
report_lines = [
    "Statistics:",
    f"Shape: {stats['shape']}",
    f"Columns: {stats['columns']}",
    f"Missing values: {stats['missing_values']}",
]
print("\n".join(report_lines))

## 4. Model Training

In [None]:
# Split the frame into a 2-D feature matrix (one row per sample, one column
# per feature) and a 1-D target vector, both as NumPy arrays.
# NOTE(review): the features are taken from the raw `data` frame, not from
# the `cleaned_data` produced in section 3 — confirm that is intended.
model_features = ['feature1', 'feature2', 'feature3']
X = data[model_features].to_numpy()
y = data['target'].to_numpy()

# Fit the project's example model on the full dataset (no train/test split
# is made here).
model = ExampleModel()
model.train(X, y)

print("Model trained successfully!")

In [None]:
# Make prediction
# A single hand-written observation with three values, matching the number of
# feature columns the model was trained on.
# NOTE(review): training used a 2-D array (samples x 3 features) while this is
# a flat list — confirm ExampleModel.predict accepts a single 1-D sample and
# that the values are expected in feature1, feature2, feature3 order.
sample = [0.5, -0.3, 0.8]
prediction = model.predict(sample)
print("Prediction:", prediction)

## 5. Conclusions

This notebook demonstrates:
- Data loading and exploration
- Visualization techniques
- Data processing pipelines
- Model training and prediction

Next steps could include:
- Feature engineering
- Hyperparameter tuning
- Cross-validation
- Model evaluation metrics