# Exploratory Data Analysis (EDA) - Factory Guard AI

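This notebook walks through exploratory data analysis of a synthetic factory-sensor dataset: generating the sample data, computing summary statistics, visualizing distributions, analyzing correlations, and checking data quality (missing values, outliers, duplicates) using Pandas, NumPy, Matplotlib, and Seaborn.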

## 1. Import Libraries

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.utils.config import Logger

# Configure visualization
# The style sheet and palette are set globally, so they apply to every figure
# created in the cells below.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require
# matplotlib >= 3.6 — confirm against the project environment.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Logger class imported from src.utils.config; this instance is used later
# to record progress messages (e.g. after the distribution plots).
logger = Logger()

## 2. Generate Sample Data

In [None]:
# Build a reproducible synthetic sensor dataset for demonstration.
np.random.seed(42)
n_samples = 1000

# One sampler per feature. The dict is iterated in insertion order, and because
# all draws share the global seeded generator, this order determines the
# generated values — do not reorder the entries.
feature_samplers = {
    'temperature': lambda size: np.random.normal(98.6, 5, size),
    'pressure': lambda size: np.random.normal(1013.25, 10, size),
    'vibration': lambda size: np.random.exponential(2, size),
    'humidity': lambda size: np.random.uniform(30, 80, size),
    'power_consumption': lambda size: np.random.normal(500, 100, size),
}
df = pd.DataFrame(
    {feature: draw(n_samples) for feature, draw in feature_samplers.items()}
)

# Target variable (anomaly detection): a sample is anomalous when it runs
# hotter than 110 or vibrates above 8.
over_temperature = df['temperature'] > 110
over_vibration = df['vibration'] > 8
df['is_anomaly'] = over_temperature | over_vibration

print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())

## 3. Exploratory Data Analysis

In [None]:
# Basic statistics: each section is a banner followed by the table it names.
# The tables are wrapped in callables so each one is computed right before
# it is printed, in the listed order.
report_sections = [
    ("Descriptive Statistics", lambda: df.describe()),
    ("Data Types", lambda: df.dtypes),
    ("Missing Values", lambda: df.isnull().sum()),
    ("Class Distribution", lambda: df['is_anomaly'].value_counts()),
]

for section_title, build_table in report_sections:
    print(f"\n=== {section_title} ===")
    print(build_table())

## 4. Data Visualization

In [None]:
# Distribution plots: one histogram per sensor on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# (column, panel title, extra hist kwargs). The first panel passes no colour
# so it keeps the default colour from the active palette.
histogram_specs = [
    ('temperature', 'Temperature Distribution', {}),
    ('pressure', 'Pressure Distribution', {'color': 'orange'}),
    ('vibration', 'Vibration Distribution', {'color': 'green'}),
    ('humidity', 'Humidity Distribution', {'color': 'red'}),
]

# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for panel, (column, panel_title, colour_kwargs) in zip(axes.flat, histogram_specs):
    panel.hist(df[column], bins=30, edgecolor='black', alpha=0.7, **colour_kwargs)
    panel.set_title(panel_title)

fig.tight_layout()
plt.show()

logger.info('Generated distribution plots')

## 5. Correlation Analysis

In [None]:
# Correlation matrix over the numeric sensor columns.
# Selecting on np.number does not pick up boolean columns, so the boolean
# `is_anomaly` target is left out here and correlated separately below.
numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_frame = df[numeric_cols]
corr_matrix = numeric_frame.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Feature Correlation Matrix')
heatmap_fig.tight_layout()
plt.show()

# Correlate every numeric feature with the target encoded as 0/1,
# listed from most positive to most negative.
print("\nCorrelation with target:")
target_as_int = df['is_anomaly'].astype(int)
target_corr = numeric_frame.corrwith(target_as_int).sort_values(ascending=False)
print(target_corr)

## 6. Anomaly Analysis

In [None]:
# Split the samples by the boolean anomaly flag.
anomaly_mask = df['is_anomaly']
normal_df = df[~anomaly_mask]
anomaly_df = df[anomaly_mask]

print(f"Normal samples: {len(normal_df)}")
print(f"Anomaly samples: {len(anomaly_df)}")

# Box plots comparing each sensor across the two classes, one panel per sensor.
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

boxplot_titles = {
    'temperature': 'Temperature by Anomaly Status',
    'vibration': 'Vibration by Anomaly Status',
    'pressure': 'Pressure by Anomaly Status',
}
for panel, (column, panel_title) in zip(axes, boxplot_titles.items()):
    df.boxplot(column=column, by='is_anomaly', ax=panel)
    # Set after plotting so it replaces the title pandas generates.
    panel.set_title(panel_title)

# Clear the automatic "Boxplot grouped by ..." figure title added by pandas.
plt.suptitle('')
plt.tight_layout()
plt.show()

## 7. Data Quality Checks

In [None]:
# Outlier detection using IQR method
def detect_outliers_iqr(df, columns, threshold=1.5):
    """Return the index labels of rows that are IQR outliers in any column.

    For each column, a value is an outlier when it lies below
    ``Q1 - threshold * IQR`` or above ``Q3 + threshold * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame holding the data to inspect.
        columns: Iterable of column names to check; each must be numeric.
        threshold: Multiplier applied to the IQR to set the fence width.

    Returns:
        A list of unique index labels, in the order the rows appear in
        ``df``. A row flagged in several columns is reported once. Returns
        an empty list when ``columns`` is empty or nothing is flagged.
    """
    # Accumulate a positional row mask instead of collecting labels in a set:
    # set iteration order is arbitrary, which made the result order unstable.
    flagged = np.zeros(len(df), dtype=bool)

    for col in columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - threshold * iqr
        upper_bound = q3 + threshold * iqr

        outside = (values < lower_bound) | (values > upper_bound)
        # Missing comparisons count as "not an outlier", matching how
        # boolean indexing treats them.
        flagged |= outside.to_numpy(dtype=bool, na_value=False)

    # unique() keeps first-seen order and collapses repeated index labels.
    return df.index[flagged].unique().tolist()

# Run the IQR check across every sensor column and report how many rows
# are flagged, both as a count and as a share of the dataset.
numeric_features = [
    'temperature',
    'pressure',
    'vibration',
    'humidity',
    'power_consumption',
]
outlier_indices = detect_outliers_iqr(df, numeric_features)

n_outliers = len(outlier_indices)
outlier_pct = n_outliers / len(df) * 100

print(f"Detected outliers: {n_outliers}")
print(f"Percentage: {outlier_pct:.2f}%")

## 8. Key Insights

In [None]:
# Pick the feature most strongly correlated with the target.
# `target_corr` is computed over the numeric feature columns only; the boolean
# `is_anomaly` column is not selected by np.number, so the first entry is
# already a feature and position 1 would report the runner-up. Dropping the
# target label (if present) keeps this correct even if it is ever included.
feature_corr = target_corr.drop(labels=['is_anomaly'], errors='ignore')
# "Strongest" means largest magnitude, so a strong negative correlation counts.
strongest_feature = feature_corr.abs().idxmax()
strongest_corr = feature_corr[strongest_feature]

# Summary report assembled from values computed in the earlier sections.
insights = f"""
=== KEY INSIGHTS ===

1. DATA OVERVIEW:
   - Total samples: {len(df)}
   - Features: {len(numeric_features)}
   - Class imbalance: {df['is_anomaly'].sum()} anomalies ({df['is_anomaly'].sum()/len(df)*100:.1f}%)

2. FEATURE CORRELATIONS:
   - Strongest feature correlation with anomaly: {strongest_feature} ({strongest_corr:.3f})
   - Multi-collinearity risk: Check features with |corr| > 0.8

3. DATA QUALITY:
   - Missing values: {df.isnull().sum().sum()}
   - Outliers detected: {len(outlier_indices)}
   - Duplicates: {df.duplicated().sum()}

4. RECOMMENDATIONS:
   - Handle class imbalance (use SMOTE or stratified sampling)
   - Remove or impute outliers before modeling
   - Normalize features (StandardScaler recommended)
   - Select top correlated features for baseline models
   - Consider ensemble methods (XGBoost/LightGBM) for better performance
"""

print(insights)