# Advanced EV Health Monitoring and Predictive Maintenance
## 01 - Dataset Loading and Initial Exploration

This notebook explores the two main datasets:
1. **EVIoT-PredictiveMaint Dataset** (15-minute intervals) - located in the `../archive/` folder
2. **EV Sensors: Driving Pattern Diagnostics** (2020-24) - located in the `../archive (1)/` folder

The goal is to understand the data structure, quality, and potential for integration.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Silence library warnings so cell output stays readable.
# NOTE(review): this hides *every* warning, including deprecation notices.
warnings.filterwarnings('ignore')

# Set plotting style.
# The seaborn style sheets are named 'seaborn-v0_8' from matplotlib 3.6 onwards;
# older releases only ship 'seaborn'. Pick whichever is installed rather than
# failing on an unknown style name; if neither exists the default style is kept.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print("Libraries imported successfully!")

## 1. Load EVIoT-PredictiveMaint Dataset (15-min intervals)

In [None]:
# Load the predictive maintenance dataset.
# Every later cell reads `maintenance_df`, so a failed load must stop the run
# here: swallowing the error would leave the name undefined (or stale from a
# previous kernel run) and surface later as an unrelated-looking failure.
maintenance_data_path = '../archive/EV_Predictive_Maintenance_Dataset_15min.csv'

try:
    maintenance_df = pd.read_csv(maintenance_data_path)
except FileNotFoundError:
    print("❌ File not found. Please check the path.")
    raise
except Exception as e:
    print(f"❌ Error loading file: {e}")
    raise
else:
    print(f"✅ Loaded maintenance dataset: {maintenance_df.shape}")
    print(f"📊 Columns: {list(maintenance_df.columns)}")

In [None]:
# Overview of the maintenance dataset: dimensions, per-column dtypes and memory footprint
print("=== MAINTENANCE DATASET INFO ===")
print(f"Shape: {maintenance_df.shape}")

print("\nData types:")
print(maintenance_df.dtypes)

# deep=True also counts the payload of object (string) columns
maintenance_memory_mb = maintenance_df.memory_usage(deep=True).sum() / 1024**2
print(f"\nMemory usage: {maintenance_memory_mb:.2f} MB")

In [None]:
# Show the leading records of the maintenance dataset
print("=== FIRST 5 ROWS ===")
maintenance_preview = maintenance_df.head(5)
display(maintenance_preview)

In [None]:
# Missing-value audit: null count and null share for every maintenance column
print("=== MISSING VALUES ANALYSIS ===")
missing_values = maintenance_df.isna().sum()
missing_percentage = missing_values / len(maintenance_df) * 100

missing_df = pd.DataFrame(
    {
        'Missing Count': missing_values,
        'Missing Percentage': missing_percentage,
    }
)
missing_df = missing_df.sort_values(by='Missing Percentage', ascending=False)

# Only columns that actually contain gaps are worth showing
missing_df_filtered = missing_df.loc[missing_df['Missing Count'].gt(0)]
if missing_df_filtered.empty:
    print("✅ No missing values found!")
else:
    display(missing_df_filtered)

## 2. Load EV Sensors: Driving Pattern Diagnostics (2020-24)

In [None]:
# Read one CSV per driving profile and tag each frame with the profile it came from
import os

pattern_data_path = '../archive (1)/'
pattern_files = dict(
    rare_user='rare_user.csv',
    moderate_user='moderate_user.csv',
    heavy_user='heavy_user.csv',
    daily_user='daily_user.csv',
)

# Maps profile name -> dataframe; profiles whose file failed to load are simply absent
pattern_dfs = {}

for user_type, filename in pattern_files.items():
    try:
        raw_patterns = pd.read_csv(os.path.join(pattern_data_path, filename))
        # Add user type identifier
        pattern_dfs[user_type] = raw_patterns.assign(user_type=user_type)
        print(f"✅ Loaded {user_type}: {pattern_dfs[user_type].shape}")
    except FileNotFoundError:
        print(f"❌ File not found: {filename}")
    except Exception as e:
        print(f"❌ Error loading {filename}: {e}")

print(f"\n📊 Total pattern datasets loaded: {len(pattern_dfs)}")

In [None]:
# Stack the per-profile frames into one dataframe with a fresh 0..n-1 index
if not pattern_dfs:
    print("❌ No pattern datasets loaded")
else:
    combined_patterns_df = pd.concat(list(pattern_dfs.values()), ignore_index=True)
    print(f"✅ Combined patterns dataset: {combined_patterns_df.shape}")
    print(f"📊 Columns: {list(combined_patterns_df.columns)}")

    # How many records each driving profile contributes
    print("\n📈 User type distribution:")
    print(combined_patterns_df['user_type'].value_counts())

In [None]:
# Peek at the combined patterns data, then list the dtype of each column
print("=== PATTERNS DATASET SAMPLE ===")
patterns_preview = combined_patterns_df.head(5)
display(patterns_preview)

print("\n=== DATA TYPES ===")
print(combined_patterns_df.dtypes)

## 3. Data Quality Assessment

In [None]:
# Descriptive statistics (pandas' default describe()) for the maintenance dataset
print("=== MAINTENANCE DATASET STATISTICS ===")
maintenance_stats = maintenance_df.describe()
display(maintenance_stats)

In [None]:
# Descriptive statistics for the patterns dataset, restricted to numeric columns
print("=== PATTERNS DATASET STATISTICS ===")
numeric_cols = combined_patterns_df.select_dtypes(include=[np.number]).columns
patterns_stats = combined_patterns_df.loc[:, numeric_cols].describe()
display(patterns_stats)

In [None]:
# Count fully duplicated rows (every occurrence after the first) in each dataset
print("=== DUPLICATE ANALYSIS ===")
maintenance_duplicates = maintenance_df.duplicated(keep='first').sum()
patterns_duplicates = combined_patterns_df.duplicated(keep='first').sum()

print(f"Maintenance dataset duplicates: {maintenance_duplicates}")
print(f"Patterns dataset duplicates: {patterns_duplicates}")

## 4. Initial Visualizations

In [None]:
# Histogram grid (2 x 3) of the headline sensor/target columns in the maintenance dataset
key_sensors = ['SoC', 'SoH', 'Battery_Temperature', 'Motor_Temperature', 'RUL', 'Failure_Probability']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Key Sensor Distributions - Maintenance Dataset', fontsize=16, fontweight='bold')

# Row-major flattening: panel i sits at row i // 3, column i % 3
panels = axes.ravel()

for i, sensor in enumerate(key_sensors):
    if sensor not in maintenance_df.columns:
        # Column absent from this dataset: its panel stays empty
        continue
    ax = panels[i]
    ax.hist(maintenance_df[sensor].dropna(), bins=50, alpha=0.7, edgecolor='black')
    ax.set_title(f'{sensor} Distribution')
    ax.set_xlabel(sensor)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# One SOC-vs-SOH scatter panel per driving profile, laid out on a 2 x 2 grid
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('SOC and SOH Patterns by User Type', fontsize=16, fontweight='bold')

user_types = combined_patterns_df['user_type'].unique()
# Row-major flattening: panel i sits at row i // 2, column i % 2
panels = axes.ravel()

for i, user_type in enumerate(user_types):
    is_current_user = combined_patterns_df['user_type'] == user_type
    user_data = combined_patterns_df[is_current_user]
    ax = panels[i]

    # Plot SOC vs SOH (tiny markers because every record is drawn)
    scatter = ax.scatter(user_data['SOC'], user_data['SOH'], alpha=0.6, s=1)
    ax.set_title(f'{user_type.replace("_", " ").title()} - SOC vs SOH')
    ax.set_xlabel('State of Charge (%)')
    ax.set_ylabel('State of Health (%)')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Interactive (Plotly) box plot: SOC distribution for each driving profile
soc_axis_labels = {'user_type': 'User Type', 'SOC': 'State of Charge (%)'}

fig = px.box(
    combined_patterns_df,
    x='user_type',
    y='SOC',
    title='State of Charge Distribution by User Type',
    labels=soc_axis_labels,
)

# update_layout returns the same figure, so it can be shown directly
fig.update_layout(height=500).show()

In [None]:
# Interactive (Plotly) violin plot: battery temperature distribution for each driving profile
battery_axis_labels = {'user_type': 'User Type', 'Battery_Temp': 'Battery Temperature (°C)'}

fig = px.violin(
    combined_patterns_df,
    x='user_type',
    y='Battery_Temp',
    title='Battery Temperature Distribution by User Type',
    labels=battery_axis_labels,
)

# update_layout returns the same figure, so it can be shown directly
fig.update_layout(height=500).show()

## 5. Common Features Analysis

In [None]:
# Compare the column names of both datasets: shared ones and those unique to each side
maintenance_cols = set(maintenance_df.columns)
patterns_cols = set(combined_patterns_df.columns)

common_features = maintenance_cols & patterns_cols
maintenance_only = maintenance_cols.difference(patterns_cols)
patterns_only = patterns_cols.difference(maintenance_cols)

print("=== FEATURE COMPARISON ===")
# sorted() keeps the printed lists stable, since sets have no defined order
print(f"Common features ({len(common_features)}): {sorted(common_features)}")
print(f"\nMaintenance dataset only ({len(maintenance_only)}): {sorted(maintenance_only)}")
print(f"\nPatterns dataset only ({len(patterns_only)}): {sorted(patterns_only)}")

In [None]:
# Correlation analysis for common numeric features


def _plot_correlation_heatmap(frame, features, title):
    """Draw an annotated correlation heatmap and return the correlation matrix.

    Parameters
    ----------
    frame : pandas.DataFrame
        Dataset to take the columns from.
    features : list
        Column names to correlate; must all be present in ``frame``.
    title : str
        Title shown above the heatmap.

    Returns
    -------
    pandas.DataFrame
        The matrix produced by ``frame[features].corr()``.
    """
    correlations = frame[features].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0,
                square=True, fmt='.2f')
    plt.title(title)
    plt.tight_layout()
    plt.show()
    return correlations


if common_features:
    # Keep only features that are numeric in BOTH datasets.
    # sorted(): `common_features` is a set, whose iteration order is arbitrary
    # and may differ between kernel sessions; sorting keeps the heatmap axes in
    # the same order on every run.
    numeric_common = [
        feature
        for feature in sorted(common_features)
        if pd.api.types.is_numeric_dtype(maintenance_df[feature])
        and pd.api.types.is_numeric_dtype(combined_patterns_df[feature])
    ]

    if numeric_common:
        print(f"Numeric common features: {numeric_common}")

        # Same feature list for both plots so the two matrices are directly comparable
        corr_maintenance = _plot_correlation_heatmap(
            maintenance_df, numeric_common,
            'Correlation Matrix - Maintenance Dataset (Common Features)')
        corr_patterns = _plot_correlation_heatmap(
            combined_patterns_df, numeric_common,
            'Correlation Matrix - Patterns Dataset (Common Features)')
    else:
        print("No numeric common features found for correlation analysis.")
else:
    print("No common features found between datasets.")

## 6. Temporal Analysis

In [None]:
# Parse timestamps for temporal analysis


def _typical_interval(timestamps):
    """Return the median gap between consecutive timestamps, or None.

    The values are sorted first and missing timestamps are dropped, so the
    result does not depend on row order or on how the first two rows happen
    to be spaced. Returns None when fewer than two valid timestamps exist.
    """
    ordered = timestamps.dropna().sort_values()
    if len(ordered) < 2:
        return None
    return ordered.diff().median()


# For maintenance dataset
if 'Timestamp' in maintenance_df.columns:
    maintenance_df['Timestamp'] = pd.to_datetime(maintenance_df['Timestamp'])
    print(f"Maintenance dataset time range: {maintenance_df['Timestamp'].min()} to {maintenance_df['Timestamp'].max()}")
    maintenance_freq = _typical_interval(maintenance_df['Timestamp'])
    if maintenance_freq is not None:
        print(f"Maintenance dataset frequency: {maintenance_freq}")

# For patterns dataset.
# NOTE(review): assumes the first column holds the timestamp - confirm against
# the CSV headers; pd.to_datetime is applied to whatever that column contains.
timestamp_col = combined_patterns_df.columns[0]
combined_patterns_df[timestamp_col] = pd.to_datetime(combined_patterns_df[timestamp_col])
print(f"\nPatterns dataset time range: {combined_patterns_df[timestamp_col].min()} to {combined_patterns_df[timestamp_col].max()}")

# Check frequency for each user type
for user_type in combined_patterns_df['user_type'].unique():
    user_timestamps = combined_patterns_df.loc[
        combined_patterns_df['user_type'] == user_type, timestamp_col
    ]
    freq = _typical_interval(user_timestamps)
    if freq is not None:
        print(f"{user_type} frequency: {freq}")

In [None]:
# SOC over time for each driving profile, limited to the first 1000 rows per profile
fig, axes = plt.subplots(2, 2, figsize=(20, 12))
fig.suptitle('SOC Time Series by User Type (First 1000 records)', fontsize=16, fontweight='bold')

user_types = combined_patterns_df['user_type'].unique()
# Row-major flattening: panel i sits at row i // 2, column i % 2
panels = axes.ravel()

for i, user_type in enumerate(user_types):
    is_current_user = combined_patterns_df['user_type'] == user_type
    user_data = combined_patterns_df[is_current_user].head(1000)
    ax = panels[i]

    # `timestamp_col` is the column name chosen in the timestamp-parsing cell
    ax.plot(user_data[timestamp_col], user_data['SOC'], alpha=0.7, linewidth=0.5)
    ax.set_title(f'{user_type.replace("_", " ").title()} - SOC Over Time')
    ax.set_xlabel('Time')
    ax.set_ylabel('State of Charge (%)')
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 7. Key Insights Summary

In [None]:
# Text recap of the exploration: dataset sizes, target ranges, profile counts, overlap
print("=== DATA EXPLORATION SUMMARY ===")
print("\n📊 Dataset Overview:")
n_maintenance_rows, n_maintenance_cols = maintenance_df.shape
n_pattern_rows, n_pattern_cols = combined_patterns_df.shape
print(f"• Maintenance dataset: {n_maintenance_rows:,} records, {n_maintenance_cols} features")
print(f"• Patterns dataset: {n_pattern_rows:,} records, {n_pattern_cols} features")
print(f"• Total combined records: {n_maintenance_rows + n_pattern_rows:,}")

print("\n🎯 Key Targets:")
# Target columns are optional: each range is reported only if the column exists
if 'RUL' in maintenance_df.columns:
    rul_values = maintenance_df['RUL']
    print(f"• RUL (Remaining Useful Life): {rul_values.min():.1f} - {rul_values.max():.1f}")
if 'Failure_Probability' in maintenance_df.columns:
    failure_values = maintenance_df['Failure_Probability']
    print(f"• Failure Probability: {failure_values.min():.3f} - {failure_values.max():.3f}")

print("\n👥 User Profiles:")
for user_type in combined_patterns_df['user_type'].unique():
    # Number of rows belonging to this profile
    count = int((combined_patterns_df['user_type'] == user_type).sum())
    print(f"• {user_type.replace('_', ' ').title()}: {count:,} records")

print("\n🔗 Integration Potential:")
print(f"• Common features: {len(common_features)} features can be directly compared")
print("• Temporal alignment: Both datasets span multiple years with regular intervals")
print(f"• Feature complementarity: {len(maintenance_only)} unique maintenance features + {len(patterns_only)} unique pattern features")

print("\n✅ Next Steps:")
for next_step in (
    "1. Data preprocessing and cleaning",
    "2. Feature engineering and temporal alignment",
    "3. Dataset integration and harmonization",
    "4. Predictive model development",
    "5. Personalized recommendation system",
):
    print(next_step)

## 8. Save Processed Data for Next Steps

In [None]:
# Create a summary report and persist it as JSON for the follow-up notebooks
import json
import os

summary_report = {
    'datasets': {
        'maintenance': {
            'shape': maintenance_df.shape,
            'columns': list(maintenance_df.columns),
            'memory_mb': float(maintenance_df.memory_usage(deep=True).sum() / 1024**2)
        },
        'patterns': {
            'shape': combined_patterns_df.shape,
            'columns': list(combined_patterns_df.columns),
            'user_types': list(combined_patterns_df['user_type'].unique()),
            'memory_mb': float(combined_patterns_df.memory_usage(deep=True).sum() / 1024**2)
        }
    },
    'integration': {
        # The three feature collections are sets; sorting gives a stable file
        # instead of an arbitrary order that changes from run to run.
        'common_features': sorted(common_features),
        'maintenance_only': sorted(maintenance_only),
        'patterns_only': sorted(patterns_only)
    }
}

# Save summary for reference
summary_path = '../reports/data_exploration_summary.json'
# open(..., 'w') does not create missing parent folders, so make sure it exists
os.makedirs(os.path.dirname(summary_path), exist_ok=True)
with open(summary_path, 'w', encoding='utf-8') as f:
    # default=str: anything json cannot encode natively is written as its string form
    json.dump(summary_report, f, indent=2, default=str)

print("✅ Data exploration complete!")
print("📄 Summary saved to: ../reports/data_exploration_summary.json")
print("📋 Ready for Phase 2: Data Integration & Preprocessing")