In [None]:
# COVID-19 Global Data Analysis - Optimized Version

**Course**: INSY 8413 | Introduction to Big Data Analytics  
**Project**: Capstone Final Exam - COVID-19 Analysis  
**Dataset**: WHO COVID-19 Global Daily Data (400k+ rows)  
**Academic Year**: 2024-2025, SEM III

---

## 🚀 Optimized Analysis for Large Dataset

This notebook provides an optimized approach to analyzing the large COVID-19 dataset efficiently:
- **Memory optimization** for 21MB dataset (400k+ rows)
- **Chunked processing** to prevent memory issues
- **Strategic sampling** for faster analysis
- **Efficient feature engineering** to handle large data

---


In [None]:
## 📚 1. Setup and Library Imports


In [None]:
# Setup cell: imports every library used by the notebook, makes the parent
# directory importable, configures plotting defaults, and prints a banner
# with the versions of the main libraries.

# Core Libraries
import pandas as pd
import numpy as np
import warnings
import os
import sys
from pathlib import Path

# NOTE(review): this silences every warning category for the whole kernel,
# including deprecation warnings raised by the imports below.
warnings.filterwarnings('ignore')

# Add project root to path (the parent of the current working directory;
# presumably where the `src` package imported below lives -- confirm).
# Guarded so that re-running this cell does not append duplicate entries.
project_root = Path().absolute().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Visualization Libraries
# The top-level package is imported explicitly because `matplotlib.pyplot`
# does not expose `__version__`; only `matplotlib` itself does.
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Try to import Plotly (handle if not available)
try:
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    PLOTLY_AVAILABLE = True
    print("✅ Plotly imported successfully")
except ImportError:
    PLOTLY_AVAILABLE = False
    print("⚠️ Plotly not available, using matplotlib/seaborn only")

# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Time and date handling
from datetime import datetime, timedelta

# Import optimized preprocessing functions.
# OPTIMIZED_AVAILABLE mirrors PLOTLY_AVAILABLE so that later cells can check
# whether the helpers exist instead of failing with a NameError.
try:
    from src.optimized_preprocessing import (
        load_covid_data_optimized,
        clean_data_optimized,
        create_features_optimized,
        prepare_modeling_data_optimized,
        save_processed_data
    )
    OPTIMIZED_AVAILABLE = True
    print("✅ Optimized preprocessing functions imported")
except ImportError:
    OPTIMIZED_AVAILABLE = False
    print("⚠️ Could not import optimized functions, will use basic approach")

# Configure plotting
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

# Version banner
print("\n🎯 OPTIMIZED COVID-19 ANALYSIS SETUP COMPLETE")
print("=" * 60)
print(f"📊 Pandas version: {pd.__version__}")
print(f"🔢 NumPy version: {np.__version__}")
print(f"📈 Matplotlib available: {matplotlib.__version__}")
print(f"🎨 Seaborn available: {sns.__version__}")
print(f"📊 Plotly available: {PLOTLY_AVAILABLE}")
print("🤖 Scikit-learn available: sklearn imported successfully")
