In [None]:
# COVID-19 DATA EXPLORATION & QUALITY ANALYSIS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set up styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*";
# older releases only ship the legacy "seaborn" name. Pick whichever one this
# installation provides instead of aborting the notebook with an OSError.
if 'seaborn-v0_8' in plt.style.available:
    plt.style.use('seaborn-v0_8')
elif 'seaborn' in plt.style.available:
    plt.style.use('seaborn')
# If neither style exists, Matplotlib's default style is kept.
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

# Title banner for the notebook output.
print("🦠 COVID-19 DATA EXPLORATION & QUALITY ANALYSIS", "=" * 60, sep="\n")

# Read the raw CSV and convert its 'date' column to datetimes in one
# expression; later sections rely on date arithmetic and min/max.
print("📁 Loading data...")
df = pd.read_csv('../data/raw/covid_data.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

print(f"✅ Dataset loaded: {df.shape[0]:,} records, {df.shape[1]} columns")

# SECTION 1 - DATASET OVERVIEW
# Prints the date span, the number of distinct countries and dates, then
# shows a row preview, the column schema and summary statistics.
print("", "=" * 50, "1. DATASET OVERVIEW", "=" * 50, sep="\n")

print(
    f"📅 Date range: {df['date'].min().date()} to {df['date'].max().date()}",
    f"🌍 Countries: {df['country'].nunique()}",
    f"📊 Total days of data: {df['date'].nunique()}",
    sep="\n",
)

# Each entry pairs a heading with the action that renders the matching view.
# df.info() writes to stdout itself; the other two go through display().
for heading, show in (
    ("\nFirst 5 rows:", lambda: display(df.head())),
    ("\nDataset info:", df.info),
    ("\nBasic statistics:", lambda: display(df.describe())),
):
    print(heading)
    show()

# DATA QUALITY CHECKS
# Section 2: missing values, negative values and per-country date coverage.
print("\n" + "="*50)
print("2. DATA QUALITY CHECKS")
print("="*50)

# Missing values: null count per column and its share of all rows.
print("🔍 Missing values analysis:")
missing = df.isnull().sum()
# Reuse the counts computed above rather than scanning the frame a second time.
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame({'Missing Count': missing, 'Missing %': missing_pct})
if (missing > 0).any():
    display(missing_df[missing_df['Missing Count'] > 0])
else:
    # Say so explicitly; an empty table is easy to misread as a failed check.
    print("✅ No missing values found")

# Check for negative values in the columns that should only hold counts.
print("\n🔍 Checking for negative values:")
numeric_cols = ['confirmed', 'deaths', 'recovered', 'active', 'population']
for col in numeric_cols:
    if col not in df.columns:
        # Report the skip so an absent column is not mistaken for a clean one.
        print(f"⚠️  {col}: column not in dataset, check skipped")
        continue
    if not pd.api.types.is_numeric_dtype(df[col]):
        # Comparing e.g. strings with 0 would raise TypeError; flag it instead.
        print(f"⚠️  {col}: non-numeric dtype ({df[col].dtype}), check skipped")
        continue
    negatives = (df[col] < 0).sum()
    if negatives > 0:
        print(f"⚠️  {col}: {negatives} negative values")
    else:
        print(f"✅ {col}: No negative values")

# Check date completeness by country: first date, last date and number of
# distinct dates recorded for each country.
print("\n🔍 Date coverage by country:")
date_coverage = df.groupby('country')['date'].agg(['min', 'max', 'nunique'])
# Whole days between a country's first and last record.
date_coverage['duration_days'] = (date_coverage['max'] - date_coverage['min']).dt.days
print(f"Average days per country: {date_coverage['nunique'].mean():.1f} days")
# A country counts as complete when it has one distinct date for every day of
# its span, i.e. distinct dates == duration_days + 1 (both endpoints included).
print(f"Countries with complete data: {(date_coverage['nunique'] == date_coverage['duration_days'] + 1).sum()}")

# DATA DISTRIBUTION ANALYSIS
# Section 3: how records are spread across countries, plus totals for the
# most recent date in the dataset.
print("\n" + "="*50)
print("3. DATA DISTRIBUTION ANALYSIS")
print("="*50)

# Country analysis: number of rows per country.
print("🏛️ Country record distribution:")
country_counts = df['country'].value_counts()
print(f"Most records: {country_counts.idxmax()} ({country_counts.max()} records)")
print(f"Least records: {country_counts.idxmin()} ({country_counts.min()} records)")

# Latest data snapshot.
# NOTE: only rows dated exactly at the global maximum date are selected, so a
# country whose last record is earlier does not contribute to these totals.
latest_date = df['date'].max()
latest_data = df[df['date'] == latest_date]
print(f"\n📈 Latest data snapshot ({latest_date.date()}):")
# Count distinct countries rather than rows: a country with several rows on
# the latest date would otherwise be counted more than once.
print(f"Total countries with data: {latest_data['country'].nunique()}")
print(f"Total confirmed cases: {latest_data['confirmed'].sum():,}")
print(f"Total deaths: {latest_data['deaths'].sum():,}")

# SECTION 4 - EXPLORATORY VISUALIZATIONS
# Draws a 2x3 dashboard: records per country, records per day, a null-value
# map, two log-scaled histograms and the top countries on the latest date.
print("\n" + "="*50)
print("4. EXPLORATORY VISUALIZATIONS")
print("="*50)

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 12))
# Name each panel once so the plotting code below reads without index pairs.
(ax_records, ax_timeline, ax_missing), (ax_cases, ax_deaths, ax_top10) = axes

# Panel 1: the 15 countries with the most rows.
records_per_country = df['country'].value_counts().head(15)
records_per_country.plot(kind='barh', ax=ax_records, color='skyblue')
ax_records.set_title('Records per Country (Top 15)')
ax_records.set_xlabel('Number of Records')

# Panel 2: number of rows recorded on each date.
daily_records = df.groupby('date').size()
ax_timeline.plot(daily_records.index, daily_records.values, linewidth=2)
ax_timeline.set_title('Data Records Over Time')
ax_timeline.set_xlabel('Date')
ax_timeline.set_ylabel('Records per Day')
ax_timeline.tick_params(axis='x', rotation=45)

# Panel 3: one cell per (row, column), coloured by whether the value is null.
sns.heatmap(df.isnull(), ax=ax_missing, cbar=True, cmap='viridis')
ax_missing.set_title('Missing Values Heatmap')

# Panels 4 and 5: the two histograms differ only in column, labels and colour.
# The y-axis is logarithmic on both.
for ax, column, title, xlabel, colour in (
    (ax_cases, 'confirmed', 'Distribution of Confirmed Cases', 'Confirmed Cases', 'lightcoral'),
    (ax_deaths, 'deaths', 'Distribution of Deaths', 'Deaths', 'orange'),
):
    ax.hist(df[column], bins=50, alpha=0.7, color=colour, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_yscale('log')

# Panel 6: the ten rows with the highest confirmed count on the latest date.
top_10_latest = latest_data.nlargest(10, 'confirmed')
ax_top10.barh(top_10_latest['country'], top_10_latest['confirmed'], color='lightgreen')
ax_top10.set_title('Top 10 Countries - Latest Confirmed Cases')
ax_top10.set_xlabel('Confirmed Cases')

# Light grid on every panel.
for ax in axes.ravel():
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# DATA QUALITY SUMMARY
# Section 5: recap of dataset size, the issues found by the checks above,
# and generic recommendations.
print("\n" + "="*50)
print("5. DATA QUALITY SUMMARY")
print("="*50)

print("✅ STRENGTHS:")
print(f"   - Large dataset: {df.shape[0]:,} records")
print(f"   - Good country coverage: {df['country'].nunique()} countries")
print(f"   - Good time coverage: {df['date'].nunique()} days")

# Gather the findings first so the heading below is never left empty.
review_items = []

columns_with_missing = int((missing > 0).sum())
if columns_with_missing > 0:
    review_items.append(f"Missing values detected in {columns_with_missing} columns")

# Repeat the negative-value check here so that its findings reach the summary.
# Absent and non-numeric columns are skipped.
negative_cols = [
    col for col in numeric_cols
    if col in df.columns
    and pd.api.types.is_numeric_dtype(df[col])
    and (df[col] < 0).any()
]
if negative_cols:
    review_items.append(f"Negative values detected in: {', '.join(negative_cols)}")

# Flag coverage as inconsistent when the spread of distinct dates per country
# exceeds half of the mean. With a single country std() is NaN, the
# comparison is False, and nothing is flagged.
days_per_country = date_coverage['nunique']
if days_per_country.std() > days_per_country.mean() * 0.5:
    review_items.append("Inconsistent date coverage across countries")

print("\n⚠️  AREAS TO REVIEW:")
if review_items:
    for item in review_items:
        print(f"   - {item}")
else:
    print("   - None detected by the automated checks")

print("\n🎯 RECOMMENDATIONS FOR ANALYSIS:")
print("   - Consider filtering countries with incomplete data")
print("   - Handle missing values appropriately")
print("   - Verify extreme values in numeric columns")

print(f"\n{'='*60}")
print("✅ DATA EXPLORATION COMPLETED SUCCESSFULLY!")
print(f"{'='*60}")