# In [None]:
# COVID-19 ADVANCED METRICS: Mortality, Recovery & Insights
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Set up styling
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("⚕️ COVID-19 ADVANCED METRICS: Mortality, Recovery & Insights")
print("=" * 70)

# Load data
print("📁 Loading data...")
df = pd.read_csv('../data/raw/covid_data.csv')
df['date'] = pd.to_datetime(df['date'])

print(f"✅ Dataset: {df.shape[0]:,} records, {df['country'].nunique()} countries")

# Calculate advanced metrics
print("\n📊 Calculating advanced metrics...")
# Rates are percentages of confirmed cases. A zero `confirmed` with non-zero
# numerator yields +/-inf, which is mapped to NaN so it is skipped by the
# statistics and plots below (0/0 is already NaN).
df['mortality_rate'] = (df['deaths'] / df['confirmed'] * 100).replace([np.inf, -np.inf], np.nan)
df['recovery_rate'] = (df['recovered'] / df['confirmed'] * 100).replace([np.inf, -np.inf], np.nan)

# Get latest data for analysis: one row per country, taken at its most recent date.
latest_data = df.loc[df.groupby('country')['date'].idxmax()].copy()

# Take an independent copy of the filtered rows: a column is assigned onto this
# frame further down, and writing into a boolean-filtered slice of
# `latest_data` is a chained assignment whose outcome pandas does not guarantee.
significant_countries = latest_data[latest_data['confirmed'] > 1000].copy()

print(f"🌍 Countries with >1,000 cases: {len(significant_countries)}")

# MORTALITY / RECOVERY RATE ANALYSIS
# Both sections print the same five descriptive statistics, so the summaries
# are computed up front and reported through one shared loop.
mortality_stats = significant_countries['mortality_rate'].describe()
recovery_stats = significant_countries['recovery_rate'].describe()

_rate_sections = (
    ("1. MORTALITY RATE ANALYSIS", "Mortality Rate Statistics:", mortality_stats),
    ("2. RECOVERY RATE ANALYSIS", "Recovery Rate Statistics:", recovery_stats),
)
for _banner, _heading, _summary in _rate_sections:
    _rule = "=" * 50
    print("\n" + _rule)
    print(_banner)
    print(_rule)

    print(_heading)
    print(f"   Mean: {_summary['mean']:.2f}%")
    # describe() reports the median under the '50%' key.
    print(f"   Median: {_summary['50%']:.2f}%")
    print(f"   Std Dev: {_summary['std']:.2f}%")
    print(f"   Range: {_summary['min']:.2f}% - {_summary['max']:.2f}%")

# GLOBAL TRENDS IN RATES
print("\n" + "="*50)
print("3. GLOBAL TRENDS IN RATES OVER TIME")
print("="*50)

# Sum the three counters over all rows sharing a date; groupby sorts by date,
# so the last row is the most recent date in the dataset.
global_trends = df.groupby('date').agg({
    'confirmed': 'sum',
    'deaths': 'sum',
    'recovered': 'sum'
}).reset_index()

# Same sanitising as the per-country rates: a date with zero confirmed cases
# but a non-zero numerator would otherwise produce +/-inf, which distorts the
# trend plots and the "current" figures printed below.
global_trends['global_mortality'] = (
    global_trends['deaths'] / global_trends['confirmed'] * 100
).replace([np.inf, -np.inf], np.nan)
global_trends['global_recovery'] = (
    global_trends['recovered'] / global_trends['confirmed'] * 100
).replace([np.inf, -np.inf], np.nan)

print(f"Current global mortality rate: {global_trends['global_mortality'].iloc[-1]:.2f}%")
print(f"Current global recovery rate: {global_trends['global_recovery'].iloc[-1]:.2f}%")

# COMPREHENSIVE VISUALIZATION DASHBOARD
_rule = "=" * 50
print("\n" + _rule)
print("4. ADVANCED METRICS DASHBOARD")
print(_rule)

# 3x3 grid shared by all nine dashboard panels.
fig, axes = plt.subplots(3, 3, figsize=(20, 15))

# Plots 1 and 2: distribution of mortality / recovery rates.
# Each entry: (panel, column, bar colour, mean-line colour, title, x label).
_histogram_panels = (
    (axes[0, 0], 'mortality_rate', 'red', 'darkred',
     'Distribution of Mortality Rates', 'Mortality Rate (%)'),
    (axes[0, 1], 'recovery_rate', 'green', 'darkgreen',
     'Distribution of Recovery Rates', 'Recovery Rate (%)'),
)
for _panel, _column, _fill, _accent, _title, _xlabel in _histogram_panels:
    _rates = significant_countries[_column]
    _mean_rate = _rates.mean()

    _panel.hist(_rates.dropna(), bins=30, alpha=0.7, color=_fill, edgecolor='black')
    # Dashed vertical marker at the mean, labelled in the legend.
    _panel.axvline(_mean_rate, color=_accent, linestyle='--', linewidth=2,
                   label=f'Mean: {_mean_rate:.2f}%')
    _panel.set_title(_title, fontweight='bold', fontsize=12)
    _panel.set_xlabel(_xlabel)
    _panel.set_ylabel('Number of Countries')
    _panel.legend()
    _panel.grid(True, alpha=0.3)

# Plot 3: Mortality vs Recovery scatter
# Marker area scales with the confirmed case count. No `cmap` is passed:
# without a `c=` array matplotlib has nothing to colour-map and ignores it.
scatter = axes[0,2].scatter(significant_countries['mortality_rate'],
                           significant_countries['recovery_rate'],
                           s=significant_countries['confirmed']/10000,
                           alpha=0.6)
axes[0,2].set_xlabel('Mortality Rate (%)')
axes[0,2].set_ylabel('Recovery Rate (%)')
axes[0,2].set_title('Mortality Rate vs Recovery Rate', fontweight='bold', fontsize=12)
axes[0,2].grid(True, alpha=0.3)

# Add correlation
# Drop rows missing *either* rate in one step so x and y keep the same index.
# Dropping NaNs per column leaves the two Series with different indexes, and
# the combined boolean mask can then no longer be used to index either one.
_paired_rates = significant_countries[['mortality_rate', 'recovery_rate']].dropna()
x = _paired_rates['mortality_rate']
y = _paired_rates['recovery_rate']

# Restrict the fit to 0 < mortality < 20 and 0 < recovery < 100.
mask = (x > 0) & (x < 20) & (y > 0) & (y < 100)
if mask.sum() > 2:
    try:
        slope, intercept, r_value, p_value, std_err = stats.linregress(x[mask], y[mask])
    except ValueError as exc:
        # linregress cannot fit degenerate input (e.g. all x identical);
        # the trend line is optional, so report it and carry on.
        print(f"   Regression line skipped: {exc}")
    else:
        x_line = np.linspace(x[mask].min(), x[mask].max(), 100)
        y_line = slope * x_line + intercept
        axes[0,2].plot(x_line, y_line, color='red', linestyle='--',
                      label=f'Correlation: r={r_value:.2f}')
        axes[0,2].legend()

# Plots 4 and 5: global mortality / recovery rate over time.
# Each entry: (panel, series column, line colour, title, y label).
_trend_panels = (
    (axes[1, 0], 'global_mortality', 'red',
     'Global Mortality Rate Trend', 'Mortality Rate (%)'),
    (axes[1, 1], 'global_recovery', 'green',
     'Global Recovery Rate Trend', 'Recovery Rate (%)'),
)
for _panel, _series, _colour, _title, _ylabel in _trend_panels:
    _panel.plot(global_trends['date'], global_trends[_series],
                linewidth=2, color=_colour)
    _panel.set_title(_title, fontweight='bold', fontsize=12)
    _panel.set_ylabel(_ylabel)
    # Slant the date tick labels.
    _panel.tick_params(axis='x', rotation=45)
    _panel.grid(True, alpha=0.3)

# Plot 6: Countries with extreme mortality rates
# Only countries with more than 10,000 confirmed cases are ranked.
extreme_mortality = significant_countries[significant_countries['confirmed'] > 10000]
top_mortality = extreme_mortality.nlargest(10, 'mortality_rate')
bottom_mortality = extreme_mortality.nsmallest(10, 'mortality_rate')

# Each slot holds a pair of bars (highest / lowest). The bar height must equal
# the offset between the two bars: with matplotlib's default height (0.8) and
# an offset of 0.4 the paired bars would be drawn on top of each other.
bar_height = 0.4
x_pos = np.arange(len(top_mortality))
axes[1,2].barh(x_pos, top_mortality['mortality_rate'], height=bar_height,
               color='red', alpha=0.7, label='Highest')
axes[1,2].barh(x_pos + bar_height, bottom_mortality['mortality_rate'], height=bar_height,
               color='blue', alpha=0.7, label='Lowest')
# One tick per pair, centred between its two bars, labelled "top\nbottom".
axes[1,2].set_yticks(x_pos + bar_height / 2)
axes[1,2].set_yticklabels([f"{top}\n{bot}" for top, bot in zip(top_mortality['country'], bottom_mortality['country'])])
axes[1,2].set_title('Extreme Mortality Rates (Top & Bottom 10)', fontweight='bold', fontsize=10)
axes[1,2].set_xlabel('Mortality Rate (%)')
axes[1,2].legend()

# Plot 7: Mortality rate by case volume
bins = [0, 1000, 10000, 100000, 1000000, np.inf]
labels = ['<1K', '1K-10K', '10K-100K', '100K-1M', '>1M']
# `assign` builds a new frame instead of writing a column into what may be a
# filtered slice of another DataFrame (chained assignment).
significant_countries = significant_countries.assign(
    case_volume=pd.cut(significant_countries['confirmed'], bins=bins, labels=labels)
)

# observed=False keeps every category on the x axis even when it has no rows
# (the mean for such a category is NaN, so no bar is drawn for it). Passing it
# explicitly avoids depending on the groupby default for categorical keys.
volume_groups = significant_countries.groupby('case_volume', observed=False)

mortality_by_volume = volume_groups['mortality_rate'].mean()
axes[2,0].bar(mortality_by_volume.index.astype(str), mortality_by_volume.values, color='orange')
axes[2,0].set_title('Average Mortality Rate by Case Volume', fontweight='bold', fontsize=12)
axes[2,0].set_xlabel('Case Volume Category')
axes[2,0].set_ylabel('Average Mortality Rate (%)')
axes[2,0].tick_params(axis='x', rotation=45)
axes[2,0].grid(True, alpha=0.3)

# Plot 8: Recovery rate by case volume
recovery_by_volume = volume_groups['recovery_rate'].mean()
axes[2,1].bar(recovery_by_volume.index.astype(str), recovery_by_volume.values, color='lightgreen')
axes[2,1].set_title('Average Recovery Rate by Case Volume', fontweight='bold', fontsize=12)
axes[2,1].set_xlabel('Case Volume Category')
axes[2,1].set_ylabel('Average Recovery Rate (%)')
axes[2,1].tick_params(axis='x', rotation=45)
axes[2,1].grid(True, alpha=0.3)

# Plot 9: Rate correlation heatmap
# Pairwise correlations between the raw counters and the two derived rates.
_heatmap_columns = ['confirmed', 'deaths', 'recovered', 'mortality_rate', 'recovery_rate']
correlation_data = significant_countries[_heatmap_columns].corr()

_heatmap_panel = axes[2, 2]
sns.heatmap(correlation_data, annot=True, cmap='coolwarm', center=0, ax=_heatmap_panel)
_heatmap_panel.set_title('Metrics Correlation Heatmap', fontweight='bold', fontsize=12)

# Fit all nine panels into the figure, then render it.
plt.tight_layout()
plt.show()

# KEY INSIGHTS AND RECOMMENDATIONS
print("\n" + "="*50)
print("5. KEY INSIGHTS AND RECOMMENDATIONS")
print("="*50)

print("🔍 KEY FINDINGS:")

# Mortality insights: reported only when the spread exceeds 2 percentage points.
mortality_std = significant_countries['mortality_rate'].std()
if mortality_std > 2:
    print(f"   • High variability in mortality rates (std: {mortality_std:.2f}%)")
    print("   → Suggests different healthcare system effectiveness")

# Recovery insights: reported only when the spread exceeds 10 percentage points.
recovery_std = significant_countries['recovery_rate'].std()
if recovery_std > 10:
    print(f"   • Wide range in recovery rates (std: {recovery_std:.2f}%)")
    print("   → Indicates varying recovery reporting standards")

# Correlation insights
try:
    mortality_recovery_corr = significant_countries['mortality_rate'].corr(significant_countries['recovery_rate'])
except (TypeError, ValueError) as exc:
    # The finding is optional; report why it was skipped instead of hiding it.
    print(f"   • Mortality/recovery correlation unavailable: {exc}")
else:
    # A NaN correlation compares False here, so nothing is reported for it.
    if abs(mortality_recovery_corr) > 0.3:
        # The interpretation must follow the sign of the correlation.
        if mortality_recovery_corr < 0:
            direction = "negative"
            interpretation = "   → Countries with high mortality tend to have low recovery rates"
        else:
            direction = "positive"
            interpretation = "   → Countries with high mortality tend to have high recovery rates"
        print(f"   • {direction.capitalize()} correlation between mortality and recovery rates")
        print(interpretation)

print("\n🏆 COUNTRIES WITH OUTSTANDING METRICS:")

# Both rankings consider only countries with more than 10,000 confirmed cases.
_high_volume = significant_countries.loc[significant_countries['confirmed'] > 10000]

print("\n   Lowest Mortality Rates (min 10k cases):")
low_mortality_countries = _high_volume.nsmallest(3, 'mortality_rate')
for _country, _rate in zip(low_mortality_countries['country'],
                           low_mortality_countries['mortality_rate']):
    print(f"     • {_country}: {_rate:.2f}%")

print("\n   Highest Recovery Rates (min 10k cases):")
high_recovery_countries = _high_volume.nlargest(3, 'recovery_rate')
for _country, _rate in zip(high_recovery_countries['country'],
                           high_recovery_countries['recovery_rate']):
    print(f"     • {_country}: {_rate:.2f}%")

# Closing banner.
print(f"\n{'='*70}")
print("✅ ADVANCED METRICS ANALYSIS COMPLETED SUCCESSFULLY!")
print(f"{'='*70}")