In [None]:
# %% [markdown]
# # COVID-19 Global Data Tracker
# 
# This notebook analyzes global COVID-19 trends, including cases, deaths, hospitalizations, and vaccination progress.

# %%
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display as ipydisplay
import warnings
from ipywidgets import interactive

# Silence all warnings so library chatter does not clutter the cell outputs
warnings.simplefilter('ignore')

# Plot defaults: seaborn white grid and a wide default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# %% [markdown]
# ## 1. Data Collection & Loading

# %%
# Fetch the OWID COVID-19 dataset from its raw GitHub URL. Any failure while
# downloading or parsing is reported and leaves `covid_df` as an empty frame,
# which the later cells check before doing any work.
raw_url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
try:
    covid_df = pd.read_csv(raw_url)

    # Parse the date strings and remember the covered date span
    covid_df['date'] = pd.to_datetime(covid_df['date'])
    parsed_dates = covid_df['date']
    min_date, max_date = parsed_dates.min().date(), parsed_dates.max().date()

    print(
        "Dataset loaded successfully from GitHub!",
        f"Shape: {covid_df.shape}",
        f"Date range: {min_date} to {max_date}",
        sep="\n",
    )

    # Preview the first rows
    print("\nFirst 5 rows:")
    display(covid_df.head())

except Exception as e:
    print(f"Error loading dataset: {e}")
    covid_df = pd.DataFrame()

# %% [markdown]
# ## 2. Data Cleaning Function

# %%
def clean_covid_data(df, countries=None, start_date=None, end_date=None):
    """Filter COVID-19 data and fill gaps in the key metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Needs a ``location`` column when ``countries`` is given
        and a datetime ``date`` column when a date bound is given. It is not
        modified.
    countries : collection of str, optional
        Locations to keep. ``None`` or an empty collection keeps all of them.
    start_date, end_date : date-like, optional
        Inclusive date bounds. Each bound is applied on its own, so passing
        only one of them still filters.

    Returns
    -------
    pandas.DataFrame
        A filtered copy in which missing key metrics are filled (cumulative
        series are carried forward within each location, everything else
        becomes 0) and the derived columns ``death_rate``,
        ``vaccination_rate`` and ``hosp_patients_per_million`` are added
        whenever their input columns exist. An empty frame is returned for
        empty input.
    """
    if df.empty:
        return pd.DataFrame()

    # Build the row mask first so only the surviving rows are copied.
    keep = np.ones(len(df), dtype=bool)
    if countries:
        keep &= df['location'].isin(countries).to_numpy()
    if start_date is not None:
        keep &= (df['date'] >= pd.to_datetime(start_date)).to_numpy()
    if end_date is not None:
        keep &= (df['date'] <= pd.to_datetime(end_date)).to_numpy()
    filtered_df = df[keep].copy()

    # Handle missing values for key metrics
    key_metrics = ['total_cases', 'new_cases', 'total_deaths', 'new_deaths',
                   'icu_patients', 'hosp_patients', 'weekly_icu_admissions',
                   'total_vaccinations', 'people_vaccinated', 'population']
    # Running totals: a gap means "not reported", so the previous value is
    # the best estimate rather than 0.
    cumulative_metrics = {m for m in key_metrics if m.startswith('total_')}
    cumulative_metrics.add('people_vaccinated')
    has_location = 'location' in filtered_df.columns

    for metric in key_metrics:
        if metric not in filtered_df.columns:
            continue
        column = filtered_df[metric]
        if metric in cumulative_metrics:
            # Carry forward inside each location only, so one country's totals
            # never spill into the leading gaps of the next. Rows are assumed
            # to be in chronological order within a location.
            if has_location:
                carried = column.groupby(filtered_df['location'], sort=False).ffill()
            else:
                carried = column.ffill()
            column = column.fillna(carried)
        filtered_df[metric] = column.fillna(0)

    def _safe_ratio(numerator, denominator):
        """numerator / denominator per row, 0 where the denominator is not positive."""
        positive = filtered_df[denominator] > 0
        ratio = filtered_df[numerator] / filtered_df[denominator].where(positive)
        return ratio.where(positive, 0)

    available = filtered_df.columns

    # Calculate derived metrics, each only when its inputs are present
    if 'total_cases' in available and 'total_deaths' in available:
        filtered_df['death_rate'] = _safe_ratio('total_deaths', 'total_cases')

    if 'people_vaccinated' in available and 'population' in available:
        filtered_df['vaccination_rate'] = _safe_ratio('people_vaccinated', 'population')

    # Hospitalization metrics (a missing population was filled with 0 above,
    # so the ratio must not divide by it)
    if 'hosp_patients' in available and 'population' in available:
        filtered_df['hosp_patients_per_million'] = _safe_ratio('hosp_patients', 'population') * 1e6

    return filtered_df

# %% [markdown]
# ## 3. Interactive Analysis Dashboard

# %%
if not covid_df.empty:
    # Selectable locations; missing names are dropped so that sorting never
    # has to compare NaN with strings.
    all_countries = sorted(covid_df['location'].dropna().unique())

    # Pre-select only those defaults that exist in the data: SelectMultiple
    # raises if its value contains an entry that is not among its options.
    default_countries = [
        name for name in ('United States', 'India', 'Brazil', 'Germany')
        if name in all_countries
    ]

    # Create widgets
    country_select = widgets.SelectMultiple(
        options=all_countries,
        value=default_countries,
        description='Countries:',
        disabled=False,
        layout={'width': '500px'}
    )

    start_date_picker = widgets.DatePicker(
        description='Start Date:',
        value=min_date,
        disabled=False
    )

    end_date_picker = widgets.DatePicker(
        description='End Date:',
        value=max_date,
        disabled=False
    )

    metrics_select = widgets.SelectMultiple(
        options=['cases', 'deaths', 'vaccinations', 'hospitalizations'],
        value=['cases', 'deaths'],
        description='Metrics:',
        disabled=False
    )

    # Create analysis button
    analyze_button = widgets.Button(
        description='Analyze Data',
        disabled=False,
        button_style='success',
        tooltip='Click to analyze with current selections'
    )

    # Display widgets
    print("Configure your analysis:")
    display(widgets.VBox([
        widgets.HBox([country_select]),
        widgets.HBox([start_date_picker, end_date_picker]),
        widgets.HBox([metrics_select]),
        analyze_button
    ]))

    # Output area for results
    output = widgets.Output()
    display(output)

    def run_analysis(button):
        """Button callback: validate the selections, then clean, plot and report.

        ``button`` is the clicked widget passed in by ``on_click``; it is not
        used. Everything printed or plotted goes into the ``output`` widget.
        """
        with output:
            output.clear_output()

            # Get user selections
            selected_countries = list(country_select.value)
            selected_metrics = list(metrics_select.value)
            start_date = start_date_picker.value
            end_date = end_date_picker.value

            # Reject selections that would otherwise analyse every location,
            # build a zero-row figure, or silently ignore the date range.
            if not selected_countries:
                print("Please select at least one country")
                return
            if not selected_metrics:
                print("Please select at least one metric")
                return
            if start_date is None or end_date is None:
                print("Please choose both a start date and an end date")
                return
            if start_date > end_date:
                print("Start date must not be after end date")
                return

            # Clean and filter data
            filtered_df = clean_covid_data(
                covid_df,
                countries=selected_countries,
                start_date=start_date,
                end_date=end_date
            )

            if filtered_df.empty:
                print("No data available for the selected filters")
                return

            # Create visualizations (defined in a later cell; resolved at click time)
            create_visualizations(filtered_df, selected_metrics, selected_countries)

    # Register button click handler
    analyze_button.on_click(run_analysis)

# %% [markdown]
# ## 4. Visualization Functions

# %%
def _plot_country_series(ax, df, countries, column, title, ylabel=None,
                         smooth_window=None, skip_empty=False):
    """Draw one line per country for ``column`` over time on ``ax``.

    ``smooth_window`` applies a rolling mean of that many rows before plotting.
    ``skip_empty`` leaves out countries whose values do not sum to more than 0.
    """
    for country in countries:
        country_data = df[df['location'] == country]
        values = country_data[column]
        if skip_empty and not values.sum() > 0:
            continue
        if smooth_window:
            values = values.rolling(smooth_window).mean()
        ax.plot(country_data['date'], values, label=country)

    ax.set_title(title)
    if ylabel:
        ax.set_ylabel(ylabel)
    # Only build a legend when at least one line was actually drawn.
    if ax.get_legend_handles_labels()[0]:
        ax.legend()
    ax.grid(True)


def _plot_latest_rate_bar(ax, df, column, title, color):
    """Bar chart of each location's most recent ``column`` value, shown in percent."""
    # Take the last row of every location rather than the rows on the overall
    # last date, so a country whose series ends earlier is still shown.
    latest = df.sort_values('date', kind='mergesort').groupby('location').tail(1)
    rates = latest[['location', column]].sort_values(column, ascending=False)
    if rates.empty:
        return
    ax.bar(rates['location'], rates[column] * 100, color=color)
    ax.set_title(title)
    ax.set_ylabel('Percentage')
    ax.tick_params(axis='x', rotation=45)


def create_visualizations(df, metrics, countries):
    """Plot the selected metrics, then print the summary report.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data with ``location`` and ``date`` columns plus the metric
        columns that the selected panels read.
    metrics : collection of str
        Any of ``'cases'``, ``'deaths'``, ``'vaccinations'``,
        ``'hospitalizations'``. Other entries are ignored for plotting.
    countries : list of str
        Locations to draw as individual lines.

    One figure row with two panels is drawn per plottable metric, always in
    the order cases, deaths, vaccinations, hospitalizations. When nothing is
    plottable a message is printed instead of a figure. ``generate_report`` is
    called in either case.
    """
    plt.close('all')

    # Only allocate rows for metrics that can actually be drawn.
    plottable = [
        name for name in ('cases', 'deaths', 'vaccinations', 'hospitalizations')
        if name in metrics
        and (name != 'vaccinations' or 'people_vaccinated' in df.columns)
    ]

    if plottable:
        # squeeze=False keeps `axes` two-dimensional even for a single row.
        fig, axes = plt.subplots(len(plottable), 2,
                                 figsize=(18, 5 * len(plottable)),
                                 squeeze=False)

        for row, name in enumerate(plottable):
            left, right = axes[row, 0], axes[row, 1]

            if name == 'cases':
                _plot_country_series(left, df, countries, 'total_cases',
                                     'Total COVID-19 Cases', ylabel='Cases')
                _plot_country_series(right, df, countries, 'new_cases',
                                     'Daily New Cases (7-day avg)', smooth_window=7)

            elif name == 'deaths':
                _plot_country_series(left, df, countries, 'total_deaths',
                                     'Total COVID-19 Deaths', ylabel='Deaths')
                if 'death_rate' in df.columns:
                    _plot_latest_rate_bar(right, df, 'death_rate',
                                          'Death Rate (%)', 'purple')

            elif name == 'vaccinations':
                _plot_country_series(left, df, countries, 'people_vaccinated',
                                     'Total Vaccinations', ylabel='People Vaccinated')
                if 'vaccination_rate' in df.columns:
                    _plot_latest_rate_bar(right, df, 'vaccination_rate',
                                          'Vaccination Rate (% Population)', 'green')

            elif name == 'hospitalizations':
                if 'icu_patients' in df.columns:
                    _plot_country_series(left, df, countries, 'icu_patients',
                                         'ICU Patients', ylabel='Patients',
                                         skip_empty=True)
                if 'hosp_patients' in df.columns:
                    _plot_country_series(right, df, countries, 'hosp_patients',
                                         'Hospital Patients', ylabel='Patients',
                                         skip_empty=True)

        plt.tight_layout()
        plt.show()
    else:
        print("No plottable metrics selected; skipping charts.")

    # Generate summary report
    generate_report(df, countries, metrics)

# %% [markdown]
# ## 5. Report Generation

# %%
def generate_report(df, countries, metrics, min_cases_for_death_rate=10000):
    """Print a summary report based on each location's most recent row.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data with ``location`` and ``date`` columns plus the metric
        columns of the selected sections.
    countries : iterable of str
        Names listed in the report header.
    metrics : collection of str
        Sections to include: ``'cases'``, ``'deaths'``, ``'vaccinations'``,
        ``'hospitalizations'``.
    min_cases_for_death_rate : int, optional
        A location needs more than this many total cases to be considered for
        the "highest death rate" line.

    Tables are shown with ``display`` and everything else is printed; nothing
    is returned.
    """
    if df.empty:
        print("No data available for the report")
        return

    # Use the last row of every location rather than the rows on the overall
    # last date, so a country whose series ends earlier is still reported.
    latest_data = df.sort_values('date', kind='mergesort').groupby('location').tail(1)
    columns = latest_data.columns

    def top_location(frame, column):
        """Location with the largest ``column`` value, or 'N/A' for an empty frame."""
        ranked = frame.sort_values(column, ascending=False)
        return ranked['location'].iloc[0] if not ranked.empty else 'N/A'

    print("\n" + "="*80)
    print("COVID-19 ANALYSIS REPORT".center(80))
    print("="*80)

    print(f"\nTime Period: {df['date'].min().date()} to {df['date'].max().date()}")
    print(f"Countries Analyzed: {', '.join(countries)}")

    # Cases Summary
    if 'cases' in metrics:
        cases_summary = latest_data.sort_values('total_cases', ascending=False)[['location', 'total_cases', 'new_cases']]
        print("\nCases Summary:")
        display(cases_summary)

    # Deaths Summary
    if 'deaths' in metrics:
        deaths_summary = (
            latest_data.sort_values('total_deaths', ascending=False)[['location', 'total_deaths', 'death_rate']]
            .assign(death_rate=lambda frame: frame['death_rate'] * 100)
        )
        print("\nDeaths Summary:")
        display(deaths_summary)

    # Vaccination Summary
    if 'vaccinations' in metrics and 'people_vaccinated' in columns:
        vax_summary = latest_data.sort_values('people_vaccinated', ascending=False)
        if 'vaccination_rate' in columns:
            vax_summary = vax_summary[['location', 'people_vaccinated', 'vaccination_rate']].assign(
                vaccination_rate=lambda frame: frame['vaccination_rate'] * 100
            )
        else:
            vax_summary = vax_summary[['location', 'people_vaccinated']]
        print("\nVaccination Summary:")
        display(vax_summary)

    # Hospitalization Summary
    if 'hospitalizations' in metrics:
        hosp_metrics = [name for name in ('hosp_patients', 'icu_patients') if name in columns]
        if hosp_metrics:
            hosp_summary = latest_data.sort_values(hosp_metrics[0], ascending=False)[['location'] + hosp_metrics]
            print("\nHospitalization Summary:")
            display(hosp_summary)

    # Comparative Analysis
    print("\nComparative Analysis:")
    if 'total_cases' in columns:
        print("- Countries with highest case loads: ",
              top_location(latest_data, 'total_cases'))

    if 'death_rate' in columns and 'total_cases' in columns:
        # Ignore locations with very few cases, whose rate is not meaningful.
        enough_cases = latest_data[latest_data['total_cases'] > min_cases_for_death_rate]
        print("- Country with highest death rate: ",
              top_location(enough_cases, 'death_rate'))

    if 'vaccination_rate' in columns:
        print("- Country with highest vaccination rate: ",
              top_location(latest_data, 'vaccination_rate'))

    print("\n" + "="*80)
    print("END OF REPORT".center(80))
    print("="*80)

# %% [markdown]
# ## 6. Global Map Visualization

# %%
if not covid_df.empty and 'iso_code' in covid_df.columns:
    # Prepare data for choropleth: one row per country holding the most recent
    # non-missing value of every column. A snapshot of the single last date
    # would leave every country that did not report on that day blank.
    latest_global = (
        covid_df.sort_values('date')
        .groupby(['iso_code', 'location'], as_index=False)
        .last()
    )

    # Calculate metrics if not present (needs a positive population)
    if 'population' in latest_global.columns:
        valid_population = latest_global['population'].where(latest_global['population'] > 0)

        if 'total_cases_per_million' not in latest_global.columns and 'total_cases' in latest_global.columns:
            latest_global['total_cases_per_million'] = (latest_global['total_cases'] / valid_population) * 1e6

        if 'people_vaccinated_per_hundred' not in latest_global.columns and 'people_vaccinated' in latest_global.columns:
            latest_global['people_vaccinated_per_hundred'] = (latest_global['people_vaccinated'] / valid_population) * 100

    # Only offer hover columns that exist in this dataset
    hover_columns = [
        name for name in ("total_cases", "total_deaths", "people_vaccinated")
        if name in latest_global.columns
    ]

    # Create map
    if 'total_cases_per_million' not in latest_global.columns:
        print("Could not create global map: no case data per million available")
    else:
        # Rendering depends on the local plotly/notebook setup, so a failure
        # is reported instead of aborting the notebook run.
        try:
            fig = px.choropleth(latest_global,
                               locations="iso_code",
                               color="total_cases_per_million",
                               hover_name="location",
                               hover_data=hover_columns,
                               color_continuous_scale=px.colors.sequential.Plasma,
                               title="Global COVID-19 Cases per Million People")
            fig.show()
        except Exception as e:
            print(f"Could not create global map: {str(e)}")

Dataset loaded successfully from GitHub!
Shape: (429435, 67)
Date range: 2020-01-01 to 2024-08-14

First 5 rows:


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-05,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
1,AFG,Asia,Afghanistan,2020-01-06,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
2,AFG,Asia,Afghanistan,2020-01-07,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
3,AFG,Asia,Afghanistan,2020-01-08,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
4,AFG,Asia,Afghanistan,2020-01-09,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,


Configure your analysis:


VBox(children=(HBox(children=(SelectMultiple(description='Countries:', index=(239, 102, 30, 83), layout=Layout…

Output()

Could not create global map: Mime type rendering requires nbformat>=4.2.0 but it is not installed
