<a href="https://colab.research.google.com/github/Dee-Nwanjah/Data_Science-Portfolio/blob/main/COVID_Explorer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# ==============================================================================
# SETUPS, LOADING & EXPLORATION
# ==============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
import requests

# Create notebook: "Day7_COVID_Explorer"
print("COVID-19 Data Explorer - Portfolio Project")
print("="*50)

# Function to download COVID data
def download_covid_data():
    """Download latest COVID-19 data from reliable sources"""

    # Primary source: Our World in Data
    url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"

    try:
        print("Downloading COVID-19 data...")
        df = pd.read_csv(url)
        print(f"✅ Successfully downloaded {len(df)} records")
        return df
    except Exception as e:
        print(f"❌ Error downloading data: {e}")
        # Fallback: create sample data for demonstration
        return create_sample_covid_data()

def create_sample_covid_data():
    """Create sample COVID data if download fails"""
    print("Creating sample data for demonstration...")

    countries = ['United States', 'United Kingdom', 'Germany', 'France', 'Italy', 'Spain', 'Canada', 'Australia']
    dates = pd.date_range('2020-01-01', '2023-12-31', freq='D')

    data = []
    for country in countries:
        for date in dates:
            # Simulate realistic COVID trends
            days_since_start = (date - dates[0]).days
            base_cases = max(0, np.random.poisson(100) * (1 + 0.01 * days_since_start))

            data.append({
                'location': country,
                'date': date,
                'new_cases': max(0, base_cases + np.random.normal(0, 50)),
                'total_cases': None,  # Will calculate cumulative
                'new_deaths': max(0, np.random.poisson(2)),
                'total_deaths': None,  # Will calculate cumulative
                'population': np.random.randint(10000000, 350000000)
            })

    df = pd.DataFrame(data)

    # Calculate cumulative totals
    for country in countries:
        mask = df['location'] == country
        df.loc[mask, 'total_cases'] = df.loc[mask, 'new_cases'].cumsum()
        df.loc[mask, 'total_deaths'] = df.loc[mask, 'new_deaths'].cumsum()

    return df

# Download data
covid_df = download_covid_data()
covid_df['date'] = pd.to_datetime(covid_df['date'])

# Basic data exploration
print(f"\nDataset shape: {covid_df.shape}")
print(f"Date range: {covid_df['date'].min()} to {covid_df['date'].max()}")
print(f"Countries: {covid_df['location'].nunique()}")
print(f"Columns: {list(covid_df.columns)}")

COVID-19 Data Explorer - Portfolio Project
Downloading COVID-19 data...
✅ Successfully downloaded 429435 records

Dataset shape: (429435, 67)
Date range: 2020-01-01 00:00:00 to 2024-08-14 00:00:00
Countries: 255
Columns: ['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases', 'new_cases_smoothed', 'total_deaths', 'new_deaths', 'new_deaths_smoothed', 'total_cases_per_million', 'new_cases_per_million', 'new_cases_smoothed_per_million', 'total_deaths_per_million', 'new_deaths_per_million', 'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients', 'icu_patients_per_million', 'hosp_patients', 'hosp_patients_per_million', 'weekly_icu_admissions', 'weekly_icu_admissions_per_million', 'weekly_hosp_admissions', 'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests', 'total_tests_per_thousand', 'new_tests_per_thousand', 'new_tests_smoothed', 'new_tests_smoothed_per_thousand', 'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations', 'people

In [4]:
# ==============================================================================
# DATA PREPARATION
# ==============================================================================

# Clean and prepare the data
def clean_covid_data(df):
    """Clean and prepare COVID data for analysis"""

    # Filter for countries only (remove continents and world totals)
    exclude_locations = ['World', 'Europe', 'Asia', 'North America', 'South America', 'Africa', 'Oceania',
                        'European Union', 'High income', 'Upper middle income', 'Lower middle income', 'Low income']

    df_clean = df[~df['location'].isin(exclude_locations)].copy()

    # Fill missing values for key columns
    numeric_columns = ['new_cases', 'new_deaths', 'total_cases', 'total_deaths', 'population']
    for col in numeric_columns:
        if col in df_clean.columns:
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce').fillna(0)

    # Add calculated fields
    df_clean['cases_per_million'] = (df_clean['total_cases'] / df_clean['population'] * 1000000).fillna(0)
    df_clean['deaths_per_million'] = (df_clean['total_deaths'] / df_clean['population'] * 1000000).fillna(0)

    # Add time-based columns
    df_clean['year'] = df_clean['date'].dt.year
    df_clean['month'] = df_clean['date'].dt.month
    df_clean['week'] = df_clean['date'].dt.isocalendar().week

    # Calculate 7-day rolling averages
    df_clean = df_clean.sort_values(['location', 'date'])
    df_clean['new_cases_7day_avg'] = df_clean.groupby('location')['new_cases'].rolling(7, min_periods=1).mean().reset_index(0, drop=True)
    df_clean['new_deaths_7day_avg'] = df_clean.groupby('location')['new_deaths'].rolling(7, min_periods=1).mean().reset_index(0, drop=True)

    return df_clean

covid_clean = clean_covid_data(covid_df)

print("Data cleaning complete!")
print(f"Cleaned dataset shape: {covid_clean.shape}")
print(f"Top 10 countries by total cases:")
top_countries = covid_clean.groupby('location')['total_cases'].max().sort_values(ascending=False).head(10)
print(top_countries)

Data cleaning complete!
Cleaned dataset shape: (417680, 74)
Top 10 countries by total cases:
location
High-income countries            429044049.0
Upper-middle-income countries    251753518.0
European Union (27)              185822587.0
United States                    103436829.0
China                             99373219.0
Lower-middle-income countries     91954400.0
India                             45041748.0
France                            38997490.0
Germany                           38437756.0
Brazil                            37511921.0
Name: total_cases, dtype: float64


In [5]:
# ==============================================================================
# INTERACTIVE DASHBOARD
# ==============================================================================

# Create interactive visualizations using Plotly
def create_covid_dashboard(df):
    """Create comprehensive COVID-19 dashboard using Plotly Express"""

    # Global Overview
    # Top 15 countries by total cases
    top_15 = df.groupby('location')['total_cases'].max().sort_values(ascending=False).head(15).reset_index()
    fig1_bar = px.bar(top_15, x='total_cases', y='location', orientation='h', title='Top 15 Countries by Total Cases', color='location')

    # Global daily trend
    global_daily = df.groupby('date')['new_cases'].sum().reset_index()
    fig1_line = px.line(global_daily, x='date', y='new_cases', title='Global Daily New Cases Trend')

    # Cases per Million Choropleth Map (Latest Data)
    latest_data = df.loc[df.groupby('location')['date'].idxmax()].copy()
    fig1_map = px.choropleth(
        latest_data,
        locations='location',
        locationmode='country names',
        color='cases_per_million',
        hover_name='location',
        color_continuous_scale='Plasma',
        title='Global Cases per Million'
    )

    # Deaths vs Cases Correlation (Latest Data)
    fig1_scatter = px.scatter(
        latest_data,
        x='total_cases',
        y='total_deaths',
        size='population',
        color='location',
        hover_name='location',
        title='Deaths vs Cases Correlation (Latest Data)'
    )

    # Combine figures into a dashboard layout using make_subplots for better control over positioning
    fig1 = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Top 15 Countries by Total Cases', 'Global Daily New Cases Trend',
                       'Global Cases per Million', 'Deaths vs Cases Correlation'),
        specs=[[{"type": "bar"}, {"type": "scatter"}],
               [{"type": "choropleth"}, {"type": "scatter"}]]
    )

    # Add traces from Plotly Express figures to the subplot
    for data in fig1_bar.data:
        fig1.add_trace(data, row=1, col=1)
    for data in fig1_line.data:
        fig1.add_trace(data, row=1, col=2)
    for data in fig1_map.data:
        fig1.add_trace(data, row=2, col=1)
    for data in fig1_scatter.data:
        fig1.add_trace(data, row=2, col=2)


    fig1.update_layout(height=800, showlegend=False, title_text="COVID-19 Global Dashboard")

    return fig1

def create_country_comparison(df, countries=['United States', 'United Kingdom', 'Germany', 'France', 'Italy']):
    """Create country comparison charts using Plotly Express"""

    # Filter for selected countries
    country_data = df[df['location'].isin(countries)].copy()

    # Create subplots using Plotly Express for easier creation of individual charts
    fig_cases_avg = px.line(country_data, x='date', y='new_cases_7day_avg', color='location', title='Daily New Cases (7-day average)')
    fig_cases_cum = px.line(country_data, x='date', y='total_cases', color='location', title='Cumulative Cases')
    fig_deaths_avg = px.line(country_data, x='date', y='new_deaths_7day_avg', color='location', title='Daily Deaths (7-day average)')
    fig_cases_per_million = px.line(country_data, x='date', y='cases_per_million', color='location', title='Cases per Million Population')


    # Combine figures into a dashboard layout using make_subplots
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Daily New Cases (7-day average)', 'Cumulative Cases',
                       'Daily Deaths (7-day average)', 'Cases per Million Population'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )

    # Add traces from Plotly Express figures to the subplot
    for data in fig_cases_avg.data:
        fig.add_trace(data, row=1, col=1)
    for data in fig_cases_cum.data:
        fig.add_trace(data, row=1, col=2)
    for data in fig_deaths_avg.data:
        fig.add_trace(data, row=2, col=1)
    for data in fig_cases_per_million.data:
        fig.add_trace(data, row=2, col=2)

    fig.update_layout(height=800, title_text="Country Comparison Dashboard")


    return fig

# Create dashboards
print("Creating interactive dashboards...")
dashboard1 = create_covid_dashboard(covid_clean)
dashboard2 = create_country_comparison(covid_clean)

# Display dashboards (in Jupyter notebook)
dashboard1.show()
dashboard2.show()

print("✅ Interactive dashboards created!")

Creating interactive dashboards...


✅ Interactive dashboards created!


Advanced Analytics

In [6]:
# ==============================================================================
# ADVANCED ANALYTICS
# ==============================================================================

# Add advanced analytics functions
def analyze_covid_trends(df):
    """Perform advanced trend analysis"""

    # Calculate growth rates
    df_sorted = df.sort_values(['location', 'date']).copy() # Add .copy() to avoid SettingWithCopyWarning
    df_sorted['cases_growth_rate'] = df_sorted.groupby('location')['total_cases'].pct_change() * 100
    df_sorted['deaths_growth_rate'] = df_sorted.groupby('location')['total_deaths'].pct_change() * 100

    # Identify peaks and waves
    df_sorted['cases_7day_avg'] = df_sorted.groupby('location')['new_cases'].rolling(7, min_periods=1).mean().reset_index(0, drop=True)

    # Calculate case fatality rate, handling division by zero
    df_sorted['case_fatality_rate'] = (df_sorted['total_deaths'] / df_sorted['total_cases'] * 100)
    df_sorted['case_fatality_rate'] = df_sorted['case_fatality_rate'].replace([np.inf, -np.inf], np.nan).fillna(0) # Handle potential inf and NaN

    return df_sorted

def generate_insights(df):
    """Generate automated insights from the data"""

    insights = []

    # Latest global statistics
    latest_date = df['date'].max()
    latest_data = df[df['date'] == latest_date].copy() # Add .copy()

    total_global_cases = latest_data['total_cases'].sum()
    total_global_deaths = latest_data['total_deaths'].sum()
    # Handle potential division by zero for global CFR
    global_cfr = (total_global_deaths / total_global_cases) * 100 if total_global_cases > 0 else 0

    insights.append(f"📊 As of {latest_date.strftime('%B %d, %Y')}, there have been {total_global_cases:,.0f} confirmed cases globally")
    insights.append(f"💔 Global deaths: {total_global_deaths:,.0f} (Case Fatality Rate: {global_cfr:.2f}%)")

    # Most affected countries
    top_cases = latest_data.nlargest(5, 'total_cases')['location'].tolist()
    top_deaths = latest_data.nlargest(5, 'total_deaths')['location'].tolist()

    insights.append(f"🏆 Countries with most cases: {', '.join(top_cases)}")
    insights.append(f"💀 Countries with most deaths: {', '.join(top_deaths)}")

    # Recent trends
    last_7_days = df[df['date'] >= (latest_date - timedelta(days=7))].copy() # Add .copy()
    recent_cases = last_7_days.groupby('date')['new_cases'].sum()

    if len(recent_cases) > 1:
        trend = "increasing" if recent_cases.iloc[-1] > recent_cases.iloc[0] else "decreasing"
        insights.append(f"📈 Global daily cases are {trend} over the past week")

    # Cases per million analysis
    # Ensure 'population' column exists and is numeric before calculation
    if 'population' in latest_data.columns:
        latest_data['cases_per_million'] = (latest_data['total_cases'] / latest_data['population'] * 1000000).fillna(0)
        latest_data_filtered = latest_data[latest_data['location'].isin(top_countries)].copy() # Filter for top countries
        if not latest_data_filtered.empty:
            highest_per_capita = latest_data_filtered.nlargest(3, 'cases_per_million')[['location', 'cases_per_million']]
            insights.append(f"📍 Highest cases per million among top countries: {highest_per_capita.iloc[0]['location']} ({highest_per_capita.iloc[0]['cases_per_million']:,.0f} per million)")
        else:
            insights.append("⚠️ Cannot calculate cases per million for top countries: data not available.")

    else:
        insights.append("⚠️ Cannot calculate cases per million: 'population' column not found.")


    return insights

# Perform advanced analysis
covid_advanced = analyze_covid_trends(covid_clean)
insights = generate_insights(covid_advanced)

print("=== COVID-19 DATA INSIGHTS ===")
for insight in insights:
    print(insight)

# Create advanced visualization
def create_advanced_charts(df):
    """Create advanced analytical charts"""

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Growth Rate Trends', 'Case Fatality Rate by Country',
                       'Wave Pattern Analysis', 'Cases vs Population'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )

    # Select top 5 countries for analysis
    # Ensure 'total_cases' is not all zeros before selecting top countries
    if df['total_cases'].sum() > 0:
        top_countries = df.groupby('location')['total_cases'].max().nlargest(5).index
    else:
        top_countries = [] # Handle case where there are no cases

    colors = px.colors.qualitative.Set1

    for i, country in enumerate(top_countries):
        country_data = df[df['location'] == country].copy() # Add .copy()

        # Growth rate trends
        fig.add_trace(
            go.Scatter(
                x=country_data['date'],
                y=country_data['cases_growth_rate'].rolling(7).mean(),
                mode='lines',
                name=country,
                line=dict(color=colors[i])
            ),
            row=1, col=1
        )

        # Case fatality rate
        if not country_data.empty: # Check if country_data is not empty
            latest_cfr = country_data['case_fatality_rate'].iloc[-1]
            fig.add_trace(
                go.Bar(
                    x=[country],
                    y=[latest_cfr],
                    name=country,
                    marker_color=colors[i],
                    showlegend=False
                ),
                row=1, col=2
            )

        # Wave analysis (7-day average)
        fig.add_trace(
            go.Scatter(
                x=country_data['date'],
                y=country_data['cases_7day_avg'],
                mode='lines',
                name=country,
                line=dict(color=colors[i]),
                showlegend=False
            ),
            row=2, col=1
        )

    # Population vs cases scatter
    latest_pop_data = df[df['date'] == df['date'].max()].copy() # Add .copy()
    # Ensure 'population' and 'total_cases' columns exist and are numeric
    if 'population' in latest_pop_data.columns and 'total_cases' in latest_pop_data.columns:
         fig.add_trace(
            go.Scatter(
                x=latest_pop_data['population'],
                y=latest_pop_data['total_cases'],
                mode='markers',
                text=latest_pop_data['location'],
                marker=dict(size=8, color=latest_pop_data['location'].astype('category').cat.codes, colorscale='Viridis', showscale=True), # Map countries to colors
                showlegend=False
            ),
            row=2, col=2
        )


    fig.update_layout(height=800, title_text="Advanced COVID-19 Analytics")

    return fig

advanced_chart = create_advanced_charts(covid_advanced)
advanced_chart.show()

print("✅ Advanced analytics complete!")

=== COVID-19 DATA INSIGHTS ===
📊 As of August 14, 2024, there have been 0 confirmed cases globally
💔 Global deaths: 0 (Case Fatality Rate: 0.00%)
🏆 Countries with most cases: European Union (27), High-income countries, Lithuania, Malaysia, Upper-middle-income countries
💀 Countries with most deaths: European Union (27), High-income countries, Lithuania, Malaysia, Upper-middle-income countries
📈 Global daily cases are decreasing over the past week
⚠️ Cannot calculate cases per million for top countries: data not available.


✅ Advanced analytics complete!
