In [None]:
#!/usr/bin/env python
# coding: utf-8

# # COVID-19 Global Data Tracker
# 
# This notebook analyzes global COVID-19 trends, including cases, deaths, and vaccinations across countries and time. We'll clean and process real-world data, perform exploratory data analysis (EDA), generate insights, and visualize trends using Python data tools.

# ## 1. Data Collection and Loading
# 
# We'll use the Our World in Data COVID-19 dataset, which is a reliable and comprehensive source for COVID-19 data.

# In[1]:


# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
import warnings

# Plot styling: apply matplotlib's bundled whitegrid style sheet, then
# silence warnings for the rest of the notebook
plt.style.use('seaborn-v0_8-whitegrid')
warnings.filterwarnings('ignore')

# seaborn theme, and never truncate columns when a DataFrame is displayed
sns.set(style="whitegrid")
pd.set_option('display.max_columns', None)


# In[2]:


# Load the dataset.
# A local copy named covid.csv in the working directory is preferred; if it is
# missing, the same data is read from the Our World in Data URL instead.
# You may need to adjust the path as needed.
url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
try:
    df = pd.read_csv('covid.csv')
    print(f"Data loaded successfully! Shape: {df.shape}")
except FileNotFoundError:
    print("Dataset not found. Please download the dataset from:")
    print(url)

    # For demonstration purposes, fall back to loading directly from the URL
    print("Attempting to load data directly from source URL...")
    try:
        df = pd.read_csv(url)
    except Exception as exc:
        # Without data, `df` stays undefined and every later cell fails with a
        # confusing NameError, so stop here with the actionable message.
        # Catching Exception (not a bare except) lets KeyboardInterrupt through.
        raise RuntimeError(
            "Failed to load data from URL. Please download manually."
        ) from exc
    print(f"Data loaded from URL successfully! Shape: {df.shape}")


# In[3]:


# Peek at the top of the table to see what the raw records look like
print("First few rows of the dataset:")
df.head(n=5)


# In[4]:


# List every column name along with the total column count
column_names = df.columns.tolist()
print(f"Number of columns: {len(column_names)}")
print("\nColumns in the dataset:")
print(column_names)


# In[5]:


# Basic information about the dataset:
# DataFrame.info() prints each column's non-null count and dtype, plus memory usage
df.info()


# In[6]:


# Count missing entries in the columns the rest of the analysis relies on
key_columns = [
    'date', 'location',
    'total_cases', 'total_deaths',
    'new_cases', 'new_deaths',
    'total_vaccinations',
]
missing_data = df[key_columns].isna().sum()
print("Missing values in key columns:")
print(missing_data)


# In[7]:


# How many distinct locations (countries and regions) are present, with a sample
locations = df['location']
print(f"Number of unique locations: {locations.nunique()}")
print("\nSample of locations:")
print(locations.unique()[:20])  # first 20 distinct values, in order of appearance


# In[8]:


# Report the earliest and latest dates covered by the data
dates = df['date']
first_date = dates.min()
last_date = dates.max()
print(f"First date in the dataset: {first_date}")
print(f"Last date in the dataset: {last_date}")


# ## 2. Data Cleaning and Preparation

# In[9]:


# Parse the date column into real datetimes
parsed_dates = pd.to_datetime(df['date'])
df = df.assign(date=parsed_dates)
print("Date column converted to datetime format.")

# Order rows chronologically within each location for time series analysis
df = df.sort_values(by=['location', 'date'])
print("Data sorted by location and date.")


# In[10]:


# Locations examined in detail: the global aggregate plus a mix of countries
# from different continents and of different sizes
countries_of_interest = [
    'World',
    'United States', 'India', 'Brazil',
    'United Kingdom', 'South Africa', 'Kenya',
    'Japan', 'Australia', 'Italy',
]

in_selection = df['location'].isin(countries_of_interest)
df_selected = df.loc[in_selection]
print(f"Selected {len(countries_of_interest)} countries/regions for detailed analysis.")


# In[11]:


# Keep only the selected-country rows that actually report a total case count.
# Vaccination columns are handled separately, since that data started later.
has_total_cases = df_selected['total_cases'].notna()
df_cases = df_selected.loc[has_total_cases]
print(f"Data after filtering null total_cases: {df_cases.shape}")


# In[12]:


# Create a function to process data for a specific country
def prepare_country_data(country_name, data=None):
    """Extract one location's rows and add gap-filled and smoothed series.

    Parameters
    ----------
    country_name : str
        Value compared for equality against the ``location`` column.
    data : pandas.DataFrame, optional
        Frame to read from. When omitted, the notebook-level ``df`` is used,
        so existing single-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, in their existing order.
        The rolling windows are positional, so rows should already be in date
        order. ``total_cases`` and ``total_deaths`` are forward filled;
        ``new_cases_smoothed`` and ``new_deaths_smoothed`` hold the 7-row
        rolling mean of ``new_cases`` and ``new_deaths``. A ``country_name``
        with no matching rows yields an empty frame.
    """
    source = df if data is None else data
    country_data = source[source['location'] == country_name].copy()

    # Forward fill only the cumulative totals: carrying the last known total
    # forward is valid for a running sum. Daily counts are left untouched,
    # because repeating yesterday's count into a gap would invent cases/deaths
    # and inflate the averages below.
    for col in ('total_cases', 'total_deaths'):
        country_data[col] = country_data[col].ffill()

    # 7-row rolling mean to smooth daily counts. min_periods=1 averages over
    # whatever observations exist in the window, so one missing day does not
    # turn the surrounding week into NaN.
    for col in ('new_cases', 'new_deaths'):
        country_data[f'{col}_smoothed'] = (
            country_data[col].rolling(window=7, min_periods=1).mean()
        )

    return country_data

# Build one prepared frame per location of interest, keyed by location name
country_data_dict = {
    name: prepare_country_data(name) for name in countries_of_interest
}

print(f"Processed data for {len(country_data_dict)} countries/regions.")


# ## 3. Exploratory Data Analysis (EDA)

# ### 3.1 Global COVID-19 Trends

# In[13]:


# Global aggregate series (also used by the next cell)
world_data = country_data_dict['World']

# Cumulative cases (top panel) and cumulative deaths (bottom panel) over time.
# Each entry: (column, line format, panel title, y-axis label)
total_panels = (
    ('total_cases', 'b-', 'Global COVID-19 Cases Over Time', 'Total Cases'),
    ('total_deaths', 'r-', 'Global COVID-19 Deaths Over Time', 'Total Deaths'),
)

plt.figure(figsize=(14, 8))
for row, (column, line_format, heading, y_label) in enumerate(total_panels, start=1):
    ax = plt.subplot(len(total_panels), 1, row)  # also becomes the current axes
    ax.plot(world_data['date'], world_data[column], line_format, linewidth=2)
    ax.set_title(heading, fontsize=16)
    ax.set_ylabel(y_label, fontsize=14)
    ax.grid(True)
    ax.ticklabel_format(style='plain', axis='y')  # plain numbers, no scientific notation
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()


# In[14]:


# Smoothed daily new cases (top panel) and new deaths (bottom panel) worldwide.
# Each entry: (column, line format, panel title, y-axis label)
daily_panels = (
    ('new_cases_smoothed', 'b-',
     'Global New COVID-19 Cases (7-day Moving Average)', 'New Cases'),
    ('new_deaths_smoothed', 'r-',
     'Global New COVID-19 Deaths (7-day Moving Average)', 'New Deaths'),
)

plt.figure(figsize=(14, 8))
for row, (column, line_format, heading, y_label) in enumerate(daily_panels, start=1):
    ax = plt.subplot(len(daily_panels), 1, row)  # also becomes the current axes
    ax.plot(world_data['date'], world_data[column], line_format, linewidth=2)
    ax.set_title(heading, fontsize=16)
    ax.set_ylabel(y_label, fontsize=14)
    ax.grid(True)
    ax.ticklabel_format(style='plain', axis='y')  # plain numbers, no scientific notation
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()


# ### 3.2 Compare COVID-19 Cases Across Selected Countries

# In[15]:


# Cumulative confirmed cases over time, one line per selected country
fig, ax = plt.subplots(figsize=(14, 8))

for name in countries_of_interest:
    if name == 'World':  # leave out the aggregate to focus on individual countries
        continue
    series = country_data_dict[name]
    ax.plot(series['date'], series['total_cases'], label=name, linewidth=2)

ax.set_title('Total COVID-19 Cases by Country', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Total Cases', fontsize=14)
ax.grid(True)
ax.legend(fontsize=12)
plt.xticks(rotation=45)
ax.ticklabel_format(style='plain', axis='y')
plt.tight_layout()
plt.show()


# In[16]:


# Cumulative cases per million inhabitants: a fairer comparison between
# countries of different sizes than raw totals
fig, ax = plt.subplots(figsize=(14, 8))

for name in countries_of_interest:
    if name == 'World':  # the aggregate is not plotted
        continue
    series = country_data_dict[name]
    ax.plot(series['date'], series['total_cases_per_million'], label=name, linewidth=2)

ax.set_title('COVID-19 Cases per Million Population by Country', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Cases per Million', fontsize=14)
ax.grid(True)
ax.legend(fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# ### 3.3 Compare COVID-19 Deaths Across Selected Countries

# In[17]:


# Cumulative deaths over time, one line per selected country
fig, ax = plt.subplots(figsize=(14, 8))

for name in countries_of_interest:
    if name == 'World':  # the aggregate is not plotted
        continue
    series = country_data_dict[name]
    ax.plot(series['date'], series['total_deaths'], label=name, linewidth=2)

ax.set_title('Total COVID-19 Deaths by Country', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Total Deaths', fontsize=14)
ax.grid(True)
ax.legend(fontsize=12)
plt.xticks(rotation=45)
ax.ticklabel_format(style='plain', axis='y')
plt.tight_layout()
plt.show()


# In[18]:


# Cumulative deaths per million inhabitants, one line per selected country
fig, ax = plt.subplots(figsize=(14, 8))

for name in countries_of_interest:
    if name == 'World':  # the aggregate is not plotted
        continue
    series = country_data_dict[name]
    ax.plot(series['date'], series['total_deaths_per_million'], label=name, linewidth=2)

ax.set_title('COVID-19 Deaths per Million Population by Country', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Deaths per Million', fontsize=14)
ax.grid(True)
ax.legend(fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# ### 3.4 Calculate and Plot Death Rates

# In[19]:


# Death rate per location: cumulative deaths as a percentage of cumulative cases.
# The new column is written onto the frames held in country_data_dict.
for name in countries_of_interest:
    series = country_data_dict[name]
    series['death_rate'] = series['total_deaths'].div(series['total_cases']).mul(100)

# One line per location; unlike the other comparisons, 'World' is included here
fig, ax = plt.subplots(figsize=(14, 8))

for name in countries_of_interest:
    series = country_data_dict[name]
    ax.plot(series['date'], series['death_rate'], label=name, linewidth=2)

ax.set_title('COVID-19 Death Rate by Country (Deaths as % of Cases)', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Death Rate (%)', fontsize=14)
ax.grid(True)
ax.legend(fontsize=12)
plt.xticks(rotation=45)
ax.set_ylim(bottom=0)  # anchor the y-axis at zero
plt.tight_layout()
plt.show()


# ### 3.5 Daily New Cases Comparison

# In[20]:


# Smoothed daily new cases (7-day moving average), one line per selected country
fig, ax = plt.subplots(figsize=(14, 8))

for name in countries_of_interest:
    if name == 'World':  # the aggregate is not plotted
        continue
    series = country_data_dict[name]
    ax.plot(series['date'], series['new_cases_smoothed'], label=name, linewidth=2)

ax.set_title('Daily New COVID-19 Cases by Country (7-day Moving Average)', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('New Cases', fontsize=14)
ax.grid(True)
ax.legend(fontsize=12)
plt.xticks(rotation=45)
ax.ticklabel_format(style='plain', axis='y')
plt.tight_layout()
plt.show()


# In[21]:


# Smoothed daily new cases per million inhabitants, for a size-independent comparison
fig, ax = plt.subplots(figsize=(14, 8))

for name in countries_of_interest:
    if name == 'World':  # the aggregate is not plotted
        continue
    series = country_data_dict[name]
    ax.plot(series['date'], series['new_cases_smoothed_per_million'], label=name, linewidth=2)

ax.set_title('Daily New COVID-19 Cases per Million by Country (7-day Moving Average)', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('New Cases per Million', fontsize=14)
ax.grid(True)
ax.legend(fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# ## 4. Vaccination Analysis

# In[22]:


# Explore vaccination data
# First, check how many locations report vaccination figures. The `location`
# column holds aggregates such as 'World' alongside individual countries, so
# the count is labelled as locations rather than countries.
vaccination_data = df.dropna(subset=['total_vaccinations'])
print(f"Number of locations with vaccination data: {vaccination_data['location'].nunique()}")

# Get the earliest date on which any location reports vaccinations
earliest_vax_date = vaccination_data['date'].min()
print(f"Earliest vaccination date in the dataset: {earliest_vax_date}")


# In[23]:


# Keep the locations of interest that have at least one vaccination figure
vax_countries = [
    name
    for name in countries_of_interest
    if country_data_dict[name]['total_vaccinations'].notna().any()
]

print(f"Countries with vaccination data: {vax_countries}")


# In[24]:


# Cumulative vaccine doses over time for every location with vaccination data
fig, ax = plt.subplots(figsize=(14, 8))

for name in vax_countries:
    # Plot only the dates on which a vaccination total was reported
    reported = country_data_dict[name].dropna(subset=['total_vaccinations'])
    if reported.empty:
        continue
    ax.plot(reported['date'], reported['total_vaccinations'], label=name, linewidth=2)

ax.set_title('Total COVID-19 Vaccinations by Country', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Total Vaccinations', fontsize=14)
ax.grid(True)
ax.legend(fontsize=12)
plt.xticks(rotation=45)
ax.ticklabel_format(style='plain', axis='y')
plt.tight_layout()
plt.show()


# In[25]:


# Vaccination rate: people fully vaccinated per 100 inhabitants, by country
fig, ax = plt.subplots(figsize=(14, 8))

for name in vax_countries:
    if name == 'World':  # the aggregate is not plotted
        continue
    # Plot only the dates on which the rate was reported
    reported = country_data_dict[name].dropna(subset=['people_fully_vaccinated_per_hundred'])
    if reported.empty:
        continue
    ax.plot(
        reported['date'],
        reported['people_fully_vaccinated_per_hundred'],
        label=name,
        linewidth=2,
    )

ax.set_title('People Fully Vaccinated per 100 Population by Country', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('People Fully Vaccinated per 100', fontsize=14)
ax.grid(True)
ax.legend(fontsize=12)
plt.xticks(rotation=45)
ax.set_ylim(0, 100)  # fixed 0-100 (percent) range
plt.tight_layout()
plt.show()


# ## 5. Analysis of Latest Data

# In[26]:


# Latest snapshot per location.
# NOTE(review): GroupBy.last() takes the last non-null value of each column, so
# a snapshot row may combine values reported on different dates - confirm this
# is the intended meaning of "latest".
rows_by_date = df.sort_values('date')
latest_data = rows_by_date.groupby('location').last().reset_index()

# Keep countries only: rows whose iso_code contains 'OWID_' (continents, world,
# income groups, etc.) are dropped; rows with a missing iso_code are kept.
is_aggregate = latest_data['iso_code'].str.contains('OWID_', na=False)
is_country = ~is_aggregate
latest_countries = latest_data.loc[is_country].copy()

# Ten countries with the highest cumulative case counts
ranked_by_cases = latest_countries.sort_values('total_cases', ascending=False)
top_cases = ranked_by_cases.head(10)

# Horizontal bar chart of the top 10
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=top_cases, x='total_cases', y='location', palette='viridis', ax=ax)
ax.set_title('Top 10 Countries by Total COVID-19 Cases', fontsize=16)
ax.set_xlabel('Total Cases', fontsize=14)
ax.set_ylabel('Country', fontsize=14)
ax.ticklabel_format(style='plain', axis='x')
ax.grid(True, axis='x')
plt.tight_layout()
plt.show()


# In[27]:


# Ten countries with the highest cumulative death counts
ranked_by_deaths = latest_countries.sort_values('total_deaths', ascending=False)
top_deaths = ranked_by_deaths.head(10)

# Horizontal bar chart of the top 10
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=top_deaths, x='total_deaths', y='location', palette='rocket', ax=ax)
ax.set_title('Top 10 Countries by Total COVID-19 Deaths', fontsize=16)
ax.set_xlabel('Total Deaths', fontsize=14)
ax.set_ylabel('Country', fontsize=14)
ax.ticklabel_format(style='plain', axis='x')
ax.grid(True, axis='x')
plt.tight_layout()
plt.show()


# In[28]:


# Ten countries with the most cases per million inhabitants
# (countries without a per-million figure are left out before ranking)
top_cases_per_million = (
    latest_countries
    .dropna(subset=['total_cases_per_million'])
    .sort_values('total_cases_per_million', ascending=False)
    .head(10)
)

# Horizontal bar chart of the top 10
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=top_cases_per_million, x='total_cases_per_million', y='location',
            palette='viridis', ax=ax)
ax.set_title('Top 10 Countries by COVID-19 Cases per Million Population', fontsize=16)
ax.set_xlabel('Cases per Million', fontsize=14)
ax.set_ylabel('Country', fontsize=14)
ax.grid(True, axis='x')
plt.tight_layout()
plt.show()


# In[29]:


# Ten countries with the most deaths per million inhabitants
# (countries without a per-million figure are left out before ranking)
top_deaths_per_million = (
    latest_countries
    .dropna(subset=['total_deaths_per_million'])
    .sort_values('total_deaths_per_million', ascending=False)
    .head(10)
)

# Horizontal bar chart of the top 10
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=top_deaths_per_million, x='total_deaths_per_million', y='location',
            palette='rocket', ax=ax)
ax.set_title('Top 10 Countries by COVID-19 Deaths per Million Population', fontsize=16)
ax.set_xlabel('Deaths per Million', fontsize=14)
ax.set_ylabel('Country', fontsize=14)
ax.grid(True, axis='x')
plt.tight_layout()
plt.show()


# ## 6. Creating a Choropleth Map for Global COVID-19 Cases

# In[30]:


# Data behind the maps: one row per country, keeping only rows that have both
# an ISO code and a total case count (map_data is reused by the next cell)
map_columns = ['iso_code', 'location', 'total_cases', 'total_cases_per_million']
map_data = latest_countries[map_columns].copy()
map_data = map_data.dropna(subset=['total_cases', 'iso_code'])

# World map coloured by cumulative cases
fig = px.choropleth(
    data_frame=map_data,
    locations="iso_code",
    color="total_cases",
    hover_name="location",
    color_continuous_scale=px.colors.sequential.Plasma,
    title="Global COVID-19 Total Cases by Country",
    labels={'total_cases': 'Total COVID-19 Cases'},
)

# Trim the margins (leaving room for the title) and label the colour bar
fig.update_layout(
    autosize=True,
    margin={'l': 0, 'r': 0, 't': 50, 'b': 0},
    coloraxis_colorbar={'title': "Total Cases"},
)

fig.show()


# In[31]:


# World map coloured by cases per million (better for comparing countries);
# countries without a per-million figure are left off the map
per_million_map_data = map_data.dropna(subset=['total_cases_per_million'])

fig = px.choropleth(
    data_frame=per_million_map_data,
    locations="iso_code",
    color="total_cases_per_million",
    hover_name="location",
    color_continuous_scale=px.colors.sequential.Viridis,
    title="Global COVID-19 Cases per Million Population by Country",
    labels={'total_cases_per_million': 'Cases per Million'},
)

# Trim the margins (leaving room for the title) and label the colour bar
fig.update_layout(
    autosize=True,
    margin={'l': 0, 'r': 0, 't': 50, 'b': 0},
    coloraxis_colorbar={'title': "Cases per Million"},
)

fig.show()


# ## 7. Vaccination Progress Choropleth Map

# In[32]:


# Data behind the vaccination map: countries that have both an ISO code and a
# fully-vaccinated-per-hundred figure
vax_map_columns = ['iso_code', 'location', 'people_fully_vaccinated_per_hundred']
vax_map_data = latest_countries[vax_map_columns].copy()
vax_map_data = vax_map_data.dropna(
    subset=['people_fully_vaccinated_per_hundred', 'iso_code']
)

# World map coloured by share of the population fully vaccinated
fig = px.choropleth(
    data_frame=vax_map_data,
    locations="iso_code",
    color="people_fully_vaccinated_per_hundred",
    hover_name="location",
    color_continuous_scale=px.colors.sequential.Greens,
    title="Global COVID-19 Vaccination Progress (% Fully Vaccinated)",
    labels={'people_fully_vaccinated_per_hundred': '% Fully Vaccinated'},
)

# Trim the margins (leaving room for the title) and label the colour bar
fig.update_layout(
    autosize=True,
    margin={'l': 0, 'r': 0, 't': 50, 'b': 0},
    coloraxis_colorbar={'title': "% Fully Vaccinated"},
)

fig.show()


# ## 8. Correlation Analysis

# In[33]:


# COVID-19 outcome metrics plus the country characteristics to compare them with
correlation_columns = [
    'total_cases_per_million',
    'total_deaths_per_million',
    'population_density',
    'median_age',
    'gdp_per_capita',
    'cardiovasc_death_rate',
    'diabetes_prevalence',
    'life_expectancy',
    'human_development_index',
]

# Use only countries with a value in every selected column
correlation_data = latest_countries[correlation_columns].dropna()

# Pairwise correlation between the selected columns
correlation_matrix = correlation_data.corr()

# Annotated heatmap on a fixed -1..1 colour scale centred on zero
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Between COVID-19 Metrics and Country Characteristics', fontsize=16)
plt.tight_layout()
plt.show()


# ## 9. Key Insights and Conclusions

# Based on our analysis of the COVID-19 global data, here are the key insights:
# 
# 1. **Global Impact**: The COVID-19 pandemic has spread globally, with significant variations in impact across different countries. Total cases and deaths continue to increase, though at varying rates across regions.
# 
# 2. **Regional Variations**: When normalizing for population size (cases/deaths per million), we observe that the pandemic's impact is not strictly proportional to country size. Some smaller countries have been hit harder per capita.
# 
# 3. **Vaccination Progress**: Vaccination rollouts have varied significantly across countries, with some nations achieving high vaccination rates while others lag behind. There is a noticeable disparity between high-income and low-income countries.
# 
# 4. **Death Rates**: The case fatality rate (deaths as a percentage of confirmed cases) has generally declined over time, likely due to improved treatments, protocols, and possibly changes in virus variants.
# 
# 5. **Correlation with Demographics**: Our analysis shows correlations between COVID-19 severity and factors such as median age, healthcare access (approximated by HDI), and prevalence of underlying conditions like cardiovascular disease and diabetes.
# 
# 6. **Waves of Infection**: The pandemic has occurred in distinct waves in most countries, though the timing and intensity of these waves have varied. This suggests complex interactions between virus variants, public health measures, and seasonal factors.
# 
# 7. **Data Limitations**: It's important to note that testing rates and reporting standards vary widely between countries, which affects case counts. Death counts are generally considered more reliable but still subject to variation in reporting methodologies.

# ## 10. Recommendations
# 
# Based on the insights from our analysis, we recommend:
# 
# 1. **Continued Vaccination Efforts**: Focus on increasing vaccination coverage, particularly in regions with low vaccination rates.
# 
# 2. **Targeted Interventions**: Implement public health strategies that account for country-specific factors identified in the correlation analysis.
# 
# 3. **Enhanced Data Collection**: Improve standardization of data collection and reporting across countries for better comparability.
# 
# 4. **Focus on Vulnerable Populations**: Direct resources to protect populations with higher risk factors, such as elderly populations or those with prevalent underlying conditions.
# 
# 5. **Monitoring and Preparedness**: Maintain vigilance for new variants and be prepared for potential future waves of infection.

# ## 11. Future Work
# 
# To expand this analysis, future work could include:
# 
# 1. **Variant Analysis**: Incorporate data on virus variants and their correlation with case/death rates.
# 
# 2. **Policy Impact Assessment**: Analyze the effectiveness of different public health interventions across countries.
# 
# 3. **Long-term Effects**: Track and analyze long COVID and other long-term health impacts of the pandemic.
# 
# 4. **Economic Impact Correlation**: Correlate pandemic metrics with economic indicators to understand broader societal impacts.
# 
# 5. **Healthcare System Resilience**: Analyze how healthcare system capacity influenced pandemic outcomes.