In [2]:
import pandas as pd
import numpy as np

# Define the file path for the uploaded CSV
file_path = 'covid_project_dataset.csv'

# --- Initial Data Loading and Preparation (necessary for Section 5 to run independently) ---
try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print(f"Initial shape: {df.shape}")
except FileNotFoundError:
    print(f"Error: '{file_path}' not found. Please ensure the CSV file is in the correct directory.")
    exit() # Exit if file not found

# Rename columns for consistency with project outline
# Based on inspection of 'covid_project_dataset.csv', column names are:
# 'date', 'country', 'total_cases', 'total_deaths', 'total_recoveries', 'population'
column_mapping = {
    'country': 'Country/Region',
    'date': 'Date',
    'total_cases': 'Confirmed',
    'total_deaths': 'Deaths',
    'total_recoveries': 'Recovered',
    'population': 'Population'
}

# Apply renaming only if the column exists in the DataFrame
df.rename(columns={k: v for k, v in column_mapping.items() if k in df.columns}, inplace=True)

# Ensure 'Date' column is datetime
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df.dropna(subset=['Date'], inplace=True) # Drop rows where Date conversion failed
    print("Converted 'Date' column to datetime.")
else:
    print("Warning: 'Date' column not found or could not be converted. Critical for time-series analysis.")

# Handle missing numerical values for key columns by filling with 0
for col in ['Confirmed', 'Deaths', 'Recovered', 'Population']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    else:
        # If a required column is truly missing, create it and fill with 0
        if col in ['Confirmed', 'Deaths', 'Recovered', 'Population']:
            df[col] = 0.0 # Use float to be consistent with other numeric data
            print(f"Note: Column '{col}' not found after renaming, created and filled with 0.0.")


# Drop duplicates (considering Date and Country/Region as primary keys)
if 'Date' in df.columns and 'Country/Region' in df.columns:
    initial_rows = df.shape[0]
    df.drop_duplicates(subset=['Date', 'Country/Region'], inplace=True)
    rows_after_duplicates = df.shape[0]
    if initial_rows > rows_after_duplicates:
        print(f"Removed {initial_rows - rows_after_duplicates} duplicate rows based on Date and Country/Region.")
    else:
        print("No duplicate rows found based on 'Date' and 'Country/Region'.")
else:
    print("Cannot drop duplicates based on 'Date' and 'Country/Region' as one or both are missing after renaming.")


# Filter DataFrame to include only necessary columns for Section 5 analysis
final_cols_for_section5 = ['Date', 'Country/Region', 'Confirmed', 'Deaths', 'Recovered', 'Population']
df = df[[col for col in final_cols_for_section5 if col in df.columns]]

print(f"\nDataFrame shape after initial preparation for Section 5: {df.shape}")
print("\nPrepared DataFrame Head (for Section 5):")
print(df.head())

# --- Section 5: Performance Analysis and Summary ---
print("\n--- Starting Section 5: Performance Analysis and Summary ---")

# Check if essential columns are present after all preparation steps
if not df.empty and all(col in df.columns for col in ['Country/Region', 'Confirmed', 'Recovered']):
    # Ensure data is sorted by Country/Region and Date for correct 'latest' retrieval
    df = df.sort_values(by=['Country/Region', 'Date'])

    # Get the latest data for each country based on the maximum date
    df_latest_country = df.groupby('Country/Region').last().reset_index()

    # Calculate Recovery Rate (handle potential division by zero or NaN results)
    # Recovery Rate = (Recovered / Confirmed) * 100
    df_latest_country['Recovery_Rate'] = (df_latest_country['Recovered'] / df_latest_country['Confirmed']) * 100

    # Replace infinite values (from division by zero if Confirmed was 0) and NaNs with 0
    df_latest_country.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_latest_country['Recovery_Rate'].fillna(0, inplace=True)

    # Sort by recovery rate to find highest and lowest
    df_latest_country_sorted = df_latest_country.sort_values(by='Recovery_Rate', ascending=False)

    # Analyze which countries have the highest recovery rates.
    print("\n--- Countries with Highest Recovery Rates (Top 10) ---")
    # Filter for countries with actual confirmed cases to make rates meaningful
    top_10_highest_recovery = df_latest_country_sorted[df_latest_country_sorted['Confirmed'] > 0].head(10)
    print(top_10_highest_recovery[['Country/Region', 'Confirmed', 'Recovered', 'Recovery_Rate']])

    # Analyze which countries have the lowest recovery rates.
    print("\n--- Countries with Lowest Recovery Rates (Bottom 10, excluding 0 confirmed cases and 100% recovery) ---")
    bottom_10_lowest_recovery = df_latest_country_sorted[
        (df_latest_country_sorted['Confirmed'] > 0) &
        (df_latest_country_sorted['Recovery_Rate'] < 100) # Exclude 100% recovery for a more granular 'lowest'
    ].tail(10)
    print(bottom_10_lowest_recovery[['Country/Region', 'Confirmed', 'Recovered', 'Recovery_Rate']])

    # --- Project Summary Placeholder ---
    print("\n--- Project Summary ---")
    print("Based on the comprehensive data analysis of the COVID-19 dataset:")
    print("- The overall trends for confirmed cases, deaths, and recoveries have been visualized over time.")
    print("- The daily new cases and growth rates provided insights into the pandemic's progression.")
    print("- The statistical analysis revealed key metrics such as overall death rates and correlations.")
    print(f"- Countries with the highest recovery rates indicate effective management or demographic factors.")
    print(f"- Countries with lower recovery rates may warrant further investigation into healthcare capacity or other contributing factors.")
    print("Further work could involve deeper time-series analysis, forecasting, and more detailed demographic correlations.")

else:
    print("DataFrame is empty or missing essential columns ('Country/Region', 'Confirmed', 'Recovered') after preparation. Cannot perform Section 5 analysis.")

Dataset loaded successfully.
Initial shape: (411804, 10)
Converted 'Date' column to datetime.
No duplicate rows found based on 'Date' and 'Country/Region'.

DataFrame shape after initial preparation for Section 5: (411804, 6)

Prepared DataFrame Head (for Section 5):
        Date Country/Region  Confirmed  Deaths  Recovered  Population
0 2020-01-05    Afghanistan        0.0     0.0          0    41128772
1 2020-01-06    Afghanistan        0.0     0.0          0    41128772
2 2020-01-07    Afghanistan        0.0     0.0          0    41128772
3 2020-01-08    Afghanistan        0.0     0.0          0    41128772
4 2020-01-09    Afghanistan        0.0     0.0          0    41128772

--- Starting Section 5: Performance Analysis and Summary ---

--- Countries with Highest Recovery Rates (Top 10) ---
               Country/Region    Confirmed  Recovered  Recovery_Rate
0                 Afghanistan     235214.0          0            0.0
154             New Caledonia      80163.0          0   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_latest_country['Recovery_Rate'].fillna(0, inplace=True)
