In [20]:
import pandas as pd
import numpy as np

# Define the file path for the uploaded CSV
file_path = 'covid_project_dataset.csv'

# --- Section 1: Data Collection and Cleaning (adapted for the new dataset) ---
try:
    df = pd.read_csv(file_path)
    print(f"Dataset '{file_path}' loaded successfully.")
    print(f"Initial shape: {df.shape}")
except FileNotFoundError:
    print(f"Error: '{file_path}' not found. Please ensure the CSV file is in the correct directory.")
    # In a real notebook, you might stop execution or provide an example DataFrame
    df = pd.DataFrame() # Create an empty DataFrame to prevent errors later

if not df.empty:
    # Display basic info and head for inspection
    print("\nInitial DataFrame Info:")
    df.info()
    print("\nInitial DataFrame Head:")
    print(df.head())

    # Standardize column names (adjust these based on actual column names in your CSV)
    # The 'covid_project_dataset.csv' appears to have 'date', 'country', 'total_cases', 'total_deaths', 'total_recoveries'
    column_mapping = {
        'date': 'Date',
        'country': 'Country/Region',
        'total_cases': 'Confirmed',
        'total_deaths': 'Deaths',
        'total_recoveries': 'Recovered'
    }

    # Apply renaming only if the column exists
    renamed_columns = {}
    for old_name, new_name in column_mapping.items():
        if old_name in df.columns and old_name != new_name:
            renamed_columns[old_name] = new_name
    df.rename(columns=renamed_columns, inplace=True)
    if renamed_columns:
        print(f"\nColumns renamed: {renamed_columns}")

    # Ensure 'Date' column is datetime
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df.dropna(subset=['Date'], inplace=True) # Drop rows where Date conversion failed
        print("Converted 'Date' column to datetime.")
    else:
        print("Warning: 'Date' column not found. Date-based analysis might be limited.")


    # Handle missing numerical values (Confirmed, Deaths, Recovered) by filling with 0.
    # It's crucial for cumulative counts.
    for col in ['Confirmed', 'Deaths', 'Recovered']:
        if col not in df.columns:
            df[col] = 0.0 # Add column and fill with 0 if missing
            print(f"'{col}' column was missing and added with 0.0 values.")
        else:
            # Convert to numeric first, then fill NaNs
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
            print(f"Missing values in '{col}' handled (filled with 0).")

    # Drop duplicates based on 'Date' and 'Country/Region' to ensure unique daily records
    initial_rows_count = df.shape[0]
    if 'Date' in df.columns and 'Country/Region' in df.columns:
        df.drop_duplicates(subset=['Date', 'Country/Region'], inplace=True)
        rows_after_duplicates = df.shape[0]
        if initial_rows_count > rows_after_duplicates:
            print(f"Removed {initial_rows_count - rows_after_duplicates} duplicate rows based on 'Date' and 'Country/Region'.")
        else:
            print("No duplicate rows found based on 'Date' and 'Country/Region'.")
    else:
        print("Warning: Skipping duplicate removal by Date and Country/Region as one or both columns are missing.")
        df.drop_duplicates(inplace=True) # Fallback to general duplicate removal
        if initial_rows_count > df.shape[0]:
            print(f"Removed {initial_rows_count - df.shape[0]} duplicate rows (general).")

    # Keep only the essential columns needed for Section 2 processing
    # Add any other columns you might want for later analysis (e.g., 'population')
    final_cols_to_keep = ['Date', 'Country/Region', 'Confirmed', 'Deaths', 'Recovered']
    # Filter for columns that actually exist in the DataFrame after renaming and adding
    df = df[[col for col in final_cols_to_keep if col in df.columns]]

    print(f"\nDataFrame shape after initial cleaning: {df.shape}")
    print("\nCleaned DataFrame Head (pre-Section 2 processing):")
    print(df.head())

    # --- Section 2: Data Processing and Analysis ---
    print("\n--- Starting Section 2: Data Processing and Analysis ---")

    # Ensure necessary columns exist for the core analysis steps in Section 2
    required_cols_for_analysis = ['Confirmed', 'Deaths', 'Country/Region', 'Date']
    if not all(col in df.columns for col in required_cols_for_analysis):
        missing_cols = [col for col in required_cols_for_analysis if col not in df.columns]
        print(f"Error: Missing one or more required columns for core analysis ({missing_cols}). Cannot proceed with Section 2 fully.")
        # Attempt to proceed with available columns, but warn the user.
    else:
        # Sort data for correct daily differences
        df = df.sort_values(by=['Country/Region', 'Date'])

        # 1. Use NumPy/Pandas to perform statistical analysis
        print("\n--- Statistical Analysis (NumPy/Pandas) ---")

        # Overall Death Rate (using total sums)
        total_confirmed_overall = df['Confirmed'].sum()
        total_deaths_overall = df['Deaths'].sum()

        if total_confirmed_overall > 0:
            overall_death_rate = (total_deaths_overall / total_confirmed_overall) * 100
            print(f"Overall Death Rate (across all data points): {overall_death_rate:.2f}%")
        else:
            print("No confirmed cases to calculate overall death rate.")

        # Mean, Median, Standard Deviation of Confirmed Cases (for non-zero cases to be meaningful)
        df_cases_nonzero = df[df['Confirmed'] > 0]
        if not df_cases_nonzero.empty:
            print(f"Mean Confirmed Cases (non-zero): {np.mean(df_cases_nonzero['Confirmed']):.2f}")
            print(f"Median Confirmed Cases (non-zero): {np.median(df_cases_nonzero['Confirmed']):.2f}")
            print(f"Std Dev Confirmed Cases (non-zero): {np.std(df_cases_nonzero['Confirmed']):.2f}")
        else:
            print("No non-zero confirmed cases found for statistical analysis.")

        # Correlation between Confirmed and Deaths (using non-zero cases)
        if 'Confirmed' in df_cases_nonzero.columns and 'Deaths' in df_cases_nonzero.columns:
            correlation = df_cases_nonzero['Confirmed'].corr(df_cases_nonzero['Deaths'])
            print(f"Correlation between Confirmed and Deaths (non-zero cases): {correlation:.4f}")
        else:
            print("Not enough non-zero confirmed/deaths data to calculate correlation.")


        # 2. Group data by country and calculate daily/weekly growth rates.
        print("\n--- Growth Rate Calculation ---")

        # Calculate daily new cases and deaths (using .diff() as the safest general method)
        df['Daily_New_Confirmed'] = df.groupby('Country/Region')['Confirmed'].diff().fillna(0)
        df['Daily_New_Deaths'] = df.groupby('Country/Region')['Deaths'].diff().fillna(0)
        df['Daily_New_Recovered'] = df.groupby('Country/Region')['Recovered'].diff().fillna(0)

        # Ensure Daily_New values are non-negative, as diff() can produce negative if data corrections occur
        df['Daily_New_Confirmed'] = df['Daily_New_Confirmed'].clip(lower=0)
        df['Daily_New_Deaths'] = df['Daily_New_Deaths'].clip(lower=0)
        df['Daily_New_Recovered'] = df['Daily_New_Recovered'].clip(lower=0)

        # Growth Rate of Confirmed Cases (New cases / Previous day's total cases) * 100
        # Need to be careful with previous day's total cases being zero.
        df['Previous_Confirmed'] = df.groupby('Country/Region')['Confirmed'].shift(1).fillna(0)
        df['Growth_Rate_Confirmed'] = df.apply(lambda row:
                                               (row['Daily_New_Confirmed'] / row['Previous_Confirmed']) * 100
                                               if row['Previous_Confirmed'] > 0 else 0, axis=1)
        # Handle inf/-inf results from division by zero or large swings by converting to NaN then 0
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df['Growth_Rate_Confirmed'].fillna(0, inplace=True)


        print("\nDaily new cases and growth rate calculated (head for example):")
        print(df[['Date', 'Country/Region', 'Confirmed', 'Daily_New_Confirmed', 'Growth_Rate_Confirmed']].head())

        # 3. Perform data aggregation using groupby() and pivot_table() methods in Pandas.
        print("\n--- Data Aggregation (groupby/pivot_table) ---")

        # Total cases per country (using the maximum recorded values for each country)
        # This gives the latest cumulative figures
        country_summary = df.groupby('Country/Region').agg(
            Latest_Confirmed=('Confirmed', 'max'),
            Latest_Deaths=('Deaths', 'max'),
            Latest_Recovered=('Recovered', 'max')
        ).reset_index()

        # Clean up any potential inf/NaN from aggregations
        country_summary.replace([np.inf, -np.inf], np.nan, inplace=True)
        country_summary.fillna(0, inplace=True)

        print("\nTotal cases per country (latest data, head):")
        print(country_summary.head())

        # Pivot table for confirmed cases over time by country
        # Ensure only one entry per country per date for accurate pivoting.
        df_for_pivot = df.drop_duplicates(subset=['Date', 'Country/Region'], keep='last')
        # Select a subset of countries for the pivot table preview if there are too many
        unique_countries = df_for_pivot['Country/Region'].unique()
        # Take first 5 for preview, or fewer if less than 5 unique
        countries_for_pivot_preview = unique_countries[:min(5, len(unique_countries))]

        # Filter the DataFrame to only include these countries for a manageable pivot table preview
        filtered_df_for_pivot = df_for_pivot[df_for_pivot['Country/Region'].isin(countries_for_pivot_preview)]

        # Check if there's enough data to pivot
        if not filtered_df_for_pivot.empty:
            confirmed_pivot = filtered_df_for_pivot.pivot_table(index='Date', columns='Country/Region', values='Confirmed').fillna(0)
            print("\nPivot table of Confirmed Cases by Date and (Sample) Country (first 5 rows, first 5 countries):")
            print(confirmed_pivot.iloc[:5, :5]) # Print first 5 rows and first 5 columns for brevity
        else:
            print("\nNot enough data to create a meaningful pivot table preview for selected countries.")

else:
    print("DataFrame is empty after initial loading/cleaning. Cannot perform data processing and analysis.")

Dataset 'covid_project_dataset.csv' loaded successfully.
Initial shape: (411804, 10)

Initial DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 411804 entries, 0 to 411803
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   date              411804 non-null  object 
 1   country           411804 non-null  object 
 2   total_cases       411804 non-null  float64
 3   new_cases         411804 non-null  float64
 4   total_deaths      411804 non-null  float64
 5   new_deaths        411804 non-null  float64
 6   total_recoveries  411804 non-null  int64  
 7   new_recoveries    411804 non-null  int64  
 8   active_cases      411804 non-null  float64
 9   population        411804 non-null  int64  
dtypes: float64(5), int64(3), object(2)
memory usage: 31.4+ MB

Initial DataFrame Head:
         date      country  total_cases  new_cases  total_deaths  new_deaths  \
0  2020-01-05  Afghanistan         

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Growth_Rate_Confirmed'].fillna(0, inplace=True)



Daily new cases and growth rate calculated (head for example):
        Date Country/Region  Confirmed  Daily_New_Confirmed  \
0 2020-01-05    Afghanistan        0.0                  0.0   
1 2020-01-06    Afghanistan        0.0                  0.0   
2 2020-01-07    Afghanistan        0.0                  0.0   
3 2020-01-08    Afghanistan        0.0                  0.0   
4 2020-01-09    Afghanistan        0.0                  0.0   

   Growth_Rate_Confirmed  
0                    0.0  
1                    0.0  
2                    0.0  
3                    0.0  
4                    0.0  

--- Data Aggregation (groupby/pivot_table) ---

Total cases per country (latest data, head):
   Country/Region  Latest_Confirmed  Latest_Deaths  Latest_Recovered
0     Afghanistan          235214.0         7998.0                 0
1          Africa        13145380.0       259117.0                 0
2         Albania          335047.0         3605.0                 0
3         Algeria        