In [None]:
# %% [markdown]
"""
# COVID-19 Dataset Analysis

This notebook analyzes COVID-19 case data by region/country up to April 29, 2020, including:
- Confirmed cases
- Deaths
- Recovered cases
"""

# %%
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Configure visualization settings
# Select matplotlib's 'ggplot' style sheet and seaborn's 'husl' palette for the figures drawn below.
plt.style.use('ggplot')
sns.set_palette('husl')
# NOTE: the next line is an IPython magic, not plain Python; it is only valid
# when this file is executed as a Jupyter/IPython notebook.
%matplotlib inline

# %% [markdown]
"""
## Data Loading and Initial Inspection
"""
# %%
def load_covid_data(filepath):
    """Read the COVID-19 CSV at *filepath* and verify its schema.

    The file must provide the 'Region', 'Confirmed', 'Deaths' and 'Recovered'
    columns. Problems are reported on stdout rather than raised.

    Args:
        filepath: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when the file is missing, cannot be
        read, or lacks one of the required columns.
    """
    expected = ('Region', 'Confirmed', 'Deaths', 'Recovered')

    try:
        frame = pd.read_csv(filepath)
        print(f"Data loaded successfully with {frame.shape[0]} records")

        # Schema check: collect every required column the file does not have.
        absent = [name for name in expected if name not in frame.columns]
        if absent:
            # Raised inside the try on purpose: the generic handler below
            # reports it and the function returns None.
            raise ValueError(f"Missing required columns: {absent}")
    except FileNotFoundError:
        print("Error: File not found. Please check the file path.")
        return None
    except Exception as exc:
        print(f"An error occurred while loading the data: {str(exc)}")
        return None

    return frame

# Load the dataset from the working directory. load_covid_data returns None
# (after printing the reason) when loading fails, so covid_df may be None here.
covid_df = load_covid_data("Covid_19_Dataset.csv")

# %% [markdown]
"""
## Data Quality Assessment
"""
# %%
def assess_data_quality(df):
    """Report completeness of *df* and draw a heatmap of its missing cells.

    Prints the non-null count and the null count of every column, then shows
    a seaborn heatmap of the null mask.

    Args:
        df: DataFrame to inspect.

    Returns:
        Series with the number of null values in each column.
    """
    print("\n=== Data Completeness ===")
    print("Record counts per column:")
    non_null_counts = df.count()
    print(non_null_counts)

    print("\n=== Missing Values ===")
    # Build the boolean null mask once; it feeds both the totals and the plot.
    null_mask = df.isnull()
    null_totals = null_mask.sum()
    print(null_totals)

    # Heatmap of the null mask: one row per record, one column per field.
    plt.figure(figsize=(10, 6))
    sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='viridis')
    plt.title('Missing Values Heatmap')
    plt.show()

    return null_totals

# Print the completeness report, show the heatmap, and keep the per-column null counts.
missing_values = assess_data_quality(covid_df)

# %% [markdown]
"""
## Data Analysis Questions
"""
# %%
def analyze_covid_data(df):
    """Answer the notebook's regional COVID-19 questions and return the filtered data.

    Steps, in order:
      1. Show the 20 regions with the most confirmed cases (all records).
      2. Drop records with fewer than 10 confirmed cases.
      3. Report the region with the most confirmed cases.
      4. Report the region with the fewest deaths.
      5. Report totals for India, US and Yemen.
      6. Show the 20 records with the fewest confirmed cases and the 20
         records with the most recovered cases.

    Steps 3-6 operate on the filtered records from step 2.

    Args:
        df: DataFrame with 'Region', 'Confirmed', 'Deaths' and 'Recovered'
            columns. It is not modified.

    Returns:
        The rows of ``df`` whose 'Confirmed' value is at least 10.
    """
    # `display` is only available inside IPython/Jupyter; fall back to print
    # so the analysis also runs as a plain script.
    try:
        show = display
    except NameError:
        show = print

    metrics = ['Confirmed', 'Deaths', 'Recovered']

    # Q1: Cases by region (computed on the unfiltered data)
    print("\n=== Confirmed, Deaths and Recovered Cases by Region ===")
    regional_cases = df.groupby('Region')[metrics].sum()
    show(regional_cases.sort_values('Confirmed', ascending=False).head(20))

    # Q2: Filter low case counts
    print("\n=== Removing records with <10 confirmed cases ===")
    initial_count = df.shape[0]
    df = df[df['Confirmed'] >= 10]
    print(f"Removed {initial_count - df.shape[0]} records ({initial_count} → {df.shape[0]})")

    # Aggregate the filtered data once; Q3, Q4 and Q5 all read from it.
    totals = df.groupby('Region')[metrics].sum()

    if totals.empty:
        # idxmax/idxmin raise ValueError on an empty Series, so report instead.
        print("\nNo regions remain after filtering; skipping maximum/minimum summary.")
    else:
        # Q3: Region with max confirmed cases
        confirmed_by_region = totals['Confirmed']
        max_confirmed = confirmed_by_region.idxmax()
        max_value = confirmed_by_region.max()
        print(f"\nRegion with maximum confirmed cases: {max_confirmed} ({max_value:,} cases)")

        # Q4: Region with min deaths
        deaths_by_region = totals['Deaths']
        min_deaths = deaths_by_region.idxmin()
        min_value = deaths_by_region.min()
        print(f"\nRegion with minimum deaths: {min_deaths} ({min_value} deaths)")

    # Q5: Cases for specific countries
    countries = ['India', 'US', 'Yemen']
    print("\n=== Cases for Selected Countries ===")
    for country in countries:
        if country in totals.index:
            confirmed = totals.at[country, 'Confirmed']
            deaths = totals.at[country, 'Deaths']
            recovered = totals.at[country, 'Recovered']
            print(f"{country}: Confirmed={confirmed:,}, Deaths={deaths:,}, Recovered={recovered:,}")
        else:
            print(f"No data available for {country}")

    # Q6A: Sort by confirmed cases (ascending)
    print("\n=== Top 20 Regions by Confirmed Cases (Ascending) ===")
    show(df.sort_values('Confirmed').head(20))

    # Q6B: Sort by recovered cases (descending)
    print("\n=== Top 20 Regions by Recovered Cases (Descending) ===")
    show(df.sort_values('Recovered', ascending=False).head(20))

    return df

# Run the analysis and rebind covid_df to its return value: from here on the
# frame only holds records with at least 10 confirmed cases.
covid_df = analyze_covid_data(covid_df)

# %% [markdown]
"""
## Additional Visualizations
"""
# %%
def create_covid_visualizations(df, *, top_n=10, min_cases=1000):
    """Plot the regions with the most cases and the highest case fatality ratio.

    Two figures are shown:
      * a horizontal bar chart of the ``top_n`` regions by total confirmed cases;
      * a bar chart of the ``top_n`` regions by case fatality ratio
        (deaths / confirmed * 100), restricted to regions with more than
        ``min_cases`` confirmed cases.

    A chart is skipped, with a printed note, when there is nothing to plot.

    Args:
        df: DataFrame with 'Region', 'Confirmed' and 'Deaths' columns.
        top_n: Number of regions shown in each chart (keyword-only).
        min_cases: Regions need strictly more than this many confirmed cases
            to appear in the fatality-ratio chart (keyword-only).

    Returns:
        None. The figures are rendered with ``plt.show()``.
    """
    # Top regions by confirmed cases
    top_regions = df.groupby('Region')['Confirmed'].sum().nlargest(top_n)
    if top_regions.empty:
        print("No regional data available for the confirmed cases chart.")
    else:
        plt.figure(figsize=(12, 6))
        # Sort ascending so the largest bar ends up at the top of the barh chart.
        top_regions.sort_values().plot(kind='barh', color='darkred')
        plt.title(f'Top {top_n} Regions by Confirmed COVID-19 Cases')
        plt.xlabel('Confirmed Cases')
        plt.ylabel('Region')
        plt.show()

    # Case fatality ratio by region (for regions with > min_cases cases)
    regional_stats = df.groupby('Region')[['Confirmed', 'Deaths']].sum()
    # Take an explicit copy of the filtered rows: adding a column to a
    # boolean-filtered frame is what triggers pandas' SettingWithCopyWarning.
    regional_stats = regional_stats.loc[regional_stats['Confirmed'] > min_cases].copy()
    if regional_stats.empty:
        print(f"No region has more than {min_cases} confirmed cases; skipping the CFR chart.")
        return

    regional_stats['CFR'] = (regional_stats['Deaths'] / regional_stats['Confirmed']) * 100
    regional_stats = regional_stats.sort_values('CFR', ascending=False)

    plt.figure(figsize=(12, 6))
    regional_stats['CFR'].head(top_n).plot(kind='bar', color='darkblue')
    plt.title(f'Top {top_n} Regions by Case Fatality Ratio (CFR) %')
    plt.ylabel('CFR (%)')
    plt.xticks(rotation=45)
    plt.show()

# Plot from the frame returned by analyze_covid_data (records with >= 10 confirmed cases).
create_covid_visualizations(covid_df)

# %% [markdown]
"""
## Key Findings Summary
1. **Data Quality**:
   - Dataset contains [X] records
   - Missing values found in [Y] columns

2. **Regional Analysis**:
   - Highest case burden: [Region1] with [N] cases
   - Lowest deaths reported in: [Region2]
   - India's status: [Confirmed], [Deaths], [Recovered]

3. **Patterns**:
   - Recovery rates vary significantly by region
   - Case fatality ratios range from [A]% to [B]%
   - [Other significant findings]
"""