In [None]:
#import relevant libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns

# Load the raw country-level dataset into `df`, the frame every later cell uses.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where the CSV exists at exactly this location.
df = pd.read_csv('C:/Users/AMARA/OneDrive/Documents/3MTT_CAPSTONE_PROJECT_DATA_SCIENCE.csv')
df.head()

# Create an interactive table.
# qgrid is imported here because it is not part of the notebook's import
# cell; without the import this cell fails with NameError. It is an optional
# widget, so fall back to the plain DataFrame when it is not installed.
try:
    import qgrid
except ImportError:
    qgrid_widget = df
else:
    qgrid_widget = qgrid.show_grid(df, show_toolbar=True)
qgrid_widget

# Derived per-row percentages used throughout the rest of the notebook.
# Rows with zero confirmed cases have no meaningful ratio: mask the zero
# denominator to NaN so the division yields NaN instead of inf.
confirmed_nonzero = df['Confirmed'].where(df['Confirmed'] != 0)

# Deaths as a percentage of confirmed cases.
df['Mortality Ratio'] = (df['Deaths'] / confirmed_nonzero) * 100

# Binary classification target: 1 when the row's mortality ratio is above the
# median ratio, else 0. A NaN ratio compares as False and is therefore labelled 0.
df['High Mortality'] = (df['Mortality Ratio'] > df['Mortality Ratio'].median()).astype(int)

# Recoveries as a percentage of confirmed cases.
df['Recovery Rate'] = (df['Recovered'] / confirmed_nonzero) * 100

# Check the first few rows to ensure the columns were added correctly.
print(df.head())
df

# Standardize the 'Country/Region' column: trim the ends and collapse runs of
# internal whitespace. Casing is left as supplied because str.title() rewrites
# names such as "US" to "Us" and "Cote d'Ivoire" to "Cote D'Ivoire".
df['Country/Region'] = (
    df['Country/Region']
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)

# Check for missing values
print(df.isnull().sum())
# Check for duplicates. This runs after the cleanup above so that rows which
# differ only by stray whitespace in the country name are counted as duplicates.
print(f"Number of duplicate rows: {df.duplicated().sum()}")

# Select numeric columns
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Daily growth rate = (new cases / active cases) * 100.
# The small epsilon keeps rows with zero active cases from dividing by zero.
df['Daily Growth Rate'] = (df['New cases'] / (df['Active'] + 1e-5)) * 100

# Weekly growth rate = (1 week change / confirmed last week) * 100.
# A zero baseline has no defined growth rate: mask it to NaN so the result is
# NaN rather than inf, which would otherwise distort describe() below.
confirmed_last_week = df['Confirmed last week'].where(df['Confirmed last week'] != 0)
df['1 Week Growth Rate'] = (df['1 week change'] / confirmed_last_week) * 100

# Verify new columns
print(df[['Daily Growth Rate', '1 Week Growth Rate']].describe())

# Check the first few rows
df.head()

# Correlation heatmap: keep only the numeric columns of df, compute their
# pairwise correlations and draw them with each cell annotated to 2 decimals.
numeric_df = df.select_dtypes(include=['number'])
correlation_matrix = numeric_df.corr()
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
)
plt.show()

# Average the two rate columns per WHO Region.
# The rates are per-country percentages, so they are averaged rather than
# summed (a sum of percentages is not a rate), and only the two plotted
# columns are selected so non-numeric columns never enter the aggregation.
region_df = df.groupby('WHO Region')[['Mortality Ratio', 'Recovery Rate']].mean()

# Bar chart for mortality and recovery rates
region_df.plot(kind='bar')
plt.title('Mortality and Recovery Rates by WHO Region')
plt.ylabel('Rate (%)')
plt.xticks(rotation=45)
plt.show()

# Mean mortality ratio and recovery rate for each WHO Region.
region_df = df.groupby('WHO Region')[['Mortality Ratio', 'Recovery Rate']].mean()

# One line per metric, with the regions along the x-axis.
plt.figure(figsize=(12, 6))
regions = region_df.index
for metric in ('Mortality Ratio', 'Recovery Rate'):
    plt.plot(regions, region_df[metric], marker='o', label=metric)

# Titles, axis labels, legend and a light horizontal grid.
plt.title('Mortality Ratio and Recovery Rate by WHO Region', fontsize=14)
plt.xlabel('WHO Region', fontsize=12)
plt.ylabel('Rate (%)', fontsize=12)
plt.legend(title="Metrics", fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.tight_layout()

# Render the figure.
plt.show()

# Aggregate the data by WHO Region (mean of the per-country rates).
region_df = df.groupby('WHO Region')[['Mortality Ratio', 'Recovery Rate']].mean()

# Create a single line plot: one line per metric across the regions.
# The figure is built exactly once so that plt.show() renders one chart
# rather than several identical ones.
plt.figure(figsize=(12, 6))
for column in ['Mortality Ratio', 'Recovery Rate']:
    plt.plot(region_df.index, region_df[column], marker='o', label=column)

# Customize the plot
plt.title('Mortality Ratio and Recovery Rate by WHO Region', fontsize=14)
plt.xlabel('WHO Region', fontsize=12)
plt.ylabel('Rate (%)', fontsize=12)
plt.legend(title="Metrics", fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.tight_layout()
# Show the plot
plt.show()

# Mean deaths and recoveries per 100 cases for each WHO Region.
# These columns are aggregated directly from df: the region_df built by the
# preceding cells only holds 'Mortality Ratio' and 'Recovery Rate', so
# selecting the per-100-cases columns from it raises KeyError.
region_rates = df.groupby('WHO Region')[['Deaths / 100 Cases', 'Recovered / 100 Cases']].mean()

region_rates.plot(kind='bar')
plt.title('Recovery and Death Rates by WHO Region')
plt.xlabel('WHO Region')
plt.ylabel('Rate')
plt.show()

# Coerce the count columns to numeric; values that cannot be parsed become NaN.
# 'Mortality Ratio' is not recomputed in this cell, so the chart below uses
# the ratio values already stored in df.
for count_column in ('Deaths', 'Confirmed'):
    df[count_column] = pd.to_numeric(df[count_column], errors='coerce')

# Bar chart of the mean mortality ratio per WHO Region.
mortality_by_region = df.groupby('WHO Region')['Mortality Ratio'].mean()
mortality_by_region.plot(color='coral', kind='bar')
plt.title('Average Mortality Ratio by WHO Region')
plt.xlabel('WHO Region')
plt.ylabel('Mortality Ratio (%)')
plt.show()

# Rows whose death count lies strictly above the 95th percentile of 'Deaths'.
death_cutoff = df['Deaths'].quantile(0.95)
outliers = df[df['Deaths'] > death_cutoff]
print("Outlier countries with highest deaths:")
print(outliers[['Country/Region', 'Deaths', 'Confirmed']])

# Scatter of recovered vs. confirmed counts with a fitted regression line per
# WHO Region (one hue per region).
sns.lmplot(
    data=df,
    x='Confirmed',
    y='Recovered',
    hue='WHO Region',
    scatter_kws={'alpha':0.7},
)
plt.title('Recovery Rates vs. Confirmed Cases')
plt.show()

# Total number of active cases in each WHO Region, drawn as a bar chart.
active_by_region_groups = df.groupby('WHO Region')['Active']
active_cases_by_region = active_by_region_groups.sum()

active_cases_by_region.plot(color='skyblue', kind='bar')
plt.title('Total Active Cases by WHO Region')
plt.ylabel('Number of Active Cases')
plt.xticks(rotation=45)
plt.show()

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

# Feature matrix (raw case counts) and binary target.
# NOTE(review): 'High Mortality' is derived earlier in this notebook from
# Deaths / Confirmed, and both of those columns are used as features here, so
# the target is a direct function of the inputs -- treat the scores below
# with caution.
feature_columns = ['Confirmed', 'Deaths', 'Recovered', 'Active']
X = df[feature_columns]
y = df['High Mortality']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest with default hyper-parameters and predict the test rows.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Headline metrics on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    print(f"{metric_name}: {metric_value:.2f}")

# Per-class precision / recall / F1 breakdown.
print(classification_report(y_test, y_pred))

# Totals of confirmed, recovered and death counts for rows sharing the same
# '1 week change' value. groupby sorts the keys, so the lines run left to
# right by increasing weekly change.
# The x-axis is a weekly change magnitude, not a date, so the title describes
# the relationship plotted rather than a trend over time.
global_df = df.groupby('1 week change')[['Confirmed', 'Deaths', 'Recovered']].sum()

plt.plot(global_df.index, global_df['Confirmed'], label='Confirmed Cases', color='blue')
plt.plot(global_df.index, global_df['Recovered'], label='Recovered Cases', color='green')
plt.plot(global_df.index, global_df['Deaths'], label='Deaths', color='red')
plt.title('Confirmed, Recovered and Deaths by 1 Week Change')
plt.xlabel('1 week change')
plt.ylabel('Counts')
plt.legend()
plt.grid()
plt.show()

# Totals of confirmed, recovered and death counts for rows sharing the same
# '1 week % increase' value. groupby sorts the keys, so the lines run left to
# right by increasing weekly percentage increase.
# The x-axis is a percentage increase, not a date, so the title describes the
# relationship plotted rather than a trend over time.
global_df = df.groupby('1 week % increase')[['Confirmed', 'Deaths', 'Recovered']].sum()

plt.plot(global_df.index, global_df['Confirmed'], label='Confirmed Cases', color='blue')
plt.plot(global_df.index, global_df['Recovered'], label='Recovered Cases', color='green')
plt.plot(global_df.index, global_df['Deaths'], label='Deaths', color='red')
plt.title('Confirmed, Recovered and Deaths by 1 Week % Increase')
plt.xlabel('1 week % increase')
plt.ylabel('Counts')
plt.legend()
plt.grid()
plt.show()

# Per-region totals of the three count columns, drawn as one stacked bar per
# WHO Region.
# NOTE(review): if 'Confirmed' already includes deaths and recoveries, the
# stacked bar height double-counts them -- confirm against the dataset's
# column definitions.
count_columns = ['Confirmed', 'Deaths', 'Recovered']
region_df = df.groupby('WHO Region')[count_columns].sum()

region_df.plot(
    kind='bar',
    stacked=True,
    figsize=(12, 6),
    colormap='tab10',
)
plt.title('Confirmed, Recovered, and Deaths by WHO Region')
plt.xlabel('WHO Region')
plt.ylabel('Counts')
plt.xticks(rotation=45)
plt.show()

# Mean deaths and recoveries per 100 cases in each WHO Region, shown as
# side-by-side bars (salmon for deaths, light green for recoveries).
per_100_columns = ['Deaths / 100 Cases', 'Recovered / 100 Cases']
rate_df = df.groupby('WHO Region')[per_100_columns].mean()

rate_df.plot(
    kind='bar',
    figsize=(12, 6),
    color=['salmon', 'lightgreen'],
)
plt.title('Mortality and Recovery Rates by WHO Region')
plt.ylabel('Rate (%)')
plt.xticks(rotation=45)
plt.show()

# Keep the rows whose mortality ratio is strictly above the 95th percentile.
mortality_cutoff = df['Mortality Ratio'].quantile(0.95)
high_mortality_countries = df[df['Mortality Ratio'] > mortality_cutoff]

# Horizontal bars: one country per row, bar length = mortality ratio.
sns.barplot(
    data=high_mortality_countries,
    x='Mortality Ratio',
    y='Country/Region',
    palette='Reds_r',
)

# Title, axis labels and layout.
plt.title('Countries with High Mortality Ratios')
plt.xlabel('Mortality Ratio (%)')
plt.ylabel('Country/Region')
plt.tight_layout()
plt.show()

from sklearn.metrics import confusion_matrix
import numpy as np

# Confusion matrix for the classification results.
# The class order is pinned with labels=[0, 1] so the matrix is always 2x2 and
# its rows/columns line up with the tick labels below, even when one class is
# absent from the test split or the predictions.
class_labels = [0, 1]
class_names = ['Low Mortality', 'High Mortality']
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# Heatmap for confusion matrix: rows are actual classes, columns are predicted.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()