**Step-1: Dataset Collection**

Dataset link: https://www.kaggle.com/datasets/imdevskp/corona-virus-report

**Reason for selection:**

This dataset contains over 49,000 time-stamped records covering multiple countries, with confirmed, death, recovered, and active case counts plus geo-coordinates (lat/long).

These attributes make it suitable for a binary classification project that predicts whether a record represents an infected or non-infected case.


In [None]:
# Upload step for the Colab runtime. The loading cell further down opens
# 'Covid19.zip', so that is the archive expected to be uploaded here.
from google.colab import files
files.upload()

**Step-2: Data Understanding & EDA**

2.1- Unzip and Load the Dataset

In [None]:
import zipfile
import pandas as pd

# Unpack the uploaded archive into its own folder, then load the cleaned CSV.
archive_path = 'Covid19.zip'
extract_dir = 'Covid19_data'
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(extract_dir)

csv_path = 'Covid19_data/covid_19_clean_complete.csv'
df = pd.read_csv(csv_path)

# First look at the data: leading rows, dimensions, then dtypes / non-null counts.
display(df.head())
print("Shape:", df.shape)
df.info()

2.2- Check Missing Values and Duplicates

In [None]:
# Per-column null counts and the number of fully duplicated rows.
null_counts = df.isnull().sum()
duplicate_count = df.duplicated().sum()

print("Missing Values per Column:")
print(null_counts)
print("Number of Duplicate Rows:", duplicate_count)

2.3- Descriptive Statistics

In [None]:
# Summary tables: default describe() for the numeric columns, and a second
# pass restricted to object-dtype (text) columns.
numeric_summary = df.describe()
categorical_summary = df.describe(include=['object'])

print("Numeric Columns Statistics:")
display(numeric_summary)
print("Categorical Columns Statistics:")
display(categorical_summary)

2.4- Visualizations

In [None]:
# Plotting libraries used by every chart below; apply seaborn's "whitegrid"
# style once so all subsequent figures share it.
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid")

2.4.1 – Monthly Trend of COVID-19 Cases (Confirmed, Recovered, and Deaths)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Calendar order drives both the month-number -> abbreviation lookup and the
# row order of the aggregated table.
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_names = dict(enumerate(month_order, start=1))

df['Month'] = pd.to_datetime(df['Date']).dt.month
df['Month_Name'] = df['Month'].map(month_names)

# Sum every row that falls in each month; months with no rows come back as NaN
# after the reindex and therefore leave a gap in the line.
# NOTE(review): these are plain sums of all rows in the month. If the source
# columns are running totals rather than per-day counts, the sums overstate
# the monthly figures -- confirm against the dataset description.
metric_cols = ['Confirmed', 'Recovered', 'Deaths']
monthly_totals = df.groupby('Month_Name')[metric_cols].sum()
monthly_data = monthly_totals.reindex(month_order)

plt.figure(figsize=(12, 6))
sns.lineplot(data=monthly_data, linewidth=2.5, markers=True)
plt.title('Monthly Trend of COVID-19 Cases (Confirmed, Recovered, Deaths)')
plt.xlabel('Month')
plt.ylabel('Count')
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend(metric_cols)
plt.show()

2.4.2- Correlation Heatmap

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations, limited to the float64 / int64 columns.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
numeric_frame = df[numeric_cols]
corr = numeric_frame.corr()

heatmap_options = {
    'annot': True,       # print the coefficient inside each cell
    'cmap': 'coolwarm',
    'fmt': ".2f",
    'linewidths': 0.5,
}
plt.figure(figsize=(10, 8))
sns.heatmap(corr, **heatmap_options)
plt.title('Correlation Heatmap Between Numerical Features', fontsize=14, pad=15)
plt.show()

2.4.3 – Visualizing Relationship with Active Cases

Confirmed vs Active

In [None]:
# Scatter of active against confirmed counts, one colour per WHO region.
scatter_options = {
    'data': df,
    'x': 'Confirmed',
    'y': 'Active',
    'hue': 'WHO Region',
    'alpha': 0.6,
}

plt.figure(figsize=(6, 4))
sns.scatterplot(**scatter_options)
plt.title("Active vs Confirmed Cases by WHO Region")
plt.xlabel("Confirmed Cases")
plt.ylabel("Active Cases")
# Anchor the legend outside the axes so it does not cover the points.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.show()


Deaths vs Active

In [None]:
# Deaths against active cases with a fitted trend line per WHO region.
trend_options = {
    'data': df,
    'x': 'Deaths',
    'y': 'Active',
    'hue': 'WHO Region',
    'height': 4,
    'aspect': 1.3,
    'scatter_kws': {'alpha': 0.5},
}
sns.lmplot(**trend_options)

# lmplot draws onto its own figure; the pyplot calls below act on that
# figure's current axes.
plt.title("Active vs Deaths with Trend by Region")
plt.xlabel("Deaths")
plt.ylabel("Active Cases")
plt.show()


2.4.4- Geographical Spread (Confirmed Cases)

In [None]:
import plotly.express as px

# A country can appear on several rows for the same date (for example one row
# per sub-region). Taking max() straight over the raw rows would then return
# the largest single row rather than the national figure, so first add the
# rows up per (country, date) and only then take each country's peak.
# When a country has exactly one row per date the extra step changes nothing.
daily_country_totals = df.groupby(
    ['Country/Region', 'Date'], as_index=False
)['Confirmed'].sum()
df_country = daily_country_totals.groupby(
    'Country/Region', as_index=False
)['Confirmed'].max()

# World map shaded by each country's peak confirmed count; countries are
# matched to map regions by name.
fig = px.choropleth(
    df_country,
    locations="Country/Region",
    locationmode="country names",
    color="Confirmed",
    color_continuous_scale="Reds",
    title="Global Map of COVID-19 Confirmed Cases (by country)"
)
fig.show()


2.4.5 – Composite Pandemic Impact Score (CPIS)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Regional totals of the three raw count columns.
count_cols = ['Confirmed', 'Deaths', 'Recovered']
region_df = df.groupby('WHO Region')[count_cols].sum().reset_index()

# Raw components. The +1 in the denominators avoids dividing by zero for a
# region with no confirmed cases.
region_df['Severity'] = region_df['Deaths'] / (region_df['Confirmed'] + 1)
region_df['Recovery_Weakness'] = 1 - (region_df['Recovered'] / (region_df['Confirmed'] + 1))
region_df['Spread'] = region_df['Confirmed']

# Min-max scale each component; the 1e-9 keeps the division defined when all
# regions share the same value.
for component in ('Severity', 'Recovery_Weakness', 'Spread'):
    values = region_df[component]
    lowest, highest = values.min(), values.max()
    region_df[component + '_norm'] = (values - lowest) / (highest - lowest + 1e-9)

# Weighted blend of the scaled components: 0.4 severity, 0.3 recovery
# weakness, 0.3 spread.
region_df['CPIS'] = (
    region_df['Severity_norm'].mul(0.4)
    .add(region_df['Recovery_Weakness_norm'].mul(0.3))
    .add(region_df['Spread_norm'].mul(0.3))
)

# Highest score first so the bar chart reads top-down by impact.
region_df = region_df.sort_values('CPIS', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=region_df, x='CPIS', y='WHO Region', palette='magma')
plt.title('Composite Pandemic Impact Score (CPIS) by WHO Region', fontsize=14, pad=15)
plt.xlabel('CPIS (0–1)')
plt.ylabel('WHO Region')
plt.show()

**Step-3: Data Preparation**

3.1- Handle Missing Values

In [None]:
# Report null counts, replace every null in the frame with 0 (in place, for
# all columns regardless of dtype), then report again to confirm.
nulls_before_fill = df.isnull().sum()
print("Missing values per column:")
print(nulls_before_fill)

df.fillna(0, inplace=True)

nulls_after_fill = df.isnull().sum()
print("After filling missing values:")
print(nulls_after_fill)

3.2- Encode Target Variable

In [None]:
# Binary target derived from 'Active': exactly 0 -> 'Not Infected', any other
# value -> 'Infected'.
df['Infected'] = df['Active'].eq(0).map({True: 'Not Infected', False: 'Infected'})

# Assign the integer codes explicitly instead of via LabelEncoder.
# LabelEncoder numbers classes in sorted order, which would make
# 'Infected' -> 0 and 'Not Infected' -> 1. scikit-learn's precision / recall /
# F1 / ROC helpers treat label 1 as the positive class by default, so that
# ordering would score the 'Not Infected' class and would not match the
# ['Not Infected', 'Infected'] label order used when plotting confusion
# matrices. With the mapping below, 1 always means 'Infected'.
TARGET_CODES = {'Not Infected': 0, 'Infected': 1}
df['Infected_encoded'] = df['Infected'].map(TARGET_CODES).astype(int)
print(df[['Infected', 'Infected_encoded']].head())

3.3- Select Features

In [None]:
# Model inputs: three count columns plus the coordinates. The target is the
# integer-encoded infection label.
features = ['Confirmed', 'Deaths', 'Recovered', 'Lat', 'Long']
X, y = df[features], df['Infected_encoded']

print("Features shape:", X.shape)
print("Target shape:", y.shape)

3.4- Scale the Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column. Note that the scaler is fit on all rows
# of X, i.e. before any train/test separation has taken place.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print("Scaled features sample:\n", X_scaled[:5])

3.5- Split the Dataset

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *raw* features first and only then fit the scaler, using the
# training rows alone. Fitting the scaler on the full dataset before splitting
# lets test-set statistics (mean / variance) leak into training.
# stratify=y keeps the class proportions the same in both partitions, which
# matters because the classes are imbalanced (they are resampled later).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# `scaler` is rebound here to the train-only fit; the test rows are transformed
# with the training statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

3.6- Percentage of Missing Values Handled

In [None]:
# Missing values before filling
missing_before = df.isnull().sum()
total_rows = len(df)
missing_after = df.isnull().sum()

# Calculate percentage of processed missing values
processed_percentage = (missing_before - missing_after) / total_rows * 100
print("Percentage of missing values handled per column:")
print(processed_percentage)

**Step-4: Build the Model (Logistic Regression – Binary Classification)**

4.1- Import Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

4.2- Handle Imbalanced Data (SMOTE vs. Under-Sampling)

Option 1: SMOTE

In [None]:
# Resample the training split only with SMOTE (seeded for repeatability);
# the test split is left untouched.
smote = SMOTE(random_state=42)
oversampled = smote.fit_resample(X_train, y_train)
X_train_sm, y_train_sm = oversampled
print("SMOTE ->", X_train_sm.shape)

Option 2: Under-Sampling

In [None]:
# Resample the training split only with random under-sampling (seeded for
# repeatability); the test split is left untouched.
rus = RandomUnderSampler(random_state=42)
undersampled = rus.fit_resample(X_train, y_train)
X_train_under, y_train_under = undersampled
print("Under-sampling ->", X_train_under.shape)

4.2.1- Compare SMOTE vs Under-Sampling performance for Logistic Regression


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def evaluate_model(name, y_true, y_pred):
    """Print accuracy, precision, recall and F1 for one set of predictions.

    Args:
        name: Heading printed above the scores.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    # Compute every score before printing anything, so a failing metric does
    # not leave a half-printed report.
    scores = [
        ("Accuracy:  ", accuracy_score(y_true, y_pred)),
        ("Precision: ", precision_score(y_true, y_pred)),
        ("Recall:    ", recall_score(y_true, y_pred)),
        ("F1-score:  ", f1_score(y_true, y_pred)),
    ]
    print(f"\n{name}")
    for label, value in scores:
        print(f"{label}{value:.4f}")


# Same model configuration trained on each resampled training set; both are
# scored on the same, un-resampled test split.
log_sm = LogisticRegression(random_state=42).fit(X_train_sm, y_train_sm)
y_pred_sm = log_sm.predict(X_test)

log_under = LogisticRegression(random_state=42).fit(X_train_under, y_train_under)
y_pred_under = log_under.predict(X_test)

evaluate_model("Logistic Regression (SMOTE)", y_test, y_pred_sm)
evaluate_model("Logistic Regression (Under-Sampling)", y_test, y_pred_under)


4.3- Initialize and Train the Model

Option 1: Logistic Regression

In [None]:
# Build the final (under-sampled) training set used by both models below.
rus = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

# Use the resample produced right here. Previously this cell computed
# X_train_res / y_train_res and then ignored them in favour of variables left
# over from an earlier cell, which made it depend on that cell having been run.
X_train_final, y_train_final = X_train_res, y_train_res

In [None]:
# Logistic regression (seeded) trained on the final resampled training set.
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_final, y_train_final)

Option 2 (Recommended): Random Forest Classifier

In [None]:
# Random forest with 100 trees and balanced class weights (seeded), trained on
# the same final training set as the logistic regression.
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train_final, y_train_final)

4.4- **BONUS: Two-Stage Classification Pipeline**

In [None]:
# stage 1: fast / simple model
stage1_pred = logreg.predict(X_test)

# find samples that were misclassified by logistic regression
mis_idx = stage1_pred != y_test

# stage 2: re-predict only the hard samples using Random Forest
stage2_pred = rf.predict(X_test[mis_idx])

# combine
final_pred = stage1_pred.copy()
final_pred[mis_idx] = stage2_pred

# evaluate final predictions
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("Two-Stage Pipeline Results:")
print(f"Accuracy: {accuracy_score(y_test, final_pred):.4f}")
print(f"Precision: {precision_score(y_test, final_pred):.4f}")
print(f"Recall: {recall_score(y_test, final_pred):.4f}")
print(f"F1-score: {f1_score(y_test, final_pred):.4f}")


4.4.1- **Confusion Matrix – Two-Stage Pipeline**

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Take the code -> name pairing from the data instead of hard-coding a label
# order, so the tick labels always match how 'Infected_encoded' was actually
# produced. groupby sorts by code, giving names in ascending code order.
class_names = df.groupby('Infected_encoded')['Infected'].first()
class_codes = class_names.index.tolist()

# Passing labels= pins the row/column order to the same codes.
cm_bonus = confusion_matrix(y_test, final_pred, labels=class_codes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_bonus,
                              display_labels=class_names.tolist())
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix – Two-Stage Pipeline")
plt.show()


4.5- Make Predictions on Test Set

In [None]:
# Hard class predictions on the held-out test split, one array per model.
y_pred_logreg, y_pred_rf = (model.predict(X_test) for model in (logreg, rf))

4.6- Compare Predictions with Actual Values

In [None]:
import pandas as pd

# Side-by-side view of the first ten test labels against each model's output.
# The labels are taken by position and stripped of their index so they line up
# with the prediction arrays.
first_actual = y_test.iloc[:10].to_numpy()

comparison_logreg = pd.DataFrame(
    {'Actual': first_actual, 'Predicted_LogReg': y_pred_logreg[:10]}
)
comparison_rf = pd.DataFrame(
    {'Actual': first_actual, 'Predicted_RF': y_pred_rf[:10]}
)

print("Logistic Regression Predictions:\n", comparison_logreg)
print("Random Forest Predictions:\n", comparison_rf)


**Step-5: Model Evaluation (Classification)**

5.1- Import Evaluation Metrics

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
import matplotlib.pyplot as plt

5.2- Calculate and Compare Metrics (Logistic Regression vs Random Forest)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Apply the same four scorers, in the same order, to each model's predictions.
scorers = (accuracy_score, precision_score, recall_score, f1_score)
acc_log, prec_log, rec_log, f1_log = (score(y_test, y_pred_logreg) for score in scorers)
acc_rf, prec_rf, rec_rf, f1_rf = (score(y_test, y_pred_rf) for score in scorers)

# Fixed-width table: metric name, logistic-regression value, random-forest value.
table_rows = (
    ('Accuracy', acc_log, acc_rf),
    ('Precision', prec_log, prec_rf),
    ('Recall', rec_log, rec_rf),
    ('F1-score', f1_log, f1_rf),
)

print("=== Model Evaluation Metrics ===")
print(f"{'Metric':<12}{'Logistic Regression':<25}{'Random Forest'}")
print(f"{'-'*50}")
for metric_name, log_value, rf_value in table_rows:
    print(f"{metric_name:<12}{log_value:<25.4f}{rf_value:.4f}")


5.3- Confusion Matrix Visualization

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Take the code -> name pairing from the data instead of hard-coding a label
# order, so the tick labels always match how 'Infected_encoded' was actually
# produced. groupby sorts by code, giving names in ascending code order.
class_names = df.groupby('Infected_encoded')['Infected'].first()
class_codes = class_names.index.tolist()
tick_labels = class_names.tolist()

# Passing labels= pins the row/column order of both matrices to the same codes.
cm_log = confusion_matrix(y_test, y_pred_logreg, labels=class_codes)
cm_rf  = confusion_matrix(y_test, y_pred_rf, labels=class_codes)

# One panel per model, side by side.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

disp1 = ConfusionMatrixDisplay(confusion_matrix=cm_log,
                               display_labels=tick_labels)
disp1.plot(ax=axes[0], cmap=plt.cm.Oranges, colorbar=False)
axes[0].set_title("Logistic Regression Confusion Matrix")

disp2 = ConfusionMatrixDisplay(confusion_matrix=cm_rf,
                               display_labels=tick_labels)
disp2.plot(ax=axes[1], cmap=plt.cm.Blues, colorbar=False)
axes[1].set_title("Random Forest Confusion Matrix")

plt.tight_layout()
plt.show()


5.4- ROC Curve

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Score each test row with the probability of the second class column (index 1).
y_prob_log = logreg.predict_proba(X_test)[:, 1]
y_prob_rf  = rf.predict_proba(X_test)[:, 1]

# ROC points (thresholds discarded) and the area under each curve.
fpr_log, tpr_log, _ = roc_curve(y_test, y_prob_log)
auc_log = auc(fpr_log, tpr_log)

fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)
auc_rf = auc(fpr_rf, tpr_rf)

roc_series = (
    (fpr_log, tpr_log, f"Logistic Regression (AUC = {auc_log:.2f})"),
    (fpr_rf, tpr_rf, f"Random Forest (AUC = {auc_rf:.2f})"),
)

plt.figure(figsize=(8, 6))
for false_pos, true_pos, curve_label in roc_series:
    plt.plot(false_pos, true_pos, label=curve_label)

# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')

plt.title("ROC Curves for Different Models")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.grid(True, linestyle="--", alpha=0.5)
plt.show()
