## COSC 3337: Data Science I - Group Project
### Team: Naomi Ayub, Elyjaiah Durden, Nirmal John
### Task Category: Classification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from collections import defaultdict
from collections import Counter
from scipy.cluster.hierarchy import linkage, dendrogram

In [None]:
# -------------------------PREPROCESSING OF DATA-------------------------
# Purpose: load the raw COVID CSV, repair the cumulative counters, derive
# per-row rate features, save the result, and print a short sanity report.
RAW_DATA_PATH = 'owid_covid_data.csv'
PREPROCESSED_DATA_PATH = 'preprocessed_covid_data.csv'

CUMULATIVE_COLS = ['total_cases', 'total_deaths']
RATE_FEATURE_COLS = [
    'daily_case_change_rate',
    'daily_death_change_rate',
    'hospitalization_rate',
    'icu_rate',
    'case_fatality_rate',
]

# Load dataset
df = pd.read_csv(RAW_DATA_PATH)

# Basic cleaning
df['date'] = pd.to_datetime(df['date'])  # Ensure 'date' is datetime type

# Forward-fill total_cases and total_deaths for each location.
# The fill is computed on a date-ordered view so it is chronologically correct
# even if the file is not already sorted; assigning the result back aligns on
# the index, so the row order of `df` itself is left untouched.
df[CUMULATIVE_COLS] = (
    df.sort_values('date', kind='stable')
      .groupby('location')[CUMULATIVE_COLS]
      .ffill()
)

# Fill NA in "new_cases" and "new_deaths" with 0 where total_cases/total_deaths exist
has_total_cases = df['total_cases'].notna()
has_total_deaths = df['total_deaths'].notna()
df.loc[has_total_cases, 'new_cases'] = df.loc[has_total_cases, 'new_cases'].fillna(0)
df.loc[has_total_deaths, 'new_deaths'] = df.loc[has_total_deaths, 'new_deaths'].fillna(0)

# Drop rows where 'total_cases' or 'total_deaths' is still NaN (if any)
df = df.dropna(subset=CUMULATIVE_COLS)

# Create new features safely.
# Zero denominators are masked with np.nan (not pd.NA) so the results stay
# float64 and no to_numeric round-trip is needed afterwards.
nonzero_total_cases = df['total_cases'].replace(0, np.nan)
nonzero_total_deaths = df['total_deaths'].replace(0, np.nan)

df['daily_case_change_rate'] = df['new_cases'] / nonzero_total_cases
df['daily_death_change_rate'] = df['new_deaths'] / nonzero_total_deaths

df['hospitalization_rate'] = df['hosp_patients'] / df['population']
df['icu_rate'] = df['icu_patients'] / df['population']

# Same zero guard as the daily rates, instead of relying on the inf cleanup below.
df['case_fatality_rate'] = df['total_deaths'] / nonzero_total_cases

# Handle infinite values (if any due to division by zero).
# np.nan keeps numeric columns numeric; pd.NA here could turn a float column
# that contained inf into object dtype.
df = df.replace([np.inf, -np.inf], np.nan)

# Fill NaNs created in new features with 0.
# NOTE(review): for hospitalization_rate / icu_rate a 0 then also stands in for
# rows where hosp_patients / icu_patients were missing - confirm that is the
# intended meaning for the downstream models.
df[RATE_FEATURE_COLS] = df[RATE_FEATURE_COLS].fillna(0)

# Save preprocessed data
df.to_csv(PREPROCESSED_DATA_PATH, index=False)

# Final check
print("Shape of preprocessed data:", df.shape)
print("\nData Info:")
df.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print("\nStatistical Summary:")
print(df.describe())
print("\nMissing values per column:")
print(df.isna().sum())

Shape of preprocessed data: (412909, 72)

Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 412909 entries, 0 to 429434
Data columns (total 72 columns):
 #   Column                                      Non-Null Count   Dtype         
---  ------                                      --------------   -----         
 0   iso_code                                    412909 non-null  object        
 1   continent                                   392784 non-null  object        
 2   location                                    412909 non-null  object        
 3   date                                        412909 non-null  datetime64[ns]
 4   total_cases                                 412909 non-null  float64       
 5   new_cases                                   412909 non-null  float64       
 6   new_cases_smoothed                          408929 non-null  float64       
 7   total_deaths                                412909 non-null  float64       
 8   new_deaths                

In [None]:
# -------------------------CLASSIFICATION-------------------------
