<a href="https://colab.research.google.com/github/Benashael/COVID-19/blob/main/Preprocessing_COVID_19_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

# Load your dataset (assuming it's in a CSV file)
df = pd.read_csv('/content/drive/MyDrive/Provisional_COVID-19_Deaths_by_Sex_and_Age_20240702.csv')

# Filter the dataset where the 'Group' column is 'By Month'
filtered_df = df[df['Group'] == 'By Month']

filtered_df.shape

(123930, 16)

In [5]:
import numpy as np

# If it's not in datetime format, convert it
filtered_df['Start Date'] = pd.to_datetime(filtered_df['Start Date'])

# Sort the dataset by date to maintain the time series order
filtered_df = filtered_df.sort_values(by='Start Date')

# Perform stratified sampling to reduce the dataset to 20,000 rows
# Group by date to ensure we sample proportionally from each date
sampled_df = filtered_df.groupby(filtered_df['Start Date'].dt.date, group_keys=False).apply(lambda x: x.sample(frac=20000/len(df), random_state=1))

# Ensure the sample has exactly 20,000 rows
if len(sampled_df) > 20000:
    sampled_df = sampled_df.sample(n=20000, random_state=1)
elif len(sampled_df) < 20000:
    additional_rows = df.sample(n=20000 - len(sampled_df), random_state=1)
    sampled_df = pd.concat([sampled_df, additional_rows])

# Reset index for clean dataframe
sampled_df = sampled_df.reset_index(drop=True)

sampled_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Start Date'] = pd.to_datetime(filtered_df['Start Date'])


(20000, 16)

In [10]:
# Initial Feature Removal
sampled_df = sampled_df.drop(columns=['Footnote'])

In [11]:
sampled_df.isna().sum()

Data As Of                                     0
Start Date                                     0
End Date                                       0
Group                                          0
Year                                          46
Month                                        213
State                                          0
Sex                                            0
Age Group                                      0
COVID-19 Deaths                             5798
Total Deaths                                3083
Pneumonia Deaths                            6736
Pneumonia and COVID-19 Deaths               5377
Influenza Deaths                            3427
Pneumonia, Influenza, or COVID-19 Deaths    6679
dtype: int64

In [14]:
# Handle NaN values

# Forward fill or backward fill for 'Year' and 'Month'
sampled_df['Year'] = sampled_df['Year'].fillna(method='ffill').fillna(method='bfill')
sampled_df['Month'] = sampled_df['Month'].fillna(method='ffill').fillna(method='bfill')

# Interpolation for death-related columns
death_columns = [
    'COVID-19 Deaths',
    'Total Deaths',
    'Pneumonia Deaths',
    'Pneumonia and COVID-19 Deaths',
    'Influenza Deaths',
    'Pneumonia, Influenza, or COVID-19 Deaths'
]
sampled_df[death_columns] = sampled_df[death_columns].interpolate(method='linear')

# Ensure no remaining NaN values by applying forward fill and backward fill again
sampled_df[death_columns] = sampled_df[death_columns].fillna(method='ffill').fillna(method='bfill')

In [15]:
sampled_df.isna().sum()

Data As Of                                  0
Start Date                                  0
End Date                                    0
Group                                       0
Year                                        0
Month                                       0
State                                       0
Sex                                         0
Age Group                                   0
COVID-19 Deaths                             0
Total Deaths                                0
Pneumonia Deaths                            0
Pneumonia and COVID-19 Deaths               0
Influenza Deaths                            0
Pneumonia, Influenza, or COVID-19 Deaths    0
dtype: int64

In [17]:
# Final Feature Removal
sampled_df = sampled_df.drop(columns=['Data As Of', 'Start Date', 'Group', ])

In [19]:
# Save the reduced dataset to a new CSV file
sampled_df.to_csv('reduced_dataset.csv', index=False)