## Covid Data Analysis


###  Importing the required dependencies

In [1]:
import os
import pandas as pd
import numpy as np
from datetime import date
import math

### About the dataset

* Date format: DD/MM/YYYY
* Starting day of report: 30 January 2020
* Last day of report: 20 June 2021

### Getting the list of raw datasets

In [2]:
def get_file():
    '''
        Returns the path of files in chronological order
    '''
    
    count = len(os.listdir('Raw_Datasets'))
    path = 'Raw_Datasets/raw_data'
    dataset = [f'{path}{i}.csv' for i in range(1, count+1)]
    return dataset

### Template for storing processed data

In [3]:
def get_empty_df():
    '''
        Returns empty dataframe with the required columns
    '''
    
    columns = ['Day Id', 'Status Day Id', 'Patient Number','Entry_ID', 'State Patient Number', 'Date Announced',
       'Estimated Onset Date', 'Age Bracket', 'Gender', 'Detected City',
       'Detected District', 'Detected State', 'State code', 'Current Status',
       'Notes', 'Contracted from which Patient (Suspected)', 'Nationality',
       'Type of transmission', 'Status Change Date', 'Source_1', 'Source_2',
       'Source_3', 'Backup Notes', 'Num Cases']
    
    return pd.DataFrame(columns=columns)

### Number of days from 1st March (Taking the date as the day when first case of covid was reported)

In [4]:
def get_date_interval(day):
    '''
        Returns number of days from 1 March
    '''
    if pd.isna(day):
        return 
    day = str(day)
    DD, MM, YYYY = (map(int, day.split('/'))) if '/' in day else (map(int, day.split('-'))) 
#     print(DD, MM, YYYY)
    inception = date(2020, 3, 1)  # 1st March 2020
    current = date(YYYY, MM, DD)
    delta = current - inception
    return delta.days if delta.days >= 0 else 0

### Adding a new column which stores the number of days from 1st March 2020

In [5]:
def process_df(file):
    df = pd.read_csv(file)
    day_id = list(map(get_date_interval, df['Date Announced']))  # Mapping Date Announce column to get the date interval
    df.insert(0, 'Day Id', day_id)
    status_day_id = list(map(get_date_interval, df['Status Change Date']))  # Mapping Status Change column to get the date interval
    df.insert(1, 'Status Day Id', status_day_id)
    return df

### Appending the dateframe to a single dataframe

In [12]:
def add_df():
    new_data = get_empty_df()
    files = get_file()
    for file in files:
        processed = process_df(file)  # Processing each file to add day_id column
        new_data = new_data.append(processed)  # Appending each processed dataframe to a single dataframe
    
    new_data.drop(new_data[new_data['Num Cases'] == 0].index, inplace = True)
    new_data.reset_index(inplace=True, drop=True)  # Resetting the index column
    new_data.index += 1
    new_data.insert(0, 'Serial Id', new_data.index)  # Adding a new serial id column which is unique and can be used to join different column
    return new_data

combined_df = add_df()

### Saving the dataframe to a csv file

In [13]:
def save_to_csv(df):
    folder_name = 'Output'
    df.reset_index(inplace = True, drop=True)
    if not os.access(folder_name, os.F_OK):
        os.mkdir(folder_name)
    out = f'{folder_name}/processed.csv'
    df.to_csv(out, index=False)

save_to_csv(combined_df)

### Dropping the unnecessary columns

In [14]:
def drop_col(df):
    col_to_drop = [
        'Patient Number', 'Entry_ID', 'State Patient Number', 
        'Estimated Onset Date', 'Detected City', 
        'State code', 'Notes', 'Contracted from which Patient (Suspected)',
        'Nationality', 'Type of transmission', 'Source_1', 'Source_2',
        'Source_3', 'Backup Notes'
    ]
    
    df.drop(col_to_drop, axis=1, inplace=True)
    return df

filtered_df = drop_col(combined_df)

### Saving the new file

In [15]:
def save_to_csv_new(df, name):
    path = f'Output/{name}.csv'
    df.to_csv(path, index=False)
    
save_to_csv_new(filtered_df, 'dataset_filtered')

### Splitting the dataset into district level record and individual record


In [16]:
def split_dataset(df):
    individual_record = df[(df['Num Cases'] == 1) | (df['Num Cases'].isna())]
    district_record = df[(df['Num Cases'] != 1) & (df['Num Cases'].notnull())]# != np.nan)]
    display(individual_record)
    display(district_record)
    save_to_csv_new(individual_record, 'individual record')
    save_to_csv_new(district_record, 'district record')

split_dataset(filtered_df)

Unnamed: 0,Serial Id,Day Id,Status Day Id,Date Announced,Age Bracket,Gender,Detected District,Detected State,Current Status,Status Change Date,Num Cases
0,1,0,0.0,02/02/2020,,,Alappuzha,Kerala,Recovered,14/02/2020,1
1,2,0,0.0,03/02/2020,,,Kasaragod,Kerala,Recovered,14/02/2020,1
2,3,1,14.0,02/03/2020,45,M,East Delhi,Delhi,Recovered,15/03/2020,1
3,4,1,1.0,02/03/2020,24,M,Hyderabad,Telangana,Recovered,02/03/2020,1
4,5,3,28.0,04/03/2020,55,,Italians,Haryana,Recovered,29/03/2020,1
...,...,...,...,...,...,...,...,...,...,...,...
647224,647225,476,,20/06/2021,,,,,,,
647225,647226,476,,20/06/2021,,,,,,,
647226,647227,476,,20/06/2021,,,,,,,
647227,647228,476,,20/06/2021,,,,,,,


Unnamed: 0,Serial Id,Day Id,Status Day Id,Date Announced,Age Bracket,Gender,Detected District,Detected State,Current Status,Status Change Date,Num Cases
15830,15831,44,,14/04/2020,,,S.P.S. Nellore,Andhra Pradesh,Hospitalized,,-1
15833,15834,48,,18/04/2020,,,,Madhya Pradesh,Hospitalized,,-1
15834,15835,48,,18/04/2020,,,Alirajpur,Madhya Pradesh,Hospitalized,,2
15835,15836,48,,18/04/2020,,,,Madhya Pradesh,Hospitalized,,-2
15836,15837,49,,19/04/2020,,,Gurdaspur,Punjab,Hospitalized,,-1
...,...,...,...,...,...,...,...,...,...,...,...
646787,646788,475,,19/06/2021,,,Kasaragod,Kerala,Recovered,,533.0
646904,646905,475,,19/06/2021,,,Leh,Ladakh,Hospitalized,,51.0
646905,646906,475,,19/06/2021,,,Kargil,Ladakh,Hospitalized,,2.0
646906,646907,475,,19/06/2021,,,Leh,Ladakh,Recovered,,98.0
