## Covid Data Analysis


###  Importing the required dependencies

In [1]:
import os
import pandas as pd
from datetime import date
import math

### About the dataset

* Date format: DD/MM/YYYY

### Getting the list of raw datasets

In [13]:
def get_file():
    '''
        Returns the path of files in chronological order

        Only entries of 'Raw_Datasets' named 'raw_data<N>.csv' are returned,
        sorted by the number N. Any other entry in the folder (hidden files,
        notebook checkpoints, ...) is ignored, so it can no longer inflate
        the count and produce a path to a file that does not exist.
    '''

    folder = 'Raw_Datasets'
    prefix, suffix = 'raw_data', '.csv'

    numbered = []
    for name in os.listdir(folder):
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        number = name[len(prefix):-len(suffix)]
        if number.isdecimal():
            # Sort on the integer value so raw_data10 comes after raw_data2
            numbered.append((int(number), name))

    return [f'{folder}/{name}' for _, name in sorted(numbered)]

### Template for storing processed data

In [41]:
def get_empty_df():
    '''
        Builds the row-less template dataframe whose columns define the
        layout of the combined dataset
    '''

    day_columns = ['Day Id', 'Status Day Id']
    patient_columns = [
        'Patient Number', 'Entry_ID', 'State Patient Number',
        'Date Announced', 'Estimated Onset Date', 'Age Bracket', 'Gender',
    ]
    location_columns = [
        'Detected City', 'Detected District', 'Detected State', 'State code',
    ]
    status_columns = [
        'Current Status', 'Notes',
        'Contracted from which Patient (Suspected)', 'Nationality',
        'Type of transmission', 'Status Change Date',
    ]
    source_columns = [
        'Source_1', 'Source_2', 'Source_3', 'Backup Notes', 'Num Cases',
    ]

    layout = (day_columns + patient_columns + location_columns
              + status_columns + source_columns)
    return pd.DataFrame(columns=layout)

### Number of days since 1st March 2020 (taken as the day the first case of covid was reported)

In [42]:
def get_date_interval(day):
    '''
        Converts a DD/MM/YYYY (or DD-MM-YYYY) date into the number of days
        elapsed since 1 March 2020

        Missing values give None; dates before 1 March 2020 give 0.
    '''
    if pd.isna(day):
        return None

    text = str(day)
    separator = '/' if '/' in text else '-'
    DD, MM, YYYY = (int(part) for part in text.split(separator))

    elapsed = (date(YYYY, MM, DD) - date(2020, 3, 1)).days  # offset from 1st March 2020
    return max(elapsed, 0)

### Adding a new column which stores the number of days from 1st March 2020

In [43]:
def process_df(file):
    '''
        Reads one raw csv file and puts two day-offset columns in front:
        'Day Id' (from 'Date Announced') and 'Status Day Id'
        (from 'Status Change Date'), both computed by get_date_interval
    '''
    frame = pd.read_csv(file)

    # Offset of the announcement date goes in the first column
    announced_offsets = [get_date_interval(value) for value in frame['Date Announced']]
    frame.insert(0, 'Day Id', announced_offsets)

    # Offset of the status change date goes right after it
    status_offsets = [get_date_interval(value) for value in frame['Status Change Date']]
    frame.insert(1, 'Status Day Id', status_offsets)

    return frame

### Appending the dataframes to a single dataframe

In [48]:
def add_df():
    '''
        Processes every raw file and stacks the results into one dataframe

        Returns a dataframe with a 1-based index and a leading 'Serial Id'
        column holding the same numbers, followed by the template columns
        from get_empty_df (in template order) and then any extra columns
        found in the raw files.
    '''
    template = get_empty_df()
    frames = [process_df(file) for file in get_file()]  # Adds the day id columns to each file

    if frames:
        # DataFrame.append was removed in pandas 2.0; a single concat also
        # avoids copying the accumulated frame once per file.
        new_data = pd.concat(frames, ignore_index=True, sort=False)
        # Keep the template's column order, with unknown columns at the end
        extra = [col for col in new_data.columns if col not in template.columns]
        new_data = new_data.reindex(columns=[*template.columns, *extra])
    else:
        new_data = template

    new_data = new_data.reset_index(drop=True)  # Resetting the index column
    new_data.index += 1
    # Unique serial id column which can be used to join different tables
    new_data.insert(0, 'Serial Id', new_data.index)
    return new_data

# combined_df = add_df()

### Saving the dataframe to a csv file

In [49]:
def save_to_csv(df):
    '''
        Writes the dataframe to 'Output/processed.csv' without the index

        The 'Output' folder is created when it does not exist yet.
        Note: the index of the dataframe passed in is reset in place.
    '''
    folder_name = 'Output'
    df.reset_index(inplace=True, drop=True)
    # exist_ok=True replaces the separate "exists?" check, which could race
    # with another process creating the folder in between
    os.makedirs(folder_name, exist_ok=True)
    out = f'{folder_name}/processed.csv'
    df.to_csv(out, index=False)

# save_to_csv(combined_df)

### Dropping the unnecessary columns

In [46]:
def drop_col(df):
    '''
        Strips the columns that are not needed for the analysis

        The dataframe is modified in place and the same object is returned.
    '''
    identifier_cols = ['Patient Number', 'Entry_ID', 'State Patient Number']
    detail_cols = [
        'Estimated Onset Date', 'Detected City', 'Detected District',
        'State code', 'Notes', 'Contracted from which Patient (Suspected)',
        'Nationality', 'Type of transmission',
    ]
    reference_cols = ['Source_1', 'Source_2', 'Source_3', 'Backup Notes', 'Num Cases']

    unwanted = identifier_cols + detail_cols + reference_cols
    df.drop(columns=unwanted, inplace=True)
    return df

# filtered_df = drop_col(combined_df)

### Saving the new file

In [47]:
def save_to_csv_new(df):
    '''
        Writes the filtered dataframe to 'Output/dataset_filtered.csv'
        without the index

        The 'Output' folder is created when missing, so this function no
        longer depends on save_to_csv having been run first.
    '''
    path = 'Output/dataset_filtered.csv'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path, index=False)
    
# save_to_csv_new(filtered_df)