In [1]:
import pandas as pd
import warnings
# Silence every warning category for the rest of the session. This also hides
# pandas warnings (e.g. chained-assignment or deprecation notices).
warnings.filterwarnings('ignore')
# Let wide frames render up to 50 columns before pandas truncates the display.
pd.set_option('display.max_columns', 50)

In [2]:
def get_dataframe(name):
    '''Read ``Output/<name>.csv`` (relative to the working directory) into a DataFrame.

    Parameters
    ----------
    name : str
        File stem of the CSV inside the ``Output`` folder, without the extension.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, using the default ``pd.read_csv`` settings.
    '''
    return pd.read_csv(f'Output/{name}.csv')

# Load the three CSV inputs from Output/ in a fixed order: the filtered
# dataset, the individual-level records and the district-level records.
dataset, dataset_ind, dataset_dis = (
    get_dataframe(csv_name)
    for csv_name in ('dataset_filtered', 'individual record', 'district record')
)
# Show only the district-level rows whose status is 'Deceased'.
display(dataset_dis.loc[dataset_dis['Current Status'] == 'Deceased'])

Unnamed: 0,Serial Id,Day Id,Status Day Id,Date Announced,Age Bracket,Gender,Detected State,Current Status,Status Change Date,Num Cases
239,26144,57,,27/04/2020,,,Maharashtra,Deceased,,14.0
258,26171,57,,27/04/2020,,,Madhya Pradesh,Deceased,,3.0
259,26173,57,,27/04/2020,,,Madhya Pradesh,Deceased,,3.0
265,26183,57,,27/04/2020,,,Rajasthan,Deceased,,2.0
284,26208,57,,27/04/2020,,,Maharashtra,Deceased,,6.0
...,...,...,...,...,...,...,...,...,...,...
413970,646575,475,,19/06/2021,,,Punjab,Deceased,,3.0
413971,646577,475,,19/06/2021,,,Punjab,Deceased,,2.0
413972,646578,475,,19/06/2021,,,Punjab,Deceased,,2.0
413985,646597,475,,19/06/2021,,,Manipur,Deceased,,3.0


In [3]:
def get_template():
    '''Return an empty DataFrame whose columns define the per-day summary layout.

    Column order: the day id and overall/gender totals first, then one column
    per age bracket, then the same brackets for females, then for males.
    '''
    totals = [
        'Day Id',
        'Total Number of cases',
        'Total Number of fatalities',
        'Total Number of cases(Females)',
        'Total Number of cases(Males)',
        'Total Number of fatalities(Females)',
        'Total Number of fatalities(Males)',
    ]

    # Ten-year brackets 1-10, 11-20, ..., 71-80 followed by the open-ended one.
    age_labels = [f'Age ({low}-{low + 9})' for low in range(1, 81, 10)]
    age_labels.append('Age (81 and above)')

    female_labels = [f'{label} Female' for label in age_labels]
    male_labels = [f'{label} Male' for label in age_labels]

    return pd.DataFrame(columns=totals + age_labels + female_labels + male_labels)

In [7]:
def each_row(day_from_first, full_df=None, district_df=None, individual_df=None):
    '''Summarise a single day as a dict keyed by the summary column names.

    Parameters
    ----------
    day_from_first : int
        Value matched against the ``Day Id`` / ``Status Day Id`` columns.
    full_df, district_df, individual_df : pandas.DataFrame, optional
        Frames to read from. When omitted, the notebook-level ``dataset``,
        ``dataset_dis`` and ``dataset_ind`` are used, so ``each_row(day)``
        keeps working as before. None of the frames is modified.

    Returns
    -------
    dict
        ``Day Id``, total / per-gender case and fatality counts, and counts per
        age bracket (overall, ``Female`` and ``Male``).

    Notes
    -----
    * Case total = number of individual-level rows for the day plus the sum of
      ``Num Cases`` over the district-level rows for the day.
    * Gender and age counts are row counts taken from ``full_df`` only.
    * Ages are parsed with ``errors='coerce'``, so non-numeric values become NaN
      and are not counted. Ages outside 1-120, or falling between two integer
      brackets (e.g. 10.5), are not counted either.
    * NOTE(review): fatalities are counted from district-level rows matched on
      ``Status Day Id``. In the district rows displayed earlier in this notebook
      that column is empty and the displayed summary shows 0 fatalities on
      every row -- confirm which frame/column was intended.
    * NOTE(review): the district case sum is not filtered by ``Current Status``,
      so rows marked 'Deceased' contribute to the case total -- confirm intent.
    '''
    # Fall back to the notebook-level frames so existing callers are unaffected.
    if full_df is None:
        full_df = dataset
    if district_df is None:
        district_df = dataset_dis
    if individual_df is None:
        individual_df = dataset_ind

    # Explicit copy: the age conversion below must write to a private frame,
    # not to a filtered view of the caller's data.
    day_df = full_df[full_df['Day Id'] == day_from_first].copy()
    day_df['Age Bracket'] = pd.to_numeric(day_df['Age Bracket'], errors='coerce')

    day_district_df = district_df[district_df['Day Id'] == day_from_first]
    day_individual_df = individual_df[individual_df['Day Id'] == day_from_first]

    # District rows whose status changed on this day, restricted to deaths.
    status_df = district_df[district_df['Status Day Id'] == day_from_first]
    deceased_df = status_df[status_df['Current Status'] == 'Deceased']

    def count_gender(frame, gender):
        '''Number of rows in ``frame`` whose Gender equals ``gender``.'''
        return int((frame['Gender'] == gender).sum())

    def count_age(frame, lower, upper):
        '''Number of rows in ``frame`` with lower <= age <= upper (NaN never matches).'''
        return int(frame['Age Bracket'].between(lower, upper).sum())

    females_df = day_df[day_df['Gender'] == 'F']
    males_df = day_df[day_df['Gender'] == 'M']

    new_row = {
        'Day Id': day_from_first,
        'Total Number of cases': day_individual_df.shape[0] + day_district_df['Num Cases'].sum(),
        'Total Number of cases(Males)': males_df.shape[0],
        'Total Number of cases(Females)': females_df.shape[0],
        'Total Number of fatalities': deceased_df.shape[0],
        'Total Number of fatalities(Males)': count_gender(deceased_df, 'M'),
        'Total Number of fatalities(Females)': count_gender(deceased_df, 'F'),
    }

    # (label, lower, upper) for 1-10 ... 71-80, then the open-ended bracket
    # which is capped at 120.
    brackets = [(f'Age ({low}-{low + 9})', low, low + 9) for low in range(1, 81, 10)]
    brackets.append(('Age (81 and above)', 81, 120))

    for label, lower, upper in brackets:
        new_row[label] = count_age(day_df, lower, upper)
        new_row[f'{label} Female'] = count_age(females_df, lower, upper)
        new_row[f'{label} Male'] = count_age(males_df, lower, upper)

    return new_row

In [10]:
def append_row():
    '''Build, display and return the per-day summary table.

    Calls ``each_row`` once for every distinct ``Day Id`` in the notebook-level
    ``dataset`` (in order of first appearance) and lays the results out in the
    column order given by ``get_template``.

    Returns
    -------
    pandas.DataFrame
        One row per day, with a fresh 0..n-1 index.
    '''
    template = get_template()

    # Collect plain dicts and build the frame once. Growing a DataFrame row by
    # row is quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
    records = [each_row(day) for day in dataset['Day Id'].unique()]

    # The previously displayed output shows every column as floats; casting
    # keeps the displayed table and the saved CSV in that same shape.
    processed_df = pd.DataFrame(records, columns=template.columns).astype('float64')

    display(processed_df)
    return processed_df

# Build the per-day summary table (one row per distinct 'Day Id' in `dataset`).
processed_df = append_row()

Unnamed: 0,Day Id,Total Number of cases,Total Number of fatalities,Total Number of cases(Females),Total Number of cases(Males),Total Number of fatalities(Females),Total Number of fatalities(Males),Age (1-10),Age (11-20),Age (21-30),Age (31-40),Age (41-50),Age (51-60),Age (61-70),Age (71-80),Age (81 and above),Age (1-10) Female,Age (11-20) Female,Age (21-30) Female,Age (31-40) Female,Age (41-50) Female,Age (51-60) Female,Age (61-70) Female,Age (71-80) Female,Age (81 and above) Female,Age (1-10) Male,Age (11-20) Male,Age (21-30) Male,Age (31-40) Male,Age (41-50) Male,Age (51-60) Male,Age (61-70) Male,Age (71-80) Male,Age (81 and above) Male
0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3.0,13.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,472.0,164413.0,0.0,57.0,86.0,0.0,0.0,0.0,1.0,4.0,5.0,14.0,29.0,36.0,36.0,18.0,0.0,0.0,1.0,0.0,6.0,15.0,11.0,15.0,9.0,0.0,1.0,3.0,5.0,8.0,14.0,25.0,21.0,9.0
472,473.0,148371.0,0.0,39.0,47.0,0.0,0.0,0.0,0.0,0.0,7.0,8.0,14.0,23.0,24.0,10.0,0.0,0.0,0.0,4.0,2.0,9.0,8.0,13.0,3.0,0.0,0.0,0.0,3.0,6.0,5.0,15.0,11.0,7.0
473,474.0,159951.0,0.0,40.0,52.0,0.0,0.0,0.0,0.0,1.0,3.0,8.0,13.0,31.0,23.0,13.0,0.0,0.0,1.0,1.0,2.0,5.0,11.0,12.0,8.0,0.0,0.0,0.0,2.0,6.0,8.0,20.0,11.0,5.0
474,475.0,147740.0,0.0,37.0,80.0,0.0,0.0,0.0,0.0,0.0,7.0,17.0,17.0,30.0,25.0,21.0,0.0,0.0,0.0,2.0,5.0,3.0,13.0,10.0,4.0,0.0,0.0,0.0,5.0,12.0,14.0,17.0,15.0,17.0


In [11]:
def save_to_csv(df, name):
    '''Write ``df`` to ``Output/<name>.csv`` without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    name : str
        File stem of the CSV inside the ``Output`` folder, without the extension.
    '''
    df.to_csv(f'Output/{name}.csv', index=False)

# Persist the per-day summary table as Output/classified.csv.
save_to_csv(processed_df, 'classified')