#### Importing important data
In this code I am trying to import the data as efficiently as possible.
The data I am particularly interested in can be found in the Google Sheets document "interesting data".
The categories that are defined so far are:
###### Demographics
1. Literacy
2. population density
3. slum population
4. Urban population share
###### Deaths
5. covid infections
6. covid deaths
7. dates
8. district id
###### Vaccination
9. site vaccination progress
10. first dose vaccination
11. dates
12. district id
###### Age
13. age per 5 year categories to make a map of where older people live

In [169]:
import pandas as pd
import numpy as np

In [170]:
# Tell Jupyter to display the value of every top-level expression in a cell,
# not only the last one. Several cells below rely on this to show intermediate results.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Truncate long DataFrame/Series displays once they exceed 20 rows.
pd.options.display.max_rows = 20


In [171]:
# Location of the data on disk. Every path is built from the base directory,
# so only base_dir has to change when the data lives somewhere else.
# All paths below use forward slashes; switch to a raw string (r"...") if you
# substitute a Windows path that contains backslashes.

# Base directory
base_dir = "./covid_data"

# Sub-folders inside the base directory
covid_folder = "/covid/csv"
demog_folder = "/demography/csv"
hosp_folder = "/hospitals/csv"

# File names of the individual tables
deaths_file = "/covid_infected_deaths_pc11.csv"
vacc_file = "/covid_vaccination.csv"  # not usable for now: it uses the new district numbering
demog_file = "/pc11_demographics_district.csv"
age_file = "/age_bins_district_t_pc11.csv"
hosp_file = "/dlhs4_hospitals_dist_pc11.csv"
names_file = "/shrug_pc11_district_key.csv"

# Full paths: base directory + sub-folder + file name
deaths_dir = f"{base_dir}{covid_folder}{deaths_file}"
vacc_dir = f"{base_dir}{covid_folder}{vacc_file}"  # NA
demog_dir = f"{base_dir}{demog_folder}{demog_file}"
age_dir = f"{base_dir}{demog_folder}{age_file}"
hosp_dir = f"{base_dir}{hosp_folder}{hosp_file}"
names_dir = f"{base_dir}{names_file}"  # this file sits directly in the base directory

dirlist = [deaths_dir, vacc_dir, demog_dir, age_dir, hosp_dir, names_dir]


In [172]:
# Overview of the variables that we want to use for our first analysis.
# interesting_dir is the path of the spreadsheet (a file, despite the name);
# its 'code' column lists the column name of each variable of interest.
interesting_dir = base_dir + r"/variablecodes.xlsx"
interesting_frame = pd.read_excel(interesting_dir)
interesting_frame['code']

0             pc11_pca_p_lit
1              pc11_pop_dens
2              pc11_slum_pop
3             pc11_urb_share
4                total_cases
5               total_deaths
6                       date
7            lgd_district_id
8                total_sites
9           first_dose_admin
10                      date
11           lgd_district_id
12                       NaN
13          pc11_district_id
14         dlhs4_dh_icu_beds
15          dlhs4_total_beds
16    dlhs4_total_facilities
Name: code, dtype: object

In [173]:
# Read every CSV file into its own DataFrame (same order as the paths were defined).
deaths_frame, vacc_frame, demog_frame, age_frame, hosp_frame, names_frame = (
    pd.read_csv(path)
    for path in (deaths_dir, vacc_dir, demog_dir, age_dir, hosp_dir, names_dir)
)

# Lookup of the tables by a readable name.
framedict = {
    "Deaths": deaths_frame,
    "Vaccination": vacc_frame,
    "Demographics": demog_frame,
    "Age": age_frame,
    "Hospitals": hosp_frame,
}

for label, frame in framedict.items():
    print(label, " has ", len(frame), " entries")

# The large tables hold daily entries; the tables with roughly 640 rows hold
# one entry per district.

Deaths  has  263862  entries
Vaccination  has  210103  entries
Demographics  has  643  entries
Age  has  617  entries
Hospitals  has  552  entries


In [174]:
# List the size and the column names of every loaded table.
print("The keys of the dataframes are: \n")
for label, frame in framedict.items():
    print(label, ":")
    print('length: ', len(frame))
    print(frame.keys())

The keys of the dataframes are: 

Deaths :
length:  263862
Index(['pc11_state_id', 'pc11_district_id', 'date', 'total_cases',
       'total_deaths'],
      dtype='object')
Vaccination :
length:  210103
Index(['lgd_state_id', 'lgd_state_name', 'lgd_district_id',
       'lgd_district_name', 'date', 'total_individuals_registered',
       'total_sessions_conducted', 'total_sites', 'total_covaxin',
       'total_covishield', 'first_dose_admin', 'second_dose_admin', 'male_vac',
       'female_vac', 'trans_vac', 'state', 'district', 'bad_flg_covishield',
       'bad_flg_covaxin'],
      dtype='object')
Demographics :
length:  643
Index(['pc11_state_id', 'pc11_district_id', 'pc11_urb_share', 'pc11_slum_pop',
       'pc11_vd_area', 'pc11_td_area', 'pc11_tot_area', 'pc11_pop_dens',
       'pc11r_pca_tot_p', 'pc11u_pca_tot_p', 'pc11_pca_tot_p',
       'pc11r_pca_tot_m', 'pc11u_pca_tot_m', 'pc11_pca_tot_m',
       'pc11r_pca_tot_f', 'pc11u_pca_tot_f', 'pc11_pca_tot_f',
       'pc11r_pca_p_lit', 'p

In [175]:
## Cleaning Hospital data

# The hospital table does not contain every district. A right merge against the
# district ids of the demographic table adds the missing districts as rows of NaN.
distr_col = demog_frame['pc11_district_id']
hosp_frame = hosp_frame.merge(distr_col, how='right', on='pc11_district_id')

# Use 'censuscode' as the name of the district id column.
df = hosp_frame.rename(columns={"pc11_district_id": "censuscode"})

# Drop the codes 999 and 0: they stand for unattributed and unclear cases,
# not for real districts.
is_real_district = (df['censuscode'] != 999) & (df['censuscode'] != 0)
df = df[is_real_district]


In [176]:
# This is what the new frame looks like.
# df is the cleaned hospital table left behind by the previous cell; store it
# back under its own name, preview it and report its number of rows.
hosp_frame = df
hosp_frame.head()
print("The size of the hosp_frame is ",len(hosp_frame))

Unnamed: 0,pc11_state_id,censuscode,dlhs4_dh_beds,dlhs4_dh_count,dlhs4_dh_staff,dlhs4_dh_icu_beds,dlhs4_chc_beds,dlhs4_chc_count,dlhs4_chc_staff,dlhs4_chc_beds_ven,dlhs4_phc_beds,dlhs4_phc_count,dlhs4_phc_staff,dlhs4_phc_pop,dlhs4_phc_beds_oxy,pc11_pca_tot_p,dlhs4_phc_mult,dlhs4_total_beds,dlhs4_total_staff,dlhs4_total_facilities
1,,1,,,,,,,,,,,,,,,,,,
2,,2,,,,,,,,,,,,,,,,,,
3,,3,,,,,,,,,,,,,,,,,,
4,,4,,,,,,,,,,,,,,,,,,
5,,5,,,,,,,,,,,,,,,,,,


The size of the hosp_frame is  640


In [177]:
## Cleaning up Demographic data

# First, standardise the district id column to the name "censuscode".
df = demog_frame.rename(columns={'pc11_district_id': 'censuscode'})

# Second, check the number of rows: it should add up to 640 districts.
# A larger number implies odd census codes or duplicates.
print("The length of the dataframe is", len(df))

is_repeat = df['censuscode'].duplicated()
print("There are", is_repeat.sum(), "district duplicates")

# Show the rows whose census code already occurred earlier in the table.
df[is_repeat]

# These rows have censuscode 0, which is not part of our district list,
# so every row with that code is removed.
print("Lets yeet these doubles out")
df = df.loc[df['censuscode'] != 0]

# Verify that the clean-up succeeded.
print("The length of the dataframe is", len(df))
print("There are now", df['censuscode'].duplicated().sum(), "district duplicates")
demog_frame = df




The length of the dataframe is 643
There are 2 district duplicates


Unnamed: 0,pc11_state_id,censuscode,pc11_urb_share,pc11_slum_pop,pc11_vd_area,pc11_td_area,pc11_tot_area,pc11_pop_dens,pc11r_pca_tot_p,pc11u_pca_tot_p,...,pc11_pca_f_sc,pc11r_pca_p_st,pc11u_pca_p_st,pc11_pca_p_st,pc11r_pca_m_st,pc11u_pca_m_st,pc11_pca_m_st,pc11r_pca_f_st,pc11u_pca_f_st,pc11_pca_f_st
90,7,0,,0.0,0.0,604.01,604.01001,,,,...,,,,,,,,,,
419,23,0,,0.0,0.0,19.9,19.9,,,,...,,,,,,,,,,


Lets yeet these doubles out
The length of the dataframe is 640
There are now 0 district duplicates


In [178]:
# framedict['Deaths'] is the frame of cases and deaths per district and date.
framedict['Deaths'].head()


Unnamed: 0,pc11_state_id,pc11_district_id,date,total_cases,total_deaths
0,1,1,30jan2020,0.0,0.0
1,1,1,02feb2020,0.0,0.0
2,1,1,03feb2020,0.0,0.0
3,1,1,02mar2020,0.0,0.0
4,1,1,03mar2020,0.0,0.0


In [179]:
# Use 'censuscode' as the name of the district id column.
df = framedict['Deaths'].rename(columns={"pc11_district_id": "censuscode"})
df.keys()

# Drop the odd district codes 999 and 0.
is_real_district = (df['censuscode'] != 999) & (df['censuscode'] != 0)
df = df[is_real_district]

deaths_frame = df


Index(['pc11_state_id', 'censuscode', 'date', 'total_cases', 'total_deaths'], dtype='object')

In [180]:
# Number of rows left in the cleaned deaths table, followed by a preview of it.
len(df)
df.head()

250710

Unnamed: 0,pc11_state_id,censuscode,date,total_cases,total_deaths
0,1,1,30jan2020,0.0,0.0
1,1,1,02feb2020,0.0,0.0
2,1,1,03feb2020,0.0,0.0
3,1,1,02mar2020,0.0,0.0
4,1,1,03mar2020,0.0,0.0


In [181]:
# We can use this dataframe to filter on specific date that we are interested in.

def stripframe(df, date):
    """Split a long per-day table into one snapshot per requested date.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'date' column; rows are selected with ``df['date'] == <date>``.
    date : scalar or iterable of scalars
        A single date (e.g. ``'28mar2021'``) or a list, tuple or other iterable
        of dates.

    Returns
    -------
    dict
        Maps every requested date to a DataFrame with the rows of that date.
        Only numeric columns are kept (so 'date' itself is dropped), columns
        are sorted by name, rows and columns that are entirely NaN are removed,
        and the row labels of ``df`` are preserved.
    """
    # A string is iterable, but it is one date and not a collection of dates.
    if isinstance(date, str) or not hasattr(date, '__iter__'):
        dates = [date]
    else:
        dates = list(date)

    retdict = {}
    for day in dates:
        rows = df.loc[df['date'] == day]
        # This used to be rows.pivot_table(index=rows.index, columns=[]), which
        # only produced this shape because pandas silently dropped the columns
        # it could not average (the text 'date' column). Newer pandas versions
        # raise instead, so the same result is now built explicitly.
        snapshot = (
            rows.select_dtypes(include='number')
            .dropna(how='all')            # pivot_table's default dropna=True removed
            .dropna(how='all', axis=1)    # rows and columns that are entirely NaN
            .sort_index()
            .sort_index(axis=1)
        )
        retdict[day] = snapshot
    return retdict

# Example of how to extract specific dates from the deaths table.
# A single date string (e.g. '28mar2021') is accepted as well as a list.
dates = ['28mar2021', '04apr2021', '13apr2021']

# Pass the cleaned deaths table by name rather than the scratch variable `df`,
# which is re-bound in many cells and may hold a different table when cells
# are run out of order.
datedict = stripframe(deaths_frame, dates)

# Check that the data has been parsed correctly.
print('Lengths of the dataframes are: ', [(i, len(j)) for i,j in datedict.items()])

# A plain loop instead of a list comprehension: the printing is the goal here,
# there is no result worth collecting.
for day, snapshot in datedict.items():
    print('\n'+str(day)+'\n', snapshot)


Lengths of the dataframes are:  [('28mar2021', 610), ('04apr2021', 610), ('13apr2021', 610)]

28mar2021
         censuscode  pc11_state_id  total_cases  total_deaths
394              1              1       5791.0          97.0
805              2              1       8145.0         120.0
1216             3              1       7828.0          86.0
1627             4              1       2146.0          44.0
2038             5              1       2542.0          25.0
...            ...            ...          ...           ...
261379         635             34      32325.0         550.0
261790         636             34       2303.0          10.0
262201         637             34       4331.0          75.0
263023         639             35          0.0           0.0
263434         640             35          0.0           0.0

[610 rows x 4 columns]

04apr2021
         censuscode  pc11_state_id  total_cases  total_deaths
401              1              1       5857.0          97.0
812  

In [182]:
# Based on the length, we can see that there are still 30
# missing entries. Lets find them. 

# This function can check what items are missing in a series. In our case: 1-640.
def missingcheck(column, length):
    """Report which integers in ``1 .. length - 1`` do not occur in ``column``.

    Parameters
    ----------
    column : pandas.Series or other iterable
        The values to check, e.g. a column of district codes.
    length : int
        Exclusive upper bound of the expected range; pass 641 to check 1-640.

    Returns
    -------
    list of int
        The missing values in ascending order. The number of missing values
        is printed as well.
    """
    # Collect the distinct values once instead of once per candidate:
    # they do not change while the range is scanned.
    values = column.unique() if hasattr(column, 'unique') else column
    present = set(values)

    missing = [item for item in range(1, length) if item not in present]
    print(len(missing), "values are missing.")
    return missing

In [183]:
# Collect the missing district codes of every snapshot and check whether
# they are the same for all three dates.
unique = [missingcheck(snapshot['censuscode'], 641) for snapshot in datedict.values()]

unique[0] == unique[1] == unique[2]
print(unique[1])

30 values are missing.
30 values are missing.
30 values are missing.


True

[95, 241, 242, 243, 244, 272, 273, 274, 275, 276, 279, 280, 300, 303, 305, 306, 308, 309, 310, 311, 314, 315, 317, 319, 320, 324, 325, 326, 586, 638]


In [184]:
## Making a list of all the districts with codes: names_frame

# Columns of the imported key table, before it is reduced.
names_frame.keys()

# Keep one row per district: drop the 'shrid' column and, for every district id,
# all rows after its first occurrence.
df = (
    names_frame
    .drop(columns=['shrid'])
    .drop_duplicates(subset=['pc11_district_id'], keep='first')
)
print("Being:", missingcheck(df['pc11_district_id'], 641))

# Use 'censuscode' as the name of the district id column and store the result.
df = df.rename(columns={"pc11_district_id": "censuscode"})
names_frame = df
missingcheck(names_frame['censuscode'], 641)

Index(['pc11_state_id', 'pc11_state_name', 'pc11_district_id',
       'pc11_district_name', 'shrid'],
      dtype='object')

18 values are missing.
Being: [90, 91, 92, 93, 94, 95, 96, 97, 98, 474, 492, 493, 518, 519, 535, 536, 537, 572]
18 values are missing.


[90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 474,
 492,
 493,
 518,
 519,
 535,
 536,
 537,
 572]

In [185]:
# Refresh the lookup of tables with the current frames and add the district names.
framedict = dict(
    Deaths=deaths_frame,
    Vaccination=vacc_frame,
    Demographics=demog_frame,
    Age=age_frame,
    Hospitals=hosp_frame,
    Names=names_frame,
)

In [186]:
# The dates for which a snapshot is available in datedict.
datedict.keys()

dict_keys(['28mar2021', '04apr2021', '13apr2021'])

In [187]:
# Approximate the average age per district from the 5-year age bins:
# the share of every bin is multiplied by the lower bound of that bin.
# NOTE(review): weighting by the lower bound instead of the bin midpoint
# presumably understates the real average age — confirm that this is intended.
steps = np.arange(0, 90, 5)

steps_frame = pd.DataFrame(
    {
        'age_' + str(lower) + '_t_share': age_frame['age_' + str(lower) + '_t_share'] * lower
        for lower in steps
    }
)

# Sum of the weighted shares per district.
steps_frame['total'] = steps_frame.sum(axis=1)

# Each district's total divided by the largest column sum of steps_frame.
avg = steps_frame['total'] / steps_frame.sum().max()


In [193]:
# Use 'censuscode' as the name of the district id column and attach the
# weighted age total computed in the previous cell.
df = age_frame.rename(columns={"pc11_district_id": "censuscode"})
df['average_age'] = steps_frame['total']

# District codes between 1 and 640 that do not occur in the age table.
missing = missingcheck(df['censuscode'], 641)
print(missing)

age_frame = df
age_frame

23 values are missing.
[496, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 634, 635, 636, 637, 638, 639, 640]


Unnamed: 0,pc11_state_id,censuscode,sector_present,pc11_pca_tot_u,pc11_pca_tot_r,pc11_pca_tot_t,age_0_r_share,age_5_r_share,age_10_r_share,age_15_r_share,...,age_80_r,age_80_u,age_80_t,age_80_t_share,age_85_r,age_85_u,age_85_t,age_85_t_share,average age,average_age
0,1,1,3 urban and rural,104729,765625,870354,0.114430,0.137691,0.127619,0.105311,...,4110,578,4688,0.005386,2505,281,2786,0.003201,23.130876,23.130876
1,1,2,3 urban and rural,97912,655833,753745,0.124514,0.145426,0.119529,0.102073,...,2797,379,3176,0.004214,1461,204,1665,0.002209,22.392261,22.392261
2,1,3,3 urban and rural,45671,87816,133487,0.068943,0.086203,0.090310,0.093675,...,820,258,1078,0.008076,490,285,775,0.005806,29.451669,29.451669
3,1,4,3 urban and rural,16338,124464,140802,0.083229,0.118205,0.112861,0.111868,...,458,66,524,0.003722,376,62,438,0.003111,24.916550,24.916550
4,1,5,3 urban and rural,38630,438205,476835,0.098316,0.138034,0.130590,0.105982,...,2786,282,3068,0.006434,1989,149,2138,0.004484,23.933174,23.933174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,33,629,3 urban and rural,1539802,330572,1870374,0.052769,0.070436,0.079356,0.080769,...,2413,12239,14652,0.007834,1421,7495,8916,0.004767,32.183590,32.183590
613,33,630,3 urban and rural,260912,1245931,1506843,0.060513,0.083318,0.092509,0.094854,...,6636,1590,8226,0.005459,3620,891,4511,0.002994,28.769908,28.769908
614,33,631,3 urban and rural,428363,1451446,1879809,0.064731,0.084366,0.094125,0.096558,...,8766,2031,10797,0.005744,4866,1235,6101,0.003246,28.080297,28.080297
615,33,632,3 urban and rural,2618940,839105,3458045,0.045554,0.061322,0.073881,0.077948,...,7526,19180,26706,0.007723,4394,12390,16784,0.004854,32.493332,32.493332


In [189]:
# Set up the champion frame: start from the snapshot of 13 April 2021 and attach
# the 'pc11_pca_tot_p' column of the demographic table, matched on the district code.
population = demog_frame[['censuscode', 'pc11_pca_tot_p']]
champion = datedict['13apr2021'].merge(population, how='left', on='censuscode')

# Express deaths and cases per 100.000 of 'pc11_pca_tot_p'.
for count_col, rate_col in (('total_deaths', 'deaths per 100.000'),
                            ('total_cases', 'cases per 100.000')):
    champion[rate_col] = champion[count_col] / champion['pc11_pca_tot_p'] * 1e5

champion

Unnamed: 0,censuscode,pc11_state_id,total_cases,total_deaths,pc11_pca_tot_p,deaths per 100.000,cases per 100.000
0,1,1,6057.0,98.0,870354.0,11.259786,695.923728
1,2,1,8678.0,123.0,753745.0,16.318516,1151.317753
2,3,1,8705.0,87.0,133487.0,65.174886,6521.234277
3,4,1,2200.0,44.0,140802.0,31.249556,1562.477806
4,5,1,2589.0,27.0,476835.0,5.662336,542.955110
...,...,...,...,...,...,...,...
605,635,34,35037.0,558.0,950289.0,58.718979,3686.983644
606,636,34,2448.0,12.0,41816.0,28.697149,5854.218481
607,637,34,5231.0,80.0,200222.0,39.955649,2612.600014
608,639,35,0.0,0.0,105597.0,0.000000,0.000000


In [190]:
# Reference table of the variables of interest (label, dataset, code, folder, remarks).
interesting_frame

Unnamed: 0,label,dataset,code,folder,remarks
0,Literacy,pc11_demographics_subdistrict,pc11_pca_p_lit,demography,
1,population density,pc11_demographics_subdistrict,pc11_pop_dens,demography,
2,slum population,pc11_demographics_subdistrict,pc11_slum_pop,demography,only for urban
3,Urban population share,pc11_demographics_subdistrict,pc11_urb_share,demography,
4,covid infections,covid_infected_deaths,total_cases,covid,
5,covid deaths,covid_infected_deaths,total_deaths,covid,
6,dates,covid_infected_deaths,date,covid,
7,district id,covid_infected_deaths,lgd_district_id,covid,
8,site vaccination progress,covid_vaccination,total_sites,covid,
9,first dose vaccination,covid_vaccination,first_dose_admin,covid,


In [194]:
# This section adds the interesting data to the champion frame (hardcoded, sorry).
#
# Every value is looked up by district code. Assigning e.g. demog_frame[...] directly
# would align on the row labels instead, and the row labels of champion (renumbered
# by the merge that created it) do not correspond to those of the demographic,
# hospital and age tables — districts would silently receive each other's values.

def _by_district(frame, column):
    """Return `column` of `frame`, matched to champion's rows via 'censuscode'.

    Districts of champion that are absent from `frame` get NaN. `frame` must
    contain every census code at most once; pandas raises otherwise.
    """
    lookup = frame.set_index('censuscode')[column]
    return champion['censuscode'].map(lookup)

champion['Literacy'] =                            _by_district(demog_frame, 'pc11_pca_p_lit')
champion['population density'] =                  _by_district(demog_frame, 'pc11_pop_dens')
champion['slum population'] =                     _by_district(demog_frame, 'pc11_slum_pop')
champion['Urban population share'] =              _by_district(demog_frame, 'pc11_urb_share')
champion['Beds (IC)'] =                           _by_district(hosp_frame, 'dlhs4_dh_icu_beds')
# Total beds come from their own column; this used to repeat the ICU beds column.
champion['Beds (total)'] =                        _by_district(hosp_frame, 'dlhs4_total_beds')
champion['Care facilities (hospitals/clinics)'] = _by_district(hosp_frame, 'dlhs4_total_facilities')
champion['average age'] =                         _by_district(age_frame, 'average_age')

champion.keys()

Index(['censuscode', 'pc11_state_id', 'total_cases', 'total_deaths',
       'pc11_pca_tot_p', 'deaths per 100.000', 'cases per 100.000', 'Literacy',
       'population density', 'slum population', 'Urban population share',
       'Beds (IC)', 'Beds (total)', 'Care facilities (hospitals/clinics)',
       'average age'],
      dtype='object')