#### Importing important data
In this notebook I try to import the data as efficiently as possible.
The data I am particularly interested in is listed in the Google Sheets document "interesting data".
The categories that are defined so far are:
###### Demographics
1. Literacy
2. population density
3. slum population
4. Urban population share
###### Deaths
5. covid infections
6. covid deaths
7. dates
8. district id
###### Vaccination
9. site vaccination progress
10. first dose vaccination
11. dates
12. district id
###### Age
13. age per 5 year categories to make a map of where older people live

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Locations of the data files on disk. Adjust `base_dir` to wherever the project lives.
# The r"..." prefix makes each literal a raw string, so the backslashes of the
# Windows paths are kept as-is instead of being read as escape sequences.

# Base directory
base_dir = r"C:\Users\danie\OneDrive\Bureaublad\Coding\EPA introduction to datascience\Intro to datascience project"

# Sub-folders (appended to base_dir) that contain the csv files
covid_folder = r"\covid_data\covid\csv"
demog_folder = r"\covid_data\demography\csv"

# File names inside those sub-folders
deaths_file = r"\covid_infected_deaths_pc11.csv"
vacc_file = r"\covid_vaccination.csv"  # we can't use this because it contains new district numbering
demog_file = r"\pc11_demographics_district.csv"
age_file = r"\age_bins_district_t_pc11.csv"

# Full path of every file: base directory + sub-folder + file name
deaths_dir = "".join([base_dir, covid_folder, deaths_file])
vacc_dir = "".join([base_dir, covid_folder, vacc_file])  # NA
demog_dir = "".join([base_dir, demog_folder, demog_file])
age_dir = "".join([base_dir, demog_folder, age_file])

dirlist = [deaths_dir, vacc_dir, demog_dir, age_dir]

In [33]:
# Read every csv file into its own DataFrame.
deaths_frame = pd.read_csv(deaths_dir)
vacc_frame = pd.read_csv(vacc_dir)
demog_frame = pd.read_csv(demog_dir)
age_frame = pd.read_csv(age_dir)

# Collect the frames under a readable dataset name.
framedict = dict(
    Deaths=deaths_frame,
    Vaccination=vacc_frame,
    Demographics=demog_frame,
    Age=age_frame,
)

# Report the number of rows of each frame.
for name, frame in framedict.items():
    print(name, " has ", len(frame), " entries")

# The large row counts belong to frames with daily entries;
# the frames with roughly 640 rows hold one entry per district.

Deaths  has  263862  entries
Vaccination  has  210103  entries
Demographics  has  643  entries
Age  has  617  entries


In [4]:
# List the row count and the column labels of every loaded frame.
print("The keys of the dataframes are: \n")
for name, frame in framedict.items():
    print(name, ":")
    print('length: ', len(frame))
    print(frame.keys())

The keys of the dataframes are: 

Deaths :
length:  263862
Index(['pc11_state_id', 'pc11_district_id', 'date', 'total_cases',
       'total_deaths'],
      dtype='object')
Vaccination :
length:  210103
Index(['lgd_state_id', 'lgd_state_name', 'lgd_district_id',
       'lgd_district_name', 'date', 'total_individuals_registered',
       'total_sessions_conducted', 'total_sites', 'total_covaxin',
       'total_covishield', 'first_dose_admin', 'second_dose_admin', 'male_vac',
       'female_vac', 'trans_vac', 'state', 'district', 'bad_flg_covishield',
       'bad_flg_covaxin'],
      dtype='object')
Demographics :
length:  643
Index(['pc11_state_id', 'pc11_district_id', 'pc11_urb_share', 'pc11_slum_pop',
       'pc11_vd_area', 'pc11_td_area', 'pc11_tot_area', 'pc11_pop_dens',
       'pc11r_pca_tot_p', 'pc11u_pca_tot_p', 'pc11_pca_tot_p',
       'pc11r_pca_tot_m', 'pc11u_pca_tot_m', 'pc11_pca_tot_m',
       'pc11r_pca_tot_f', 'pc11u_pca_tot_f', 'pc11_pca_tot_f',
       'pc11r_pca_p_lit', 'p

In [5]:
# Load the spreadsheet that lists the variables we want to use
# (its columns are label, dataset, code, folder and remarks).
# This cell only loads and displays that list; no filtering happens here.
interesting_dir = base_dir + r"\variablecodes.xlsm"
interesting_frame = pd.read_excel(interesting_dir)
interesting_frame

Unnamed: 0,label,dataset,code,folder,remarks
0,Literacy,Demographics,pc11_pca_p_lit,demography,
1,population density,Demographics,pc11_pop_dens,demography,
2,slum population,Demographics,pc11_slum_pop,demography,only for urban
3,Urban population share,Demographics,pc11_urb_share,demography,
4,covid infections,Deaths,total_cases,covid,
5,covid deaths,Deaths,total_deaths,covid,
6,dates,Deaths,date,covid,
7,district id,Deaths,lgd_district_id,covid,
8,site vaccination progress,Vaccination,total_sites,covid,
9,first dose vaccination,Vaccination,first_dose_admin,covid,


In [6]:
# framedict['Deaths'] holds total_cases and total_deaths per district and date.
# Show the first rows as a quick sanity check.
framedict['Deaths'].head()


Unnamed: 0,pc11_state_id,pc11_district_id,date,total_cases,total_deaths
0,1,1,30jan2020,0.0,0.0
1,1,1,02feb2020,0.0,0.0
2,1,1,03feb2020,0.0,0.0
3,1,1,02mar2020,0.0,0.0
4,1,1,03mar2020,0.0,0.0


In [150]:
# Prepare the deaths frame:
#   1. rename the district id column to 'censuscode';
#   2. drop the rows with district code 999 (presumably not a real district - TODO confirm).
df = (
    framedict['Deaths']
    .rename(columns={"pc11_district_id": "censuscode"})
    .loc[lambda frame: frame['censuscode'] != 999]
)


In [151]:
# Display the prepared deaths frame (column renamed to 'censuscode', district 999 removed).
df

Unnamed: 0,pc11_state_id,censuscode,date,total_cases,total_deaths
0,1,1,30jan2020,0.0,0.0
1,1,1,02feb2020,0.0,0.0
2,1,1,03feb2020,0.0,0.0
3,1,1,02mar2020,0.0,0.0
4,1,1,03mar2020,0.0,0.0
...,...,...,...,...,...
263446,35,640,09apr2021,0.0,0.0
263447,35,640,10apr2021,0.0,0.0
263448,35,640,11apr2021,0.0,0.0
263449,35,640,12apr2021,0.0,0.0


In [149]:
# We can use this dataframe to filter on specific dates.
# This code does that:
def stripframe(df, date):
    """Split `df` into one frame per requested date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'date' column whose values are compared with `==`
        against the requested date(s).
    date : scalar, list or tuple
        A single date value, or a list/tuple of date values.

    Returns
    -------
    dict
        Maps every requested date to a frame holding the rows of `df` for
        that date. The original row index is kept and only the numeric
        (and boolean) columns are returned, as produced by `pivot_table`.
    """
    # Wrap a single value; lists AND tuples are treated as collections of dates.
    if not isinstance(date, (list, tuple)):
        date = [date]
    retdict = {}
    for i in date:
        newframe = df[df['date'] == i]
        # pivot_table aggregates with the mean by default. Non-numeric columns
        # (such as the 'date' strings) cannot be averaged: older pandas versions
        # dropped them silently, newer versions raise a TypeError. Selecting the
        # aggregatable columns up front gives the same result on both.
        numeric = newframe.select_dtypes(include=["number", "bool"])
        # Pivot on the original row index so every row stays a separate entry.
        df2 = numeric.pivot_table(index=numeric.index, columns=[])

        retdict[i] = df2
    return retdict

# Example of how to extract specific dates from the dataframe.

dates = ['28mar2021', '04apr2021', '11apr2021']
# A single date string works as well, e.g. dates = '28mar2021'

datedict = stripframe(df, dates)

# Check that the data has been parsed correctly.
print('Lengths of the dataframes are: ', [(day, len(frame)) for day, frame in datedict.items()])
# Plain loop instead of a list comprehension: print() returns None,
# so there is nothing worth collecting in a list.
for day, frame in datedict.items():
    print('\n' + str(day) + '\n', frame)


Lengths of the dataframes are:  [('28mar2021', 612), ('04apr2021', 612), ('11apr2021', 612)]

28mar2021
         censuscode  pc11_state_id  total_cases  total_deaths
394              1              1       5791.0          97.0
805              2              1       8145.0         120.0
1216             3              1       7828.0          86.0
1627             4              1       2146.0          44.0
2038             5              1       2542.0          25.0
...            ...            ...          ...           ...
261379         635             34      32325.0         550.0
261790         636             34       2303.0          10.0
262201         637             34       4331.0          75.0
263023         639             35          0.0           0.0
263434         640             35          0.0           0.0

[612 rows x 4 columns]

04apr2021
         censuscode  pc11_state_id  total_cases  total_deaths
401              1              1       5857.0          97.0
812  

Unnamed: 0,pc11_state_id,pc11_district_id,pc11_urb_share,pc11_slum_pop,pc11_vd_area,pc11_td_area,pc11_tot_area,pc11_pop_dens,pc11r_pca_tot_p,pc11u_pca_tot_p,...,pc11_pca_f_sc,pc11r_pca_p_st,pc11u_pca_p_st,pc11_pca_p_st,pc11r_pca_m_st,pc11u_pca_m_st,pc11_pca_m_st,pc11r_pca_f_st,pc11u_pca_f_st,pc11_pca_f_st
0,1,0,,0.000000,0.000000,316.3100,316.310000,,,,...,,,,,,,,,,
1,1,1,0.120329,20475.000000,615.953702,47.3400,663.293700,1312.16980,765625.0,104729.0,...,2.0,68491.0,1861.0,70352.0,35855.0,1058.0,36913.0,32636.0,803.0,33439.0
2,1,2,0.129901,37610.903809,672.810101,43.6800,716.490110,1051.99630,655833.0,97912.0,...,25.0,23293.0,619.0,23912.0,12025.0,358.0,12383.0,11268.0,261.0,11529.0
3,1,3,0.342138,7763.000000,369.884999,24.0100,393.894990,338.88980,87816.0,45671.0,...,44.0,73789.0,22068.0,95857.0,36318.0,11225.0,47543.0,37471.0,10843.0,48314.0
4,1,4,0.116035,0.000000,186.618100,2.1400,188.758100,745.93884,124464.0,16338.0,...,6.0,110840.0,11496.0,122336.0,56693.0,5959.0,62652.0,54147.0,5537.0,59684.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
638,34,636,1.000000,4059.000000,0.000000,9.0000,9.000000,4646.22220,0.0,41816.0,...,67.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
639,34,637,0.489966,26890.000000,121.650500,38.3000,159.950500,1251.77480,102120.0,98102.0,...,18142.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
640,35,638,0.000000,0.000000,69.906901,0.0000,69.906898,527.01520,36842.0,0.0,...,0.0,23681.0,0.0,23681.0,12198.0,0.0,12198.0,11483.0,0.0,11483.0
641,35,639,0.025957,0.000000,401.905198,5.9500,407.855190,258.90808,102856.0,2741.0,...,0.0,748.0,10.0,758.0,410.0,5.0,415.0,338.0,5.0,343.0
