#### Importing important data
In this code I am trying to import the data as efficient as possible.
The data I am particulary interested in can be found in the google sheet document "interesting data"
The categories that are defined so far are:

###### Demographics
1. Literacy
2. population density
3. slum population
4. Urban population share

###### Hospital data
5. Beds (IC)
6. Beds (total) 	
7. Care facilities (hospitals/clinics) 	

###### Age
8. average age per district

###### Deaths
9. covid infections
10. covid deaths

(see also interesting_frame)

In [84]:
import pandas as pd
import numpy as np

In [85]:
#As always we have to specify to jupyter that we want to see all the outputs.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.options.display.max_rows = 20


In [86]:
# We need to be able to access the right folder. I put mine in the following folders. This can be adjusted easily
# Keep in mind that the 'r' in front of the string converts
# the string to a 'raw' string, bc of conflicts with \ and " ' characters

# Base directory
base_dir =       r"./covid_data"

# This section specifies the file name directories that I want to use
covid_folder =  r"/covid/csv"
demog_folder =  r"/demography/csv"
hosp_folder =  r"/hospitals/csv"

# This section identifies the actual files
deaths_file =   r"/covid_infected_deaths_pc11.csv"
#vacc_file =     r"/covid_vaccination.csv" #we cant use this because it contains new district numbering
demog_file =    r"/pc11_demographics_district.csv"
age_file =      r"/age_bins_district_t_pc11.csv"
hosp_file =     r"/dlhs4_hospitals_dist_pc11.csv"
names_file =     r"/shrug_pc11_district_key.csv"

deaths_dir = base_dir + covid_folder + deaths_file
#vacc_dir =   base_dir + covid_folder + vacc_file #NA bc it does not use the standard 2011 district listing
demog_dir =  base_dir + demog_folder + demog_file
age_dir =    base_dir + demog_folder + age_file
hosp_dir =   base_dir + hosp_folder + hosp_file
names_dir =  base_dir + names_file

dirlist = [deaths_dir, demog_dir, age_dir, hosp_dir, names_dir]


In [87]:
# Here is the overview of interesting variables that we want to use for our first analysis.
interesting_dir = base_dir + r"/variablecodes.xlsx"
interesting_frame = pd.read_excel(interesting_dir)
interesting_frame['code']

0                    pc11_pca_p_lit
1                     pc11_pop_dens
2                     pc11_slum_pop
3                    pc11_urb_share
4                 dlhs4_dh_icu_beds
5                  dlhs4_total_beds
6            dlhs4_total_facilities
7    age_i_t [for i in all columns]
8                       total_cases
9                      total_deaths
Name: code, dtype: object

In [88]:
deaths_frame = pd.read_csv(deaths_dir)
#vacc_frame   = pd.read_csv(vacc_dir)
demog_frame  = pd.read_csv(demog_dir)
age_frame    = pd.read_csv(age_dir)
hosp_frame   = pd.read_csv(hosp_dir)
names_frame  = pd.read_csv(names_dir)

framedict = {"Deaths" : deaths_frame, 
             #"Vaccination" : vacc_frame,
             "Demographics" : demog_frame, 
             "Age" : age_frame,
             "Hospitals" : hosp_frame}

for key in framedict:
    print(key," has ",len(framedict[key])," entries")
    
#longer numbers represent the entries of daily entries. Entries around 640 contain district entries.

Deaths  has  263862  entries
Demographics  has  643  entries
Age  has  617  entries
Hospitals  has  552  entries


In [89]:
print("The keys of the dataframes are: \n")
for key, value in framedict.items():
    print(key, ":")
    print('length: ',len(framedict[key]))
    print(value.keys())

The keys of the dataframes are: 

Deaths :
length:  263862
Index(['pc11_state_id', 'pc11_district_id', 'date', 'total_cases',
       'total_deaths'],
      dtype='object')
Demographics :
length:  643
Index(['pc11_state_id', 'pc11_district_id', 'pc11_urb_share', 'pc11_slum_pop',
       'pc11_vd_area', 'pc11_td_area', 'pc11_tot_area', 'pc11_pop_dens',
       'pc11r_pca_tot_p', 'pc11u_pca_tot_p', 'pc11_pca_tot_p',
       'pc11r_pca_tot_m', 'pc11u_pca_tot_m', 'pc11_pca_tot_m',
       'pc11r_pca_tot_f', 'pc11u_pca_tot_f', 'pc11_pca_tot_f',
       'pc11r_pca_p_lit', 'pc11u_pca_p_lit', 'pc11_pca_p_lit',
       'pc11r_pca_m_lit', 'pc11u_pca_m_lit', 'pc11_pca_m_lit',
       'pc11r_pca_f_lit', 'pc11u_pca_f_lit', 'pc11_pca_f_lit',
       'pc11r_pca_p_sc', 'pc11u_pca_p_sc', 'pc11_pca_p_sc', 'pc11r_pca_m_sc',
       'pc11u_pca_m_sc', 'pc11_pca_m_sc', 'pc11r_pca_f_sc', 'pc11u_pca_f_sc',
       'pc11_pca_f_sc', 'pc11r_pca_p_st', 'pc11u_pca_p_st', 'pc11_pca_p_st',
       'pc11r_pca_m_st', 'pc11u_pca_m

In [90]:
# Based on some of the lengths, we can see that there are still 
# missing entries. Lets find them. 

# This function can check what items are missing in a series. In our case: 1-640.
def missingcheck(column : 'that needs checking',
                 length : 'length of the array that you want to check'):
    
    array = np.arange(1,(length),1)
    missing = []
    for item in array:
        if item not in column.unique():
            #print("nice")
            missing.append(item)
    print(len(missing), "values are missing.")
    return missing

In [91]:
##Cleaning up names_frame


# Making a list of all the districts with codes: names_frame

# We made this list based on an import.
df = names_frame
df.keys()

#Dropping the shrid column, duplicates and na rows
df = df.drop(columns=['shrid'])
df = df.drop_duplicates(keep='first', subset = ['pc11_district_id'])
df = df.dropna(subset=['pc11_district_id'])

print("Being:",missingcheck(df['pc11_district_id'], 641))
df = df.rename(columns={"pc11_district_id": "censuscode"})
names_frame = df
print("Being:",missingcheck(names_frame['censuscode'],641))

# To fill the missing numbers we have found the following data manually:

# This is the data
censuscode = [90,91,92,93,94,95,96,97,98,474,492,493,518,519,535,536,537,572]
district = ['delhi (north west)','delhi (north)','delhi (north east)','delhi (east)',
            'delhi (new delhi)', 'delhi (central)','delhi (west)','delhi (south west)',
            'delhi (north east)','ahmadabad','surat','tapi','unknown','mumbai','medak',
            'hyderabad','rangareddy','bangalore']

# Here we make a df, to concat later on
missing_frame = pd.DataFrame()
missing_frame['censuscode'] = censuscode
missing_frame['pc11_district_name'] = district

names_frame = pd.concat([names_frame,missing_frame])
names_frame = names_frame.sort_values(by=['censuscode'])

print("Cleaning Up..")
print("Now",missingcheck(names_frame['censuscode'], 641))


Index(['pc11_state_id', 'pc11_state_name', 'pc11_district_id',
       'pc11_district_name', 'shrid'],
      dtype='object')

18 values are missing.
Being: [90, 91, 92, 93, 94, 95, 96, 97, 98, 474, 492, 493, 518, 519, 535, 536, 537, 572]
18 values are missing.
Being: [90, 91, 92, 93, 94, 95, 96, 97, 98, 474, 492, 493, 518, 519, 535, 536, 537, 572]
Cleaning Up..
0 values are missing.
Now []


In [92]:
## Cleaning Hospital data

df = hosp_frame
# We only have to rename our district code to 'censuscode' 
df = df.rename(columns={"pc11_district_id": "censuscode"})

# Some districts are missing in the hosp data. This makes sure they are in there as NaN
df = pd.merge(df, names_frame['censuscode'], on = 'censuscode', how = 'right')

# Andd drop wierd 0th and 999th district 
# these district codes represent not attributed cases and unclear cases
df = df[df['censuscode'] != 999]
df = df[df['censuscode'] != 0]
df = df.sort_values(by=['censuscode'])

In [93]:
# This is what the new frame looks like
hosp_frame = df
hosp_frame
print("The size of the hosp_frame is ",len(hosp_frame))

Unnamed: 0,pc11_state_id,censuscode,dlhs4_dh_beds,dlhs4_dh_count,dlhs4_dh_staff,dlhs4_dh_icu_beds,dlhs4_chc_beds,dlhs4_chc_count,dlhs4_chc_staff,dlhs4_chc_beds_ven,dlhs4_phc_beds,dlhs4_phc_count,dlhs4_phc_staff,dlhs4_phc_pop,dlhs4_phc_beds_oxy,pc11_pca_tot_p,dlhs4_phc_mult,dlhs4_total_beds,dlhs4_total_staff,dlhs4_total_facilities
0,,1.0,,,,,,,,,,,,,,,,,,
1,,2.0,,,,,,,,,,,,,,,,,,
2,,3.0,,,,,,,,,,,,,,,,,,
3,,4.0,,,,,,,,,,,,,,,,,,
4,,5.0,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,34.0,636.0,171.0,1.0,1429.0,21.0,30.0,1.0,181.0,0.0,1.0,1.0,119.0,2810.0,0.0,41816.0,14.881139,215.88113,3380.85550,16.881138
636,34.0,637.0,0.0,0.0,0.0,,62.0,3.0,63.0,0.0,30.0,7.0,95.0,67220.0,30.0,200222.0,2.978608,151.35823,345.96771,23.850254
637,35.0,638.0,53.0,1.0,41.0,0.0,5.0,1.0,56.0,0.0,61.0,4.0,74.0,13991.0,51.0,36842.0,2.633264,218.62912,291.86157,12.533057
638,35.0,639.0,70.0,1.0,81.0,0.0,145.0,2.0,81.0,0.0,90.0,7.0,104.0,41395.0,90.0,105597.0,2.550960,444.58643,427.29987,20.856722


The size of the hosp_frame is  640


In [94]:
## Cleaning up Demographic data

df = demog_frame

# First, we standardize the district id to the name "censuscode"
df = df.rename(columns={'pc11_district_id' : 'censuscode'})

# Second, we check the length and whether it adds up to 640 districts.
print("The length of the dataframe is", len(df))

## This length implicates that there are wierd censuscodes or duplicates.

print("There are", df['censuscode'].duplicated().sum(), "district duplicates")

#This creates a frame that shows the duplicates
df.loc[df['censuscode'].duplicated(),:]

## Both these districts have censuscode 0, which doesn't work with our list.

print("Lets yeet these doubles out")

# Lets yeet them out
df = df[df['censuscode'] != 0]

# Analysis to see if we succeeded
print("The length of the dataframe is", len(df))
print("There are now", df['censuscode'].duplicated().sum(), "district duplicates")
df = df.sort_values(by=['censuscode'])
demog_frame = df




The length of the dataframe is 643
There are 2 district duplicates


Unnamed: 0,pc11_state_id,censuscode,pc11_urb_share,pc11_slum_pop,pc11_vd_area,pc11_td_area,pc11_tot_area,pc11_pop_dens,pc11r_pca_tot_p,pc11u_pca_tot_p,...,pc11_pca_f_sc,pc11r_pca_p_st,pc11u_pca_p_st,pc11_pca_p_st,pc11r_pca_m_st,pc11u_pca_m_st,pc11_pca_m_st,pc11r_pca_f_st,pc11u_pca_f_st,pc11_pca_f_st
90,7,0,,0.0,0.0,604.01,604.01001,,,,...,,,,,,,,,,
419,23,0,,0.0,0.0,19.9,19.9,,,,...,,,,,,,,,,


Lets yeet these doubles out
The length of the dataframe is 640
There are now 0 district duplicates


In [95]:
#Framedict['Deaths'] is the frame of casualties and cases per day.
framedict['Deaths'].head()


Unnamed: 0,pc11_state_id,pc11_district_id,date,total_cases,total_deaths
0,1,1,30jan2020,0.0,0.0
1,1,1,02feb2020,0.0,0.0
2,1,1,03feb2020,0.0,0.0
3,1,1,02mar2020,0.0,0.0
4,1,1,03mar2020,0.0,0.0


In [96]:
df = framedict['Deaths']
# We only have to rename our district code to 'censuscode' 
df = df.rename(columns={"pc11_district_id": "censuscode"})

#andd drop wierd 999th district
df = df[df['censuscode'] != 999]
df.keys()
df = df[df['censuscode'] != 0]
df = df.sort_values(by=['censuscode'])
deaths_frame = df


Index(['pc11_state_id', 'censuscode', 'date', 'total_cases', 'total_deaths'], dtype='object')

In [97]:
len(df)
df.head()

250710

Unnamed: 0,pc11_state_id,censuscode,date,total_cases,total_deaths
0,1,1,30jan2020,0.0,0.0
280,1,1,04dec2020,5244.0,85.0
279,1,1,03dec2020,5234.0,84.0
278,1,1,02dec2020,5224.0,84.0
277,1,1,01dec2020,5208.0,83.0


In [98]:
# We can use this dataframe to filter on specific date that we are interested in.

def stripframe(df, date):
    if type(date) != list:
        date = [date]
    retdict = {}
    for i in date:
        newframe = df[df['date'] == i] 
        # This dataframe has to be pivotted to conform to the district name on the left.
        df2 = newframe.pivot_table(index = newframe.index, columns = [])
    
        retdict.update({i : df2})
    return retdict

# this code is an example on how to extract dates from this dataframe.

In [99]:
dates = ['21mar2021','28mar2021', '04apr2021', '13apr2021']
#dates = '28mar2021'

datedict = stripframe(df, dates)    

#check if the data has been parsed correctly.
print('Lengths of the dataframes are: ', [(i, len(j)) for i,j in datedict.items()])
overview = [print('\n'+str(i)+'\n',j) for i,j in datedict.items()]


Lengths of the dataframes are:  [('21mar2021', 610), ('28mar2021', 610), ('04apr2021', 610), ('13apr2021', 610)]

21mar2021
         censuscode  pc11_state_id  total_cases  total_deaths
387              1              1       5735.0          97.0
798              2              1       8042.0         120.0
1209             3              1       7763.0          86.0
1620             4              1       2145.0          44.0
2031             5              1       2535.0          24.0
...            ...            ...          ...           ...
261372         635             34      31833.0         549.0
261783         636             34       2285.0          10.0
262194         637             34       4138.0          71.0
263016         639             35          0.0           0.0
263427         640             35          0.0           0.0

[610 rows x 4 columns]

28mar2021
         censuscode  pc11_state_id  total_cases  total_deaths
394              1              1       5791.0

In [100]:
# This loop checks if the missing values are constant.
unique = []
for key, item in datedict.items():
    unique.append(missingcheck(datedict[key]['censuscode'], 641))
    
unique[0] == unique[1] == unique[2]
print(unique[1])

30 values are missing.
30 values are missing.
30 values are missing.
30 values are missing.


True

[95, 241, 242, 243, 244, 272, 273, 274, 275, 276, 279, 280, 300, 303, 305, 306, 308, 309, 310, 311, 314, 315, 317, 319, 320, 324, 325, 326, 586, 638]


In [101]:
framedict = {"Deaths" : deaths_frame, 
             #"Vaccination" : vacc_frame,
             "Demographics" : demog_frame, 
             "Age" : age_frame,
             "Hospitals" : hosp_frame,
             "Names" : names_frame}

In [102]:
datedict.keys()

dict_keys(['21mar2021', '28mar2021', '04apr2021', '13apr2021'])

In [103]:
steps = np.arange(0,90,5)

steps_frame = pd.DataFrame()
for i in steps:
    y ='age_'+str(i)+'_t_share'
    steps_frame[y] = age_frame[y]*i
    
steps_frame['total'] = steps_frame.sum(axis=1)
avg = steps_frame['total']/steps_frame.sum().max()


In [104]:
df = age_frame
df = df.rename(columns={"pc11_district_id": "censuscode"})
df['average_age'] = steps_frame['total']

missing = missingcheck(df['censuscode'], 641)
print(missing)

age_frame = df
age_frame

23 values are missing.
[496, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 634, 635, 636, 637, 638, 639, 640]


Unnamed: 0,pc11_state_id,censuscode,sector_present,pc11_pca_tot_u,pc11_pca_tot_r,pc11_pca_tot_t,age_0_r_share,age_5_r_share,age_10_r_share,age_15_r_share,...,age_75_t_share,age_80_r,age_80_u,age_80_t,age_80_t_share,age_85_r,age_85_u,age_85_t,age_85_t_share,average_age
0,1,1,3 urban and rural,104729,765625,870354,0.114430,0.137691,0.127619,0.105311,...,0.005038,4110,578,4688,0.005386,2505,281,2786,0.003201,23.130876
1,1,2,3 urban and rural,97912,655833,753745,0.124514,0.145426,0.119529,0.102073,...,0.005164,2797,379,3176,0.004214,1461,204,1665,0.002209,22.392261
2,1,3,3 urban and rural,45671,87816,133487,0.068943,0.086203,0.090310,0.093675,...,0.012121,820,258,1078,0.008076,490,285,775,0.005806,29.451669
3,1,4,3 urban and rural,16338,124464,140802,0.083229,0.118205,0.112861,0.111868,...,0.006605,458,66,524,0.003722,376,62,438,0.003111,24.916550
4,1,5,3 urban and rural,38630,438205,476835,0.098316,0.138034,0.130590,0.105982,...,0.005492,2786,282,3068,0.006434,1989,149,2138,0.004484,23.933174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,33,629,3 urban and rural,1539802,330572,1870374,0.052769,0.070436,0.079356,0.080769,...,0.013420,2413,12239,14652,0.007834,1421,7495,8916,0.004767,32.183590
613,33,630,3 urban and rural,260912,1245931,1506843,0.060513,0.083318,0.092509,0.094854,...,0.007499,6636,1590,8226,0.005459,3620,891,4511,0.002994,28.769908
614,33,631,3 urban and rural,428363,1451446,1879809,0.064731,0.084366,0.094125,0.096558,...,0.007113,8766,2031,10797,0.005744,4866,1235,6101,0.003246,28.080297
615,33,632,3 urban and rural,2618940,839105,3458045,0.045554,0.061322,0.073881,0.077948,...,0.012255,7526,19180,26706,0.007723,4394,12390,16784,0.004854,32.493332


## Doing the Merge of Merges

This section merges all the frames into one frame with data that we are interested in.

In [105]:
interesting_frame


Unnamed: 0,label,dataset,code,folder,remarks
0,Literacy,pc11_demographics_subdistrict,pc11_pca_p_lit,demography,
1,population density,pc11_demographics_subdistrict,pc11_pop_dens,demography,
2,slum population,pc11_demographics_subdistrict,pc11_slum_pop,demography,only for urban
3,Urban population share,pc11_demographics_subdistrict,pc11_urb_share,demography,
4,Beds (IC),dlhs4_hospitals_dist_pc11,dlhs4_dh_icu_beds,hospitals,Eerste 22 districten missen
5,Beds (total),dlhs4_hospitals_dist_pc11,dlhs4_total_beds,hospitals,Eerste 22 districten missen
6,Care facilities (hospitals/clinics),dlhs4_hospitals_dist_pc11,dlhs4_total_facilities,hospitals,Eerste 22 districten missen
7,Average age per district,age_bins_district_t_pc11,age_i_t [for i in all columns],demography,
8,covid infections,covid_infected_deaths,total_cases,covid,
9,covid deaths,covid_infected_deaths,total_deaths,covid,


In [106]:
# First we have to set the frame up somehow

# We are setting up champions frame using the codes from interesting_frame
mergy =  names_frame[['censuscode','pc11_district_name','pc11_state_name','pc11_state_id']]
mergy


Unnamed: 0,censuscode,pc11_district_name,pc11_state_name,pc11_state_id
0,1.0,kupwara,jammu kashmir,1.0
370,2.0,badgam,jammu kashmir,1.0
845,3.0,leh ladakh,jammu kashmir,1.0
958,4.0,kargil,jammu kashmir,1.0
1085,5.0,punch,jammu kashmir,1.0
...,...,...,...,...
632672,636.0,mahe,puducherry,34.0
632639,637.0,karaikal,puducherry,34.0
632674,638.0,nicobars,andaman nicobar islands,35.0
632870,639.0,north middle andaman,andaman nicobar islands,35.0


In [107]:
# This section takes the interesting data (hardcoded, sorry)
# These are the first 8 interesting variables.

#This makes a frame that merges everythng the labels. The if block prevents double running.
if 'total_cases' not in mergy.keys():
    mergy = pd.merge(mergy, demog_frame[['censuscode','pc11_pca_p_lit','pc11_pop_dens','pc11_slum_pop','pc11_urb_share','pc11_pca_tot_p','pc11_tot_area']], on='censuscode', how = 'right')
    mergy = pd.merge(mergy, hosp_frame[['censuscode','dlhs4_dh_icu_beds','dlhs4_total_beds','dlhs4_total_facilities']], on='censuscode', how = 'right')
    mergy = pd.merge(mergy, age_frame[['censuscode','average_age']], on='censuscode', how = 'left')

    datedict_vars = datedict[str(list(datedict.keys())[-1])][['censuscode','total_cases','total_deaths']]
    mergy = pd.merge(mergy, datedict_vars, on = 'censuscode', how = 'left')
mergy


Unnamed: 0,censuscode,pc11_district_name,pc11_state_name,pc11_state_id,pc11_pca_p_lit,pc11_pop_dens,pc11_slum_pop,pc11_urb_share,pc11_pca_tot_p,pc11_tot_area,dlhs4_dh_icu_beds,dlhs4_total_beds,dlhs4_total_facilities,average_age,total_cases,total_deaths
0,1.0,kupwara,jammu kashmir,1.0,439654.0,1312.16980,20475.000000,0.120329,870354.0,663.293700,,,,23.130876,6057.0,98.0
1,2.0,badgam,jammu kashmir,1.0,335649.0,1051.99630,37610.903809,0.129901,753745.0,716.490110,,,,22.392261,8678.0,123.0
2,3.0,leh ladakh,jammu kashmir,1.0,93770.0,338.88980,7763.000000,0.342138,133487.0,393.894990,,,,29.451669,8705.0,87.0
3,4.0,kargil,jammu kashmir,1.0,86236.0,745.93884,0.000000,0.116035,140802.0,188.758100,,,,24.916550,2200.0,44.0
4,5.0,punch,jammu kashmir,1.0,261724.0,414.76535,0.000000,0.081013,476835.0,1149.650000,,,,23.933174,2589.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,636.0,mahe,puducherry,34.0,36470.0,4646.22220,4059.000000,1.000000,41816.0,9.000000,21.0,215.88113,16.881138,,2448.0,12.0
636,637.0,karaikal,puducherry,34.0,154916.0,1251.77480,26890.000000,0.489966,200222.0,159.950500,,151.35823,23.850254,,5231.0,80.0
637,638.0,nicobars,andaman nicobar islands,35.0,25332.0,527.01520,0.000000,0.000000,36842.0,69.906898,0.0,218.62912,12.533057,,,
638,639.0,north middle andaman,andaman nicobar islands,35.0,78683.0,258.90808,0.000000,0.025957,105597.0,407.855190,0.0,444.58643,20.856722,,0.0,0.0


In [108]:
champion = pd.DataFrame()
champion.keys()

#This section translates the labels. The if block prevents double running.
if 'Care facilities (hospitals/clinics)' not in champion.keys():
    labeldict ={'pc11_district_name':'Districtname',
                'pc11_state_id': 'State id',
                'pc11_pca_tot_p': 'population',
                'pc11_pca_p_lit': 'Literacy',
                'pc11_pop_dens': 'population density',
                'pc11_slum_pop':'slum population',
                'pc11_urb_share':'Urban population share',
                'dlhs4_dh_icu_beds':'Beds (IC)',
                'dlhs4_total_beds':'Beds (total)',
                'dlhs4_total_facilities':'Care facilities (hospitals/clinics)'}

champion = mergy.rename(columns=labeldict)

champion.head()

Index([], dtype='object')

Unnamed: 0,censuscode,Districtname,pc11_state_name,State id,Literacy,population density,slum population,Urban population share,population,pc11_tot_area,Beds (IC),Beds (total),Care facilities (hospitals/clinics),average_age,total_cases,total_deaths
0,1.0,kupwara,jammu kashmir,1.0,439654.0,1312.1698,20475.0,0.120329,870354.0,663.2937,,,,23.130876,6057.0,98.0
1,2.0,badgam,jammu kashmir,1.0,335649.0,1051.9963,37610.903809,0.129901,753745.0,716.49011,,,,22.392261,8678.0,123.0
2,3.0,leh ladakh,jammu kashmir,1.0,93770.0,338.8898,7763.0,0.342138,133487.0,393.89499,,,,29.451669,8705.0,87.0
3,4.0,kargil,jammu kashmir,1.0,86236.0,745.93884,0.0,0.116035,140802.0,188.7581,,,,24.91655,2200.0,44.0
4,5.0,punch,jammu kashmir,1.0,261724.0,414.76535,0.0,0.081013,476835.0,1149.65,,,,23.933174,2589.0,27.0


## Dropping NaN

For clustering, columns cannot have NaN values

In [109]:
columns_to_plot = list(champion.keys())
columns_to_plot

['censuscode',
 'Districtname',
 'pc11_state_name',
 'State id',
 'Literacy',
 'population density',
 'slum population',
 'Urban population share',
 'population',
 'pc11_tot_area',
 'Beds (IC)',
 'Beds (total)',
 'Care facilities (hospitals/clinics)',
 'average_age',
 'total_cases',
 'total_deaths']

In [110]:
# Checking number of nans per column
champion[columns_to_plot].isna().sum()

censuscode                              0
Districtname                            0
pc11_state_name                        18
State id                               18
Literacy                                0
population density                     23
slum population                         0
Urban population share                  0
population                              0
pc11_tot_area                           0
Beds (IC)                              99
Beds (total)                           88
Care facilities (hospitals/clinics)    88
average_age                            23
total_cases                            30
total_deaths                           30
dtype: int64

In [111]:
# Our main interest is the urbanisation rate, if we want to analyse that we'll have to drop the NaNs of that column
# This paper explains that imputing is not a valid option when clustering. this paper suggests other algorithms to replace the missing data but that
# is too advanced
# http://www.litech.org/~wkiri/Papers/wagstaff-missing-ifcs04.pdf
champion = champion[champion['Urban population share'].notna()]

In [112]:
len(champion)

640

In [113]:
# Checking number of nans per column
champion[columns_to_plot].isna().sum()

censuscode                              0
Districtname                            0
pc11_state_name                        18
State id                               18
Literacy                                0
population density                     23
slum population                         0
Urban population share                  0
population                              0
pc11_tot_area                           0
Beds (IC)                              99
Beds (total)                           88
Care facilities (hospitals/clinics)    88
average_age                            23
total_cases                            30
total_deaths                           30
dtype: int64

NOTE : Literacy and slum population now also do not have any missing values

In [114]:
champion.sort_values("population density", ascending = False).head(10)

Unnamed: 0,censuscode,Districtname,pc11_state_name,State id,Literacy,population density,slum population,Urban population share,population,pc11_tot_area,Beds (IC),Beds (total),Care facilities (hospitals/clinics),average_age,total_cases,total_deaths
244,245.0,tawang,arunachal pradesh,12.0,26073.0,4997700.0,0.0,0.224143,49977.0,0.01,0.0,132.63657,17.106094,24.420934,492.0,2.0
279,280.0,chandel,manipur,14.0,90302.0,144182.0,0.0,0.116845,144182.0,1.0,0.0,105.09125,8.23064,25.613669,,
92,93.0,delhi (east),,,1352998.0,134998.11,196756.5,0.997935,1709346.0,12.662,,,,29.417171,0.0,0.0
91,92.0,delhi (north east),,,1611588.0,61283.395,173453.9,0.990397,2241624.0,36.577999,,,,26.856877,0.0,0.0
298,299.0,jaintia hills,meghalaya,17.0,187527.0,49390.5,19310.0,0.071952,395124.0,8.0,4.0,401.34689,18.687963,19.960582,1126.0,1.0
9,10.0,srinagar,jammu kashmir,1.0,748584.0,38791.523,337564.1,0.986002,1236829.0,31.884001,,,,27.089513,32638.0,480.0
95,96.0,delhi (west),,,1961179.0,37681.137,275427.3,0.997476,2543243.0,67.493797,,,,30.087117,0.0,0.0
90,91.0,delhi (north),,,681517.0,36073.207,88075.0,0.980015,887978.0,24.615999,,,,29.115772,0.0,0.0
292,293.0,west garo hills,meghalaya,17.0,358702.0,35114.137,3741.0,0.116367,643291.0,18.32,,352.54468,24.504061,22.639769,1947.0,19.0
602,603.0,chennai,tamil nadu,33.0,3776276.0,26552.754,1342337.0,1.0,4646732.0,175.0,,,,31.406187,269614.0,4334.0


In [115]:
champion.sort_values("population", ascending = False).head(10)

Unnamed: 0,censuscode,Districtname,pc11_state_name,State id,Literacy,population density,slum population,Urban population share,population,pc11_tot_area,Beds (IC),Beds (total),Care facilities (hospitals/clinics),average_age,total_cases,total_deaths
516,517.0,thane,maharashtra,27.0,8227161.0,1088.5101,1331573.0,0.769852,11060148.0,10160.813,21.0,2713.8865,305.85144,28.025309,485142.0,7414.0
336,337.0,north twenty four parganas,west bengal,19.0,7608693.0,2519.7971,1428654.0,0.572656,10009781.0,3972.4551,0.0,2979.7153,161.08615,29.217597,134710.0,2560.0
571,572.0,bangalore,,,7512276.0,4360.916,722290.0,0.909411,9621551.0,2206.3142,43.0,2885.8208,357.73651,28.645265,493869.0,4910.0
520,521.0,pune,maharashtra,27.0,7171723.0,570.39642,865248.0,0.60992,9429408.0,16531.324,22.0,2043.0588,231.66635,28.464921,671426.0,8629.0
517,518.0,unknown,,,7575485.0,15517.35,3915392.0,1.0,9356962.0,603.0,,,,28.842874,0.0,0.0
342,343.0,south twenty four parganas,west bengal,19.0,5531657.0,1477.693,332347.0,0.255793,8161961.0,5523.4482,6.0,2984.3682,127.26441,27.174307,40238.0,719.0
334,335.0,barddhaman,west bengal,19.0,5247208.0,1098.8033,675615.0,0.398869,7717563.0,7023.6069,42.0,3361.0564,253.84354,28.738065,32240.0,279.0
473,474.0,ahmadabad,,,5435760.0,912.97119,257181.0,0.840429,7214225.0,7901.9194,,,,29.193032,87181.566949,2489.818663
332,333.0,murshidabad,west bengal,19.0,4055834.0,1327.5674,269021.0,0.197175,7103807.0,5350.9956,99.0,2278.4355,139.15341,25.780712,13552.0,152.0
109,110.0,jaipur,rajasthan,8.0,4300965.0,597.99817,323962.0,0.523959,6626178.0,11080.599,10.0,2502.2129,249.76062,25.610464,68918.0,540.0


In [116]:
#new dehli verdelen over zijn pop zowel deaths als cases

#indices are 89:97
champion.iloc[89:98]

#values found (hardcoded, value was only registering city centre)
total_cases_nd = 750156.0
total_deaths_nd = 11436.0


champion.iloc[89:98]['total_cases'] = total_cases_nd
champion.iloc[89:98]['total_deaths'] = total_deaths_nd
champion.iloc[89:98]['pc11_state_name'] = 'NCT of Delhi'
champion.iloc[89:98]['pc11_state_id'] = 7.0

champion.iloc[89:98]


Unnamed: 0,censuscode,Districtname,pc11_state_name,State id,Literacy,population density,slum population,Urban population share,population,pc11_tot_area,Beds (IC),Beds (total),Care facilities (hospitals/clinics),average_age,total_cases,total_deaths
89,90.0,delhi (north west),,,2707855.0,10354.479,376234.050781,0.941488,3656539.0,353.13599,,,,28.811548,0.0,0.0
90,91.0,delhi (north),,,681517.0,36073.207,88075.000977,0.980015,887978.0,24.615999,,,,29.115772,0.0,0.0
91,92.0,delhi (north east),,,1611588.0,61283.395,173453.886719,0.990397,2241624.0,36.577999,,,,26.856877,0.0,0.0
92,93.0,delhi (east),,,1352998.0,134998.11,196756.546875,0.997935,1709346.0,12.662,,,,29.417171,0.0,0.0
93,94.0,delhi (new delhi),,,114179.0,,11215.94751,1.0,142004.0,0.0,,,,29.946973,750156.0,11436.0
94,95.0,delhi (central),,,442360.0,,85146.181107,1.0,582320.0,0.0,,,,29.089092,,
95,96.0,delhi (west),,,1961179.0,37681.137,275427.277344,0.997476,2543243.0,67.493797,,,,30.087117,0.0,0.0
96,97.0,delhi (south west),,,1787689.0,8243.7803,259683.193726,0.93734,2292958.0,278.14401,,,,27.868524,0.0,0.0
97,98.0,delhi (north east),,,2078402.0,20443.797,319397.919907,0.995537,2731929.0,133.6312,,,,27.756417,0.0,0.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  champion.iloc[89:98]['total_cases'] = total_cases_nd
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  champion.iloc[89:98]['total_deaths'] = total_deaths_nd
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  champion.iloc[89:98]['pc11_state_name'] = 'NCT of Delhi'
A value is trying to be set on a copy of

Unnamed: 0,censuscode,Districtname,pc11_state_name,State id,Literacy,population density,slum population,Urban population share,population,pc11_tot_area,Beds (IC),Beds (total),Care facilities (hospitals/clinics),average_age,total_cases,total_deaths
89,90.0,delhi (north west),NCT of Delhi,,2707855.0,10354.479,376234.050781,0.941488,3656539.0,353.13599,,,,28.811548,750156.0,11436.0
90,91.0,delhi (north),NCT of Delhi,,681517.0,36073.207,88075.000977,0.980015,887978.0,24.615999,,,,29.115772,750156.0,11436.0
91,92.0,delhi (north east),NCT of Delhi,,1611588.0,61283.395,173453.886719,0.990397,2241624.0,36.577999,,,,26.856877,750156.0,11436.0
92,93.0,delhi (east),NCT of Delhi,,1352998.0,134998.11,196756.546875,0.997935,1709346.0,12.662,,,,29.417171,750156.0,11436.0
93,94.0,delhi (new delhi),NCT of Delhi,,114179.0,,11215.94751,1.0,142004.0,0.0,,,,29.946973,750156.0,11436.0
94,95.0,delhi (central),NCT of Delhi,,442360.0,,85146.181107,1.0,582320.0,0.0,,,,29.089092,750156.0,11436.0
95,96.0,delhi (west),NCT of Delhi,,1961179.0,37681.137,275427.277344,0.997476,2543243.0,67.493797,,,,30.087117,750156.0,11436.0
96,97.0,delhi (south west),NCT of Delhi,,1787689.0,8243.7803,259683.193726,0.93734,2292958.0,278.14401,,,,27.868524,750156.0,11436.0
97,98.0,delhi (north east),NCT of Delhi,,2078402.0,20443.797,319397.919907,0.995537,2731929.0,133.6312,,,,27.756417,750156.0,11436.0


In [117]:
champion['deaths per 100.000'] = champion['total_deaths']/champion['population']*1e5
champion['cases per 100.000'] = champion['total_cases']/champion['population']*1e5

champion.sort_values(by='censuscode')
champion

Unnamed: 0,censuscode,Districtname,pc11_state_name,State id,Literacy,population density,slum population,Urban population share,population,pc11_tot_area,Beds (IC),Beds (total),Care facilities (hospitals/clinics),average_age,total_cases,total_deaths,deaths per 100.000,cases per 100.000
0,1.0,kupwara,jammu kashmir,1.0,439654.0,1312.16980,20475.000000,0.120329,870354.0,663.293700,,,,23.130876,6057.0,98.0,11.259786,695.923728
1,2.0,badgam,jammu kashmir,1.0,335649.0,1051.99630,37610.903809,0.129901,753745.0,716.490110,,,,22.392261,8678.0,123.0,16.318516,1151.317753
2,3.0,leh ladakh,jammu kashmir,1.0,93770.0,338.88980,7763.000000,0.342138,133487.0,393.894990,,,,29.451669,8705.0,87.0,65.174886,6521.234277
3,4.0,kargil,jammu kashmir,1.0,86236.0,745.93884,0.000000,0.116035,140802.0,188.758100,,,,24.916550,2200.0,44.0,31.249556,1562.477806
4,5.0,punch,jammu kashmir,1.0,261724.0,414.76535,0.000000,0.081013,476835.0,1149.650000,,,,23.933174,2589.0,27.0,5.662336,542.955110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,636.0,mahe,puducherry,34.0,36470.0,4646.22220,4059.000000,1.000000,41816.0,9.000000,21.0,215.88113,16.881138,,2448.0,12.0,28.697149,5854.218481
636,637.0,karaikal,puducherry,34.0,154916.0,1251.77480,26890.000000,0.489966,200222.0,159.950500,,151.35823,23.850254,,5231.0,80.0,39.955649,2612.600014
637,638.0,nicobars,andaman nicobar islands,35.0,25332.0,527.01520,0.000000,0.000000,36842.0,69.906898,0.0,218.62912,12.533057,,,,,
638,639.0,north middle andaman,andaman nicobar islands,35.0,78683.0,258.90808,0.000000,0.025957,105597.0,407.855190,0.0,444.58643,20.856722,,0.0,0.0,0.000000,0.000000


Unnamed: 0,censuscode,Districtname,pc11_state_name,State id,Literacy,population density,slum population,Urban population share,population,pc11_tot_area,Beds (IC),Beds (total),Care facilities (hospitals/clinics),average_age,total_cases,total_deaths,deaths per 100.000,cases per 100.000
0,1.0,kupwara,jammu kashmir,1.0,439654.0,1312.16980,20475.000000,0.120329,870354.0,663.293700,,,,23.130876,6057.0,98.0,11.259786,695.923728
1,2.0,badgam,jammu kashmir,1.0,335649.0,1051.99630,37610.903809,0.129901,753745.0,716.490110,,,,22.392261,8678.0,123.0,16.318516,1151.317753
2,3.0,leh ladakh,jammu kashmir,1.0,93770.0,338.88980,7763.000000,0.342138,133487.0,393.894990,,,,29.451669,8705.0,87.0,65.174886,6521.234277
3,4.0,kargil,jammu kashmir,1.0,86236.0,745.93884,0.000000,0.116035,140802.0,188.758100,,,,24.916550,2200.0,44.0,31.249556,1562.477806
4,5.0,punch,jammu kashmir,1.0,261724.0,414.76535,0.000000,0.081013,476835.0,1149.650000,,,,23.933174,2589.0,27.0,5.662336,542.955110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,636.0,mahe,puducherry,34.0,36470.0,4646.22220,4059.000000,1.000000,41816.0,9.000000,21.0,215.88113,16.881138,,2448.0,12.0,28.697149,5854.218481
636,637.0,karaikal,puducherry,34.0,154916.0,1251.77480,26890.000000,0.489966,200222.0,159.950500,,151.35823,23.850254,,5231.0,80.0,39.955649,2612.600014
637,638.0,nicobars,andaman nicobar islands,35.0,25332.0,527.01520,0.000000,0.000000,36842.0,69.906898,0.0,218.62912,12.533057,,,,,
638,639.0,north middle andaman,andaman nicobar islands,35.0,78683.0,258.90808,0.000000,0.025957,105597.0,407.855190,0.0,444.58643,20.856722,,0.0,0.0,0.000000,0.000000


In [118]:
## TO DO

In [119]:
champion.sort_values(by= 'population density', ascending = False)

Unnamed: 0,censuscode,Districtname,pc11_state_name,State id,Literacy,population density,slum population,Urban population share,population,pc11_tot_area,Beds (IC),Beds (total),Care facilities (hospitals/clinics),average_age,total_cases,total_deaths,deaths per 100.000,cases per 100.000
244,245.0,tawang,arunachal pradesh,12.0,26073.0,4997700.000,0.000000e+00,0.224143,49977.0,0.010000,0.0,132.63657,17.106094,24.420934,492.0,2.0,4.001841,984.452848
279,280.0,chandel,manipur,14.0,90302.0,144182.000,0.000000e+00,0.116845,144182.0,1.000000,0.0,105.09125,8.230640,25.613669,,,,
92,93.0,delhi (east),NCT of Delhi,,1352998.0,134998.110,1.967565e+05,0.997935,1709346.0,12.662000,,,,29.417171,750156.0,11436.0,669.027804,43885.556230
91,92.0,delhi (north east),NCT of Delhi,,1611588.0,61283.395,1.734539e+05,0.990397,2241624.0,36.577999,,,,26.856877,750156.0,11436.0,510.165844,33464.845130
298,299.0,jaintia hills,meghalaya,17.0,187527.0,49390.500,1.931000e+04,0.071952,395124.0,8.000000,4.0,401.34689,18.687963,19.960582,1126.0,1.0,0.253085,284.973831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,273.0,tamenglong,manipur,14.0,85006.0,,0.000000e+00,0.137667,140651.0,0.000000,0.0,171.16982,10.925348,24.786920,,,,
273,274.0,churachandpur,manipur,14.0,195935.0,,0.000000e+00,0.066961,274143.0,0.000000,0.0,351.64264,45.753204,26.207362,,,,
278,279.0,ukhrul,manipur,14.0,129829.0,,0.000000e+00,0.147757,183998.0,0.000000,0.0,127.78529,11.248764,25.695741,,,,
286,287.0,lawngtlai,mizoram,15.0,62861.0,,0.000000e+00,0.176684,117894.0,0.000000,0.0,176.95961,10.397307,22.408689,345.0,0.0,0.000000,292.635758


In [120]:
#names_frame 549 slum pop is raar

champion.iloc[549][['Districtname','population','slum population']]
champion['population density2'] = champion['population']/champion['pc11_tot_area']


Districtname       sri potti sriramulu nellore
population                           2963557.0
slum population                  283674.000565
Name: 549, dtype: object

In [121]:

missingarea = champion[champion['pc11_tot_area'] == 0]
missingarea
#there is still a lot of areas missing from athe data, we should fill this manually?

# we drop everything with an area smaller than 9, because that is the smallest area. The rest is just wrong.
champion = champion[champion['pc11_tot_area'] < 9]
champion = champion[champion['population density'] < 30000]
champion.sort_values(by= 'population density2', ascending = False)

Unnamed: 0,censuscode,Districtname,pc11_state_name,State id,Literacy,population density,slum population,Urban population share,population,pc11_tot_area,Beds (IC),Beds (total),Care facilities (hospitals/clinics),average_age,total_cases,total_deaths,deaths per 100.000,cases per 100.000,population density2
93,94.0,delhi (new delhi),NCT of Delhi,,114179.0,,1.121595e+04,1.000000,142004.0,0.0,,,,29.946973,750156.0,11436.0,8053.294273,528263.992564,inf
94,95.0,delhi (central),NCT of Delhi,,442360.0,,8.514618e+04,1.000000,582320.0,0.0,,,,29.089092,750156.0,11436.0,1963.868663,128821.953565,inf
246,247.0,east kameng,arunachal pradesh,12.0,38449.0,,1.124000e+03,0.233194,78690.0,0.0,0.0,215.21883,28.068426,21.260834,436.0,0.0,0.000000,554.072944,inf
248,249.0,upper subansiri,arunachal pradesh,12.0,45278.0,,0.000000e+00,0.160639,83448.0,0.0,0.0,157.24138,37.551723,22.230371,443.0,2.0,2.396702,530.869524,inf
251,252.0,upper siang,arunachal pradesh,12.0,18195.0,,0.000000e+00,0.185164,35320.0,0.0,0.0,135.67581,17.122875,26.313845,361.0,0.0,0.000000,1022.083805,inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,273.0,tamenglong,manipur,14.0,85006.0,,0.000000e+00,0.137667,140651.0,0.0,0.0,171.16982,10.925348,24.786920,,,,,inf
273,274.0,churachandpur,manipur,14.0,195935.0,,0.000000e+00,0.066961,274143.0,0.0,0.0,351.64264,45.753204,26.207362,,,,,inf
278,279.0,ukhrul,manipur,14.0,129829.0,,0.000000e+00,0.147757,183998.0,0.0,0.0,127.78529,11.248764,25.695741,,,,,inf
286,287.0,lawngtlai,mizoram,15.0,62861.0,,0.000000e+00,0.176684,117894.0,0.0,0.0,176.95961,10.397307,22.408689,345.0,0.0,0.000000,292.635758,inf


Unnamed: 0,censuscode,Districtname,pc11_state_name,State id,Literacy,population density,slum population,Urban population share,population,pc11_tot_area,Beds (IC),Beds (total),Care facilities (hospitals/clinics),average_age,total_cases,total_deaths,deaths per 100.000,cases per 100.000,population density2
294,295.0,south garo hills,meghalaya,17.0,82062.0,16174.317,7253.0,0.092255,142334.0,8.8,0.0,239.25966,8.536253,21.649079,198.0,0.0,0.0,139.109419,16174.317814


In [122]:

#density 563 population density

champion.iloc[563][['Districtname','population','population density']]


IndexError: single positional indexer is out-of-bounds

In [None]:
champion