# IDMC geolocation
Script to extract sub-national information out of IDMC entries

In [1]:
# Import modules
import geopandas as gpd
import pandas as pd
import numpy as np
import sys
import os
import re
from difflib import SequenceMatcher
from rasterstats import zonal_stats
import unidecode

# Make paths

def mkdir(dir):
    """Create the directory ``dir`` if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Parameters
    ----------
    dir : str
        Path of the directory to create.

    Returns
    -------
    None
    """
    # makedirs(exist_ok=True) replaces the exists()-then-mkdir() pair: it has
    # no window between the check and the creation, and it does not fail when
    # an intermediate directory is missing.
    os.makedirs(dir, exist_ok=True)
        
# Directory layout, relative to the current working directory:
#   data/                            inputs read by this notebook
#   data_processed/IDMC_geolocation/ outputs written by this notebook
path_run = f'{os.getcwd()}/'
path_data = f'{path_run}data/'
path_data_processed = f'{path_run}data_processed/'
path_IDMC_geolocation = f'{path_data_processed}IDMC_geolocation/'

# mkdir() has no return value, so the *_creator names are always None; they
# are kept only because the directories are created as a side effect here.
path_data_creator = mkdir(path_data)
path_data_processed_creator = mkdir(path_data_processed)
path_IDMC_geolocation_creator = mkdir(path_IDMC_geolocation)

# Define decoding which converts diacritics to letters of the alphabet for Modern English (transliteration process)

def decoding(x):
    """Transliterate ``x`` with ``unidecode`` when it is a string.

    Parameters
    ----------
    x : object
        Value to transliterate. Anything that is not a ``str`` (for example
        ``None`` or NaN standing in for a missing name) is passed through.

    Returns
    -------
    object
        The transliterated string, or ``x`` itself when it is not a string.
    """
    # An explicit type check replaces the former bare ``except: pass``, which
    # also hid unrelated failures such as a missing ``unidecode`` module.
    if isinstance(x, str):
        return unidecode.unidecode(x)
    return x

# Word similarity threshold

def similar(a, b):
    """Return the ``difflib.SequenceMatcher`` ratio between ``a`` and ``b``."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()


# A word counts as a fuzzy match when its ratio is strictly above this value.
world_sim_threshold = 0.8

In [None]:
# Extract subnational information out of IDMC entries, and assign the
# corresponding GADM provinces and districts.
# Load the GADM layers (levels 0 to 3). Each attribute table is turned into a
# plain DataFrame and its name columns are passed through decoding().
countries = gpd.read_file(path_data + 'GADM/gadm36_0.shp')

# Level 1: provinces
provinces = pd.DataFrame(gpd.read_file(path_data + 'GADM/gadm36_1.shp'))
for name_column in ('NAME_1', 'VARNAME_1'):
    provinces[name_column] = provinces[name_column].apply(decoding)
provinces_list = provinces['NAME_1']
provinces_list_alt = provinces['VARNAME_1']

# Level 2: districts
districts = pd.DataFrame(gpd.read_file(path_data + 'GADM/gadm36_2.shp'))
for name_column in ('NAME_1', 'NAME_2', 'VARNAME_2'):
    districts[name_column] = districts[name_column].apply(decoding)
districts_list = districts['NAME_2']
districts_list_alt = districts['VARNAME_2']
districts_list_name_1 = districts['NAME_1']
districts_list_alt_2 = []

# Level 3: subdistricts
subdistricts = pd.DataFrame(gpd.read_file(path_data + 'GADM/gadm36_3.shp'))
for name_column in ('NAME_1', 'NAME_2', 'NAME_3', 'VARNAME_3'):
    subdistricts[name_column] = subdistricts[name_column].apply(decoding)
subdistricts_list = subdistricts['NAME_3']
subdistricts_list_alt = subdistricts['VARNAME_3']
subdistricts_list_name_1 = subdistricts['NAME_1']
subdistricts_list_name_2 = subdistricts['NAME_2']
subdistricts_list_alt_3 = []

# Compute column with alternative names
def _split_alt_names(raw_names):
    """Parse '|'-separated alternative names into lists of '-'-split parts.

    Parameters
    ----------
    raw_names : iterable
        One entry per region: a string such as ``'A-B|C'``, or ``None``/NaN
        when the region has no alternative names.

    Returns
    -------
    list
        One element per input entry, in the same order: a list of part-lists
        (``[['A', 'B'], ['C']]``), or ``''`` when no usable alternative name
        is present.
    """
    parsed = []

    for raw in raw_names:

        if not isinstance(raw, str):
            # None or NaN: nothing recorded for this region
            parsed.append('')
            continue

        groups = []
        for alternative in raw.split('|'):
            parts = alternative.split('-')
            # The matching cells re-join the parts with '' and test the result
            # as a substring of the event name. A blank alternative would then
            # match every event, so it is dropped here.
            if ''.join(parts).strip():
                groups.append(parts)

        parsed.append(groups if groups else '')

    return parsed


provinces_list_alt_2 = _split_alt_names(provinces_list_alt)
provinces['VARNAME_1_2'] = provinces_list_alt_2

districts_list_alt_2 = _split_alt_names(districts_list_alt)
districts['VARNAME_2_2'] = districts_list_alt_2

subdistricts_list_alt_3 = _split_alt_names(subdistricts_list_alt)
subdistricts['VARNAME_3_3'] = subdistricts_list_alt_3

In [2]:
# Word similarity threshold test
# Hand-computed example of the ratio 2 * matches / total length:
# 10 matching characters, 11 + 12 = 23 characters in total.
print("word similarity score: similar('Tamanrasset','Tamanghasset')")
hand_computed_ratio = 2 * 10 / 23
print("2*10/23:", hand_computed_ratio)

word similarity score: similar('Tamanrasset','Tamanghasset')
2*10/23: 0.8695652173913043


In [None]:
# Prepare disaster df
# Data source: https://www.internal-displacement.org/database/displacement-data (downloaded: 2022-10-17)
# Remove 2nd row and save as .csv
disaster = pd.read_csv(path_data + 'IDMC/IDMC_Internal_Displacement_Disasters_Events_2008_2021.csv')

# Result columns that later receive strings. They start as NaN but with object
# dtype, so writing text into them does not depend on pandas silently
# upcasting a float64 column (deprecated in recent pandas releases).
for text_column in ('countries_forUKonly', 'provinces', 'GID_1',
                    'districts', 'GID_2', 'country_only_info'):
    disaster[text_column] = pd.Series(np.nan, index=disaster.index, dtype=object)

# Numeric result columns
disaster['num_provinces'] = np.nan
disaster['num_districts'] = np.nan

disaster = disaster.rename(columns={"Disaster New Displacements": "displacements",
                                    "Event Name": "event_name"})

# Transliterate the event names the same way as the GADM names
disaster['event_name'] = disaster['event_name'].apply(decoding)

# The matching cells address rows by the labels 0..n-1
disaster = disaster.reset_index(drop=True)

# Countries (important for UK)

In [None]:
# UK countries using GADM provinces
# For GBR rows, every GADM level-1 name that occurs verbatim in the event name
# is stored, comma-separated, in 'countries_forUKonly'.

# The GBR subset does not depend on the event, so it is built once
uk_provinces = provinces[provinces['GID_0'] == 'GBR']
uk_province_names = [name for name in uk_provinces['NAME_1']
                     if isinstance(name, str) and name]

for i in range(len(disaster)):
    country = disaster['ISO3'][i]
    text = disaster['event_name'][i]

    if country != 'GBR':
        continue

    if pd.isnull(text):  # empty event_name: nothing to match
        matched_names = []
    else:
        matched_names = [name for name in uk_province_names if name in text]

    # dict.fromkeys removes duplicates while keeping first-seen order.
    # An event without any match is stored as '' (as before).
    disaster.loc[i, 'countries_forUKonly'] = ','.join(dict.fromkeys(matched_names))

# Provinces

In [None]:
# Match GADM provinces (level 1) against the event name of every non-UK event.
# A province matches when its name or one of its alternative names
#   - occurs verbatim in the event name, or
#   - has a similarity above world_sim_threshold with any single word of it.
# Matches are written comma-separated to 'provinces' and 'GID_1'.
for i in range(len(disaster)):
    country = disaster['ISO3'][i]
    text = disaster['event_name'][i]

    if country == 'GBR':  # UK rows are handled by the dedicated UK cells
        continue

    matched_names = []
    matched_GID_1 = []

    if pd.isnull(text):
        # No event name to search: only the country is known
        disaster.loc[i, 'country_only_info'] = country

    else:
        # Raw string: '\W' in a normal string literal is an invalid escape.
        # The word list depends only on the event, so it is built once per
        # event instead of once per province.
        word_list = [word for word in re.split(r'\W+', text) if word]
        provinces_temp = provinces[provinces['GID_0'] == country]

        for name, gid, alt_groups in zip(provinces_temp['NAME_1'],
                                         provinces_temp['GID_1'],
                                         provinces_temp['VARNAME_1_2']):

            # An empty name is a substring of every text; skip it
            if not isinstance(name, str) or not name:
                continue

            # Candidate spellings: official name plus non-empty alternatives
            candidates = [name]
            if isinstance(alt_groups, list):
                candidates += [alt for alt in (''.join(parts) for parts in alt_groups) if alt]

            found = any(candidate in text for candidate in candidates) or any(
                similar(word, candidate) > world_sim_threshold
                for word in word_list
                for candidate in candidates
            )

            if found:
                matched_names.append(name)
                matched_GID_1.append(gid)

    # NOTE(review): names and GIDs are de-duplicated independently (as
    # before), so provinces sharing a name give fewer names than GIDs.
    disaster.loc[i, 'provinces'] = ','.join(dict.fromkeys(matched_names))
    disaster.loc[i, 'GID_1'] = ','.join(dict.fromkeys(matched_GID_1))

In [None]:
# UK provinces using GADM districts
# For GBR rows the GADM level-2 units are matched against the event name and
# written to the 'provinces' / 'GID_1' columns (the GID values are GID_2 codes).
# A unit matches when its name or one of its alternative names
#   - occurs verbatim in the event name, or
#   - has a similarity above world_sim_threshold with any single word of it.

# The GBR subset does not depend on the event, so it is built once
uk_districts = districts[districts['GID_0'] == 'GBR']

for i in range(len(disaster)):
    country = disaster['ISO3'][i]
    text = disaster['event_name'][i]

    if country != 'GBR':
        continue

    matched_names = []
    matched_GID_2 = []

    if pd.isnull(text):
        # No event name to search: only the country is known
        disaster.loc[i, 'country_only_info'] = country

    else:
        # Raw string: '\W' in a normal string literal is an invalid escape.
        # Built once per event; every check below runs once per region
        # instead of once per word of the event name.
        word_list = [word for word in re.split(r'\W+', text) if word]

        for name, gid, alt_groups in zip(uk_districts['NAME_2'],
                                         uk_districts['GID_2'],
                                         uk_districts['VARNAME_2_2']):

            # Skip missing names and entries that only repeat the country code
            if not isinstance(name, str) or not name or name == country:
                continue

            # Candidate spellings: official name plus non-empty alternatives
            candidates = [name]
            if isinstance(alt_groups, list):
                candidates += [alt for alt in (''.join(parts) for parts in alt_groups) if alt]

            found = any(candidate in text for candidate in candidates) or any(
                similar(word, candidate) > world_sim_threshold
                for word in word_list
                for candidate in candidates
            )

            if found:
                matched_names.append(name)
                matched_GID_2.append(gid)

    # NOTE(review): names and GIDs are de-duplicated independently (as
    # before), so units sharing a name give fewer names than GIDs.
    disaster.loc[i, 'provinces'] = ','.join(dict.fromkeys(matched_names))
    disaster.loc[i, 'GID_1'] = ','.join(dict.fromkeys(matched_GID_2))

# Districts

In [None]:
# Match GADM districts (level 2) against the event name of every non-UK event.
# A district matches when its name or one of its alternative names
#   - occurs verbatim in the event name, or
#   - has a similarity above world_sim_threshold with any single word of it.
# Matches are written comma-separated to 'districts' and 'GID_2'.
for i in range(len(disaster)):
    country = disaster['ISO3'][i]
    text = disaster['event_name'][i]

    if country == 'GBR':  # UK rows are handled by the dedicated UK cells
        continue

    matched_names = []
    matched_GID_2 = []

    if pd.isnull(text):
        # No event name to search: only the country is known
        disaster.loc[i, 'country_only_info'] = country

    else:
        # Raw string: '\W' in a normal string literal is an invalid escape.
        # The word list depends only on the event, so it is built once per
        # event instead of once per district.
        word_list = [word for word in re.split(r'\W+', text) if word]
        districts_temp = districts[districts['GID_0'] == country]

        for name, gid, alt_groups in zip(districts_temp['NAME_2'],
                                         districts_temp['GID_2'],
                                         districts_temp['VARNAME_2_2']):

            # Districts without a name cannot be matched
            if not isinstance(name, str) or not name:
                continue

            # Candidate spellings: official name plus non-empty alternatives
            candidates = [name]
            if isinstance(alt_groups, list):
                candidates += [alt for alt in (''.join(parts) for parts in alt_groups) if alt]

            found = any(candidate in text for candidate in candidates) or any(
                similar(word, candidate) > world_sim_threshold
                for word in word_list
                for candidate in candidates
            )

            if found:
                matched_names.append(name)
                matched_GID_2.append(gid)

    # NOTE(review): names and GIDs are de-duplicated independently (as
    # before), so districts sharing a name give fewer names than GIDs.
    disaster.loc[i, 'districts'] = ','.join(dict.fromkeys(matched_names))
    disaster.loc[i, 'GID_2'] = ','.join(dict.fromkeys(matched_GID_2))

In [None]:
# UK districts using GADM subdistricts
# For GBR rows the GADM level-3 units are matched against the event name and
# written to the 'districts' / 'GID_2' columns (the GID values are GID_3 codes).
# A unit matches when its name or one of its alternative names
#   - occurs verbatim in the event name, or
#   - has a similarity above world_sim_threshold with any single word of it.

# The GBR subset does not depend on the event, so it is built once
uk_subdistricts = subdistricts[subdistricts['GID_0'] == 'GBR']

for i in range(len(disaster)):
    country = disaster['ISO3'][i]
    text = disaster['event_name'][i]

    if country != 'GBR':
        continue

    matched_names = []
    matched_GID_3 = []

    if pd.isnull(text):
        # No event name to search: only the country is known
        disaster.loc[i, 'country_only_info'] = country

    else:
        # Raw string: '\W' in a normal string literal is an invalid escape.
        # Built once per event; every check below runs once per region
        # instead of once per word of the event name.
        word_list = [word for word in re.split(r'\W+', text) if word]

        for name, gid, alt_groups in zip(uk_subdistricts['NAME_3'],
                                         uk_subdistricts['GID_3'],
                                         uk_subdistricts['VARNAME_3_3']):

            # Skip missing names and entries that only repeat the country code
            if not isinstance(name, str) or not name or name == country:
                continue

            # Candidate spellings: official name plus non-empty alternatives
            candidates = [name]
            if isinstance(alt_groups, list):
                candidates += [alt for alt in (''.join(parts) for parts in alt_groups) if alt]

            found = any(candidate in text for candidate in candidates) or any(
                similar(word, candidate) > world_sim_threshold
                for word in word_list
                for candidate in candidates
            )

            if found:
                matched_names.append(name)
                matched_GID_3.append(gid)

    # NOTE(review): names and GIDs are de-duplicated independently (as
    # before), so units sharing a name give fewer names than GIDs.
    disaster.loc[i, 'districts'] = ','.join(dict.fromkeys(matched_names))
    disaster.loc[i, 'GID_2'] = ','.join(dict.fromkeys(matched_GID_3))

# Extract provinces and districts into single columns

In [None]:
# Determine the number of provinces per disaster event to create x columns
# Only non-empty strings are counted; '' and NaN mean "no province found".
num_of_provinces_list = []

for i in range(len(disaster)):
    provinces_string = disaster.provinces.iloc[i]

    if isinstance(provinces_string, str) and provinces_string != '':
        province_count = len(provinces_string.split(','))
        num_of_provinces_list.append(province_count)
        disaster.loc[disaster.index==i,'num_provinces'] = province_count

# default=0: max() of an empty list raises ValueError when nothing matched
max_number_of_provinces = max(num_of_provinces_list, default=0)
print("Max number of provinces:",max_number_of_provinces)

# Determine the number of districts per disaster event to create x columns
num_of_districts_list = []

for i in range(len(disaster)):
    districts_string = disaster.districts.iloc[i]

    if isinstance(districts_string, str) and districts_string != '':
        district_count = len(districts_string.split(','))
        num_of_districts_list.append(district_count)
        disaster.loc[disaster.index==i,'num_districts'] = district_count

max_number_of_districts = max(num_of_districts_list, default=0)
print("Max number of districts:",max_number_of_districts)

In [None]:
# Split up entries if n > 1
# Builds disaster_single with one row per (event, province) pair. The province
# name and its GID go into 'province_single' / 'GID_1_single'. Events whose
# 'provinces' or 'GID_1' entry is not a string are copied over once, unchanged.
#
# Rows are collected in a list and turned into a DataFrame at the end:
# DataFrame.append no longer exists in pandas 2.x, and the former
# ``index == -1`` mask never selected the appended row, so the *_single
# columns were never filled.
single_rows = []

for i in range(len(disaster)):
    row = disaster.iloc[i]
    provinces_value = row['provinces']
    GID_1_value = row['GID_1']

    if not isinstance(provinces_value, str) or not isinstance(GID_1_value, str):
        single_rows.append(row)
        continue

    disaster_names = [x.strip() for x in provinces_value.split(',')]
    disaster_GID_1s = [x.strip() for x in GID_1_value.split(',')]

    for j, province_name in enumerate(disaster_names):
        single_row = row.copy()
        single_row['province_single'] = province_name
        # The two lists can differ in length; leave the GID empty rather than
        # failing when there is no GID at position j.
        single_row['GID_1_single'] = disaster_GID_1s[j] if j < len(disaster_GID_1s) else np.nan
        single_rows.append(single_row)

# reindex fixes the column order and also yields the expected columns when
# there are no rows at all
disaster_single = pd.DataFrame(single_rows).reindex(
    columns=list(disaster.columns) + ['province_single', 'GID_1_single'])

In [None]:
# Save the geolocated events. to_csv is called with its defaults, so the
# DataFrame index is written as an additional first column.
disaster.to_csv(path_IDMC_geolocation + 'IDMC_2008_2021_provinces_districts.csv')

In [None]:
# Reload the saved result, replacing the in-memory DataFrame.
# NOTE(review): with read_csv defaults, empty strings presumably come back as
# NaN - confirm before relying on '' checks after this cell.
disaster = pd.read_csv(path_IDMC_geolocation + 'IDMC_2008_2021_provinces_districts.csv')

# Statistics on data

In [3]:
def _print_coverage(frame):
    """Print how many rows of ``frame`` have province and/or district entries."""
    has_province = frame.provinces.notnull()
    has_district = frame.districts.notnull()
    print("Length IDMC:", len(frame))
    print("Length IDMC with geolocated entries:", len(frame[has_province | has_district]))
    print("Length IDMC with provinces:", len(frame[has_province]))
    print("Length IDMC with districts:", len(frame[has_district]))
    print("Length IDMC with provinces and districts:", len(frame[has_province & has_district]))
    print("Empty Event Name", len(frame[frame['event_name'].isnull()]))


# Coverage for the full saved table
IDMC = pd.read_csv(path_IDMC_geolocation + 'IDMC_2008_2021_provinces_districts.csv')
_print_coverage(IDMC)

# Same figures restricted to flood events up to and including 2018
print('And Flooding between 2008 and 2018:')
IDMC = IDMC[IDMC['Hazard Type']=='Flood']
IDMC = IDMC[IDMC.Year <= 2018]
_print_coverage(IDMC)

Length IDMC: 11731
Length IDMC with geolocated entries: 7915
Length IDMC with provinces: 5893
Length IDMC with districts: 5740
Length IDMC with provinces and districts: 3718
Empty Event Name 1202
And Flooding between 2008 and 2018:
Length IDMC: 3083
Length IDMC with geolocated entries: 1702
Length IDMC with provinces: 1157
Length IDMC with districts: 1074
Length IDMC with provinces and districts: 529
Empty Event Name 688
