# Data-driven Network Science:
Code for imputing UNHCR data

To run this notebook, you need the 'unhcr_unimputed.csv' file, which is in the GitHub repo. 'unhcr_unimputed.csv' is the output of running unhcr_cleaning_data.R on the 'population.csv' file.

## Preliminaries

In [1]:
## libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [11]:
## current directory
## NOTE(review): this is a machine-specific absolute path; change it to the local
## folder that holds 'unhcr_unimputed.csv' (the file is read by relative path below).
%cd /Users/teddyyankov/Library/CloudStorage/OneDrive-Nexus365/Data-Driven Network Science/summative_code

/Users/teddyyankov/Library/CloudStorage/OneDrive-Nexus365/Data-Driven Network Science/summative_code


## Imputing
You need to download 'unhcr_unimputed.csv' from the GitHub repository into your working directory.

In [3]:
## read the unimputed table from the working directory
dat_unimputed = pd.read_csv(filepath_or_buffer="unhcr_unimputed.csv")

In [4]:
## establishing the lower and upper bounds handed to the imputer as min_value / max_value
## NOTE(review): 3737369.0 is hard-coded; presumably the largest observed 'forced_mig' value — confirm
min_val, max_val = 0, 3737369.0

In [5]:
## working copy of the data without the two composite key columns
dat_imputed = dat_unimputed.copy().drop(columns=["orig_dest", "orig_dest_year"])

## replace the 'orig' and 'dest' codes with integer labels
## (one encoder object is refit for each column in turn, 'orig' first, then 'dest')
label_encoder = LabelEncoder()
for country_col in ('orig', 'dest'):
    dat_imputed[country_col] = label_encoder.fit_transform(dat_imputed[country_col])

In [6]:
## imputer configuration: fixed seed, posterior sampling enabled, skip_complete on,
## imputed values bounded by min_val and max_val
imputer_settings = dict(
    random_state=0,
    skip_complete=True,
    sample_posterior=True,
    min_value=min_val,
    max_value=max_val,
)
imputer = IterativeImputer(**imputer_settings)

## fit on the full table and obtain the completed values
dat_imputed2 = imputer.fit_transform(dat_imputed)

## overwrite every cell of the dataframe with the completed values
dat_imputed.iloc[:, :] = dat_imputed2

In [7]:
## reversing label encoding
## The encoded 'orig' / 'dest' columns (and 'year') are dropped from the imputed
## frame and replaced by the original identifier columns of the unimputed frame.
## The identifier columns are selected by name rather than by position, so an extra
## or reordered column in the CSV cannot silently shift which columns are taken.
id_col_names = ['orig_dest_year', 'year', 'orig_dest', 'orig', 'dest']
id_cols = dat_unimputed[id_col_names]

## drop() already returns a new frame, so dat_imputed itself is left untouched
dat_imputed2 = dat_imputed.drop (columns = ['orig', 'dest', 'year'])

## both frames share the same row index, so a column-wise concat lines the rows up
dat_imputed2 = pd.concat ([id_cols, dat_imputed2], axis = 1)
display (dat_imputed2)

Unnamed: 0,orig_dest_year,year,orig_dest,orig,dest,forced_mig
0,CHL_DZA_1978,1978,CHL_DZA,CHL,DZA,2000.0
1,ESH_DZA_1978,1978,ESH_DZA,ESH,DZA,50000.0
2,COD_AGO_1978,1978,COD_AGO,COD,AGO,110550.0
3,NAM_AGO_1978,1978,NAM_AGO,NAM,AGO,30000.0
4,ZAF_AGO_1978,1978,ZAF_AGO,ZAF,AGO,1000.0
...,...,...,...,...,...,...
121542,HTI_AIA_2023,2023,HTI_AIA,HTI,AIA,0.0
121543,VEN_AIA_2023,2023,VEN_AIA,VEN,AIA,0.0
121544,VEN_ABW_2023,2023,VEN_ABW,VEN,ABW,0.0
121545,VEN_CUW_2023,2023,VEN_CUW,VEN,CUW,0.0


In [8]:
## spot-check a single dyad to judge whether its imputed values look plausible
is_mmr_bgd = dat_imputed2['orig_dest'].eq('MMR_BGD')
display(dat_imputed2.loc[is_mmr_bgd])

Unnamed: 0,orig_dest_year,year,orig_dest,orig,dest,forced_mig
15,MMR_BGD_1978,1978,MMR_BGD,MMR,BGD,160000.0
3526,MMR_BGD_1991,1991,MMR_BGD,MMR,BGD,40000.0
4107,MMR_BGD_1992,1992,MMR_BGD,MMR,BGD,244994.0
4819,MMR_BGD_1993,1993,MMR_BGD,MMR,BGD,198823.0
5936,MMR_BGD_1994,1994,MMR_BGD,MMR,BGD,116074.0
7264,MMR_BGD_1995,1995,MMR_BGD,MMR,BGD,50985.0
8868,MMR_BGD_1996,1996,MMR_BGD,MMR,BGD,30578.0
10600,MMR_BGD_1997,1997,MMR_BGD,MMR,BGD,21497.0
12653,MMR_BGD_1998,1998,MMR_BGD,MMR,BGD,22174.0
14952,MMR_BGD_1999,1999,MMR_BGD,MMR,BGD,22131.0


In [9]:
## sample of 'forced_mig' from the completed table
## (drawn from all rows of dat_imputed, not only the rows that were imputed)
## random_state is fixed so the same ten rows are shown on every run
dat_imputed['forced_mig'].sample (10, random_state = 0)

25439       5.0
117010     25.0
25528      25.0
15556     199.0
113340     16.0
12983      14.0
111939     20.0
68458      16.0
102857     22.0
44587      11.0
Name: forced_mig, dtype: float64

In [10]:
## write the completed table to disk, leaving out the row index
dat_imputed2.to_csv(path_or_buf='dat_imputed.csv', index=False)