# Data preprocessing to load into Neo4j

In [69]:
import pandas as pd
import os
import csv

## Custom Functions

In [70]:
def get_clean_list_of_files(paht):
    """Return the names of the visible ``.csv`` entries found directly in ``paht``.

    Entries whose name starts with a dot are skipped, and so is anything that
    does not end in ``.csv``. Names are returned without the directory prefix,
    in the order ``os.listdir`` yields them.
    """
    csv_names = []
    for entry in os.listdir(paht):
        if entry.startswith("."):
            continue
        if entry.endswith(".csv"):
            csv_names.append(entry)
    return csv_names

In [71]:
def process_relationships_neo4j(file_list, column_list, path_in, path_file_out, WRITE_DATA = False):
    """Clean the raw relationship files and optionally write a Neo4j relationship CSV.

    Each input file is read with its first line skipped (so the second line
    supplies the column names), reduced to ``column_list``, and stripped of
    every row that has a missing value in one of those columns. The strings
    ``"*"`` and ``"0.0"`` are treated as missing on read.

    Parameters
    ----------
    file_list : list of str
        File names located in ``path_in``.
    column_list : list of str
        Columns to keep, in output order. Must contain
        ``'#country+origin'``, ``'#country+residence'`` and
        ``'#affected+total'``; the order should line up with the header row
        written below (start id, end id, year, affected number).
    path_in : str
        Directory holding the input files.
    path_file_out : str
        Destination CSV. It is only touched when ``WRITE_DATA`` is true, in
        which case any existing file is replaced.
    WRITE_DATA : bool, default False
        When false the files are still parsed and the country set is still
        returned, but nothing is written to disk.

    Returns
    -------
    set
        Every distinct value seen in the residence and origin columns of the
        rows that survived cleaning.
    """
    countries = set()

    first_line_file = [":START_ID(Country)", ":END_ID(Country)", "year:int", "affected_number:float"]

    # Open the destination only when we are really writing, so a dry run
    # leaves the file system alone. newline='' is what the csv module expects
    # for files it writes to.
    f = open(path_file_out, 'w', newline='') if WRITE_DATA else None
    try:
        if f is not None:
            csv.writer(f).writerow(first_line_file)

        for file_in in file_list:
            df = pd.read_csv(os.path.join(path_in, file_in), skiprows=1, na_values=["*", "0.0"])
            # .copy() so the assignment below acts on an independent frame
            # rather than on a slice of the frame that was read.
            df = df[column_list].dropna(axis=0, how='any').copy()
            df['#affected+total'] = df['#affected+total'].astype(int)

            countries.update(df['#country+residence'].unique())
            countries.update(df['#country+origin'].unique())

            if f is not None:
                df.to_csv(f, header=False, index=False)
    finally:
        if f is not None:
            f.close()

    return countries

In [94]:
def process_nodes_neo4j(file_list, path_in, path_file_out, countries_list, WRITE_DATA = False):
    """Load the indicator files of the requested countries as one long table.

    Each file is read with its first line skipped (so the second line supplies
    the column names). The country of a file is taken from the first value of
    its ``'#country+name'`` column; files whose country is not in
    ``countries_list`` are ignored. Rows with any missing value are dropped
    from the files that are kept.

    Parameters
    ----------
    file_list : list of str
        File names located in ``path_in``.
    path_in : str
        Directory holding the input files.
    path_file_out : str
        Destination CSV. Only touched when ``WRITE_DATA`` is true, in which
        case it is replaced by a file holding the header row.
    countries_list : container of str
        Country names to keep (membership is tested with ``in``).
    WRITE_DATA : bool, default False
        Whether to (re)create ``path_file_out``.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows of every matching file, concatenated in
        ``file_list`` order with a fresh integer index. An empty frame is
        returned when no file matches.
    """
    first_line_file = ["countryId:ID(Country)", "year:int", "population:int", "int_migrant_stock", "pop_growth", "GDP_dollars", "inflation_percentage"]

    if WRITE_DATA:
        # NOTE(review): only the header is written. The reshaping that would
        # turn the long indicator rows into columns matching this header is
        # not implemented in this function, so no data rows are emitted here.
        with open(path_file_out, 'w', newline='') as f:
            csv.writer(f).writerow(first_line_file)

    frames = []
    for file_in in file_list:
        df = pd.read_csv(os.path.join(path_in, file_in), skiprows=1, na_values=[""])

        if df.empty:
            # Nothing to identify the country from; skip instead of failing on [0].
            continue

        country = df['#country+name'].unique()[0]

        if country in countries_list:
            frames.append(df.dropna(axis=0, how='any'))  # remove rows with missing data

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

## Common Variables

In [73]:
# Directory prefix for the CSV files this notebook generates for Neo4j.
path_out = "../data/processed/neo4j/"
# Passed as WRITE_DATA to the processing functions called further down.
WRITE_DATA_FLAG = True

## People of concern. Residing
#### Relationship : **RESIDE_IN**

Define variables

In [74]:
# Raw input folder and output CSV path for the "residing" relationships.
# NOTE(review): 'relationsips' looks like a typo for 'relationships'; left
# unchanged because other steps may reference this exact file name.
path_in_residing = "../data/raw/residing/"
file_out_residing = path_out + 'relationsips_residing.csv'

In [75]:
# Columns kept from each raw file, in output order: origin, residence, year, total.
column_list_residing = ['#country+origin', '#country+residence', '#date+year', "#affected+total"]

Define a list with all files located in the residing folder:

In [76]:
# CSV file names found in the residing folder; preview the first four.
residing_list = get_clean_list_of_files(path_in_residing)
residing_list[:4]

['refugees-residing-lie.csv',
 'refugees-residing-png.csv',
 'refugees-residing-fin.csv',
 'refugees-residing-cyp.csv']

Call function to process raw data and generate the required csv file to load into neo4j:

In [77]:
# Returns the set of country names found in the cleaned rows; the relationship
# rows are written to file_out_residing when WRITE_DATA_FLAG is True.
countries_residing = process_relationships_neo4j(residing_list, column_list_residing, path_in_residing, file_out_residing, WRITE_DATA_FLAG)

In [83]:
len(countries_residing)

224

In [None]:
# Write one quoted country ID per line under a Neo4j ID header.
# Iterates countries_residing (the set built above); sorted so the file is
# identical from run to run.
with open(path_out + 'countries_residing.csv', 'w') as f:
    f.write("countryId:ID(Country)\n")
    for item in sorted(countries_residing):
        # Double any embedded quote so the quoted field stays valid CSV.
        f.write('"' + item.replace('"', '""') + '"\n')

## People of concern. Originating
#### Relationship : **ORIGINATE_FROM**

Define variables

In [78]:
# Raw input folder and output CSV path for the "originating" relationships.
# NOTE(review): 'relationsips' looks like a typo for 'relationships'; left
# unchanged because other steps may reference this exact file name.
path_in_originating = "../data/raw/originating/"
file_out_originating = path_out + 'relationsips_originating.csv'

In [79]:
# Same columns as the residing list, but with residence first and origin second.
column_list_originating = ['#country+residence', '#country+origin', '#date+year', "#affected+total"]

Define a list with all files located in the originating folder:

In [80]:
# CSV file names found in the originating folder; preview the first four.
originating_list = get_clean_list_of_files(path_in_originating)
originating_list[:4]

['refugees-originating-prt.csv',
 'refugees-originating-nzl.csv',
 'refugees-originating-khm.csv',
 'refugees-originating-isr.csv']

Call function to process raw data and generate the required csv file to load into neo4j:

In [82]:
# Returns the set of country names found in the cleaned rows; the relationship
# rows are written to file_out_originating when WRITE_DATA_FLAG is True.
countries_originating = process_relationships_neo4j(originating_list, column_list_originating, path_in_originating, file_out_originating, WRITE_DATA_FLAG)

In [84]:
len(countries_originating)

199

In [None]:
# Write one quoted country ID per line under a Neo4j ID header.
# Iterates countries_originating (the set built above); sorted so the file is
# identical from run to run.
with open(path_out + 'countries_originating.csv', 'w') as f:
    f.write("countryId:ID(Country)\n")
    for item in sorted(countries_originating):
        # Double any embedded quote so the quoted field stays valid CSV.
        f.write('"' + item.replace('"', '""') + '"\n')

## World Bank Indicators

Define variables

In [88]:
# Raw indicator folder, plus one output path per country set (residing / originating).
path_in_indicators = "../data/raw/indicators/"
file_out_indicators_residing = path_out + 'countries_nodes_residing.csv'
file_out_indicators_originating = path_out + 'countries_nodes_originating.csv'

Define a list with all files located in the indicators folder:

In [89]:
# CSV file names found in the indicators folder; preview the first four.
indicators_list = get_clean_list_of_files(path_in_indicators)
indicators_list[:4]

['world-bank-indicators-for-netherlands.csv',
 'world-bank-indicators-for-korea-rep.csv',
 'world-bank-indicators-for-antigua-and-barbuda.csv',
 'world-bank-indicators-for-gambia-the.csv']

Call function to process raw data and generate the required csv file to load into neo4j:

In [98]:
# NOTE(review): indicators_list[0:1] passes only the first indicator file, so a
# single country is processed here — presumably a debugging slice; confirm.
countries_indicators_residing = process_nodes_neo4j(indicators_list[0:1], path_in_indicators, file_out_indicators_residing, countries_residing, WRITE_DATA_FLAG)

In [99]:
countries_indicators_residing

Unnamed: 0,#country+name,#country+code,#date+year,#indicator+name,#indicator+code,#indicator+num
0,Netherlands,NLD,2017,"Population, total",SP.POP.TOTL,1.713285e+07
1,Netherlands,NLD,2016,"Population, total",SP.POP.TOTL,1.703031e+07
2,Netherlands,NLD,2015,"Population, total",SP.POP.TOTL,1.693992e+07
3,Netherlands,NLD,2014,"Population, total",SP.POP.TOTL,1.686501e+07
4,Netherlands,NLD,2013,"Population, total",SP.POP.TOTL,1.680443e+07
5,Netherlands,NLD,2012,"Population, total",SP.POP.TOTL,1.675496e+07
6,Netherlands,NLD,2011,"Population, total",SP.POP.TOTL,1.669307e+07
7,Netherlands,NLD,2010,"Population, total",SP.POP.TOTL,1.661539e+07
8,Netherlands,NLD,2009,"Population, total",SP.POP.TOTL,1.653039e+07
9,Netherlands,NLD,2008,"Population, total",SP.POP.TOTL,1.644559e+07


In [124]:
# Indicator names used below to select columns from the pivoted table.
index_list = ["Population, total", 
              "International migrant stock (% of population)", 
              "Population growth (annual %)", 
              "GDP per capita, PPP (current international $)",
              "GINI index (World Bank estimate)",
              "Urban population (% of total)"]

In [125]:
# Reshape the long indicator table: one row per (country, year), one column
# per indicator name, cell values taken from '#indicator+num'.
df = countries_indicators_residing.pivot_table(
    values='#indicator+num',
    index=['#country+name', '#date+year'],
    columns='#indicator+name',
)

In [116]:
# NOTE(review): '#country+name' is an index level of the pivoted `df`, not a
# column, so with this entry present the later `df[index_list]` selection would
# raise KeyError on a top-to-bottom run. The index_list output displayed further
# down does not contain it — this looks like a stale cell; confirm and remove.
index_list.insert(0, '#country+name')

In [117]:
# NOTE(review): '#date+year' is an index level of the pivoted `df`, not a
# column, so with this entry present the later `df[index_list]` selection would
# raise KeyError on a top-to-bottom run. The index_list output displayed further
# down does not contain it — this looks like a stale cell; confirm and remove.
index_list.insert(1, '#date+year')

In [126]:
index_list

['Population, total',
 'International migrant stock (% of population)',
 'Population growth (annual %)',
 'GDP per capita, PPP (current international $)',
 'GINI index (World Bank estimate)',
 'Urban population (% of total)']

In [127]:
# Keep only the columns named in index_list, in that order.
df = df[index_list]

In [128]:
df

Unnamed: 0_level_0,#indicator+name,"Population, total",International migrant stock (% of population),Population growth (annual %),"GDP per capita, PPP (current international $)",GINI index (World Bank estimate),Urban population (% of total)
#country+name,#date+year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Netherlands,1960,11486631.0,,1.952179,,,59.752
Netherlands,1961,11638712.0,,1.315295,,,60.02
Netherlands,1962,11805689.0,,1.424475,,,60.204
Netherlands,1963,11965966.0,,1.348492,,,60.387
Netherlands,1964,12127120.0,,1.337781,,,60.57
Netherlands,1965,12294732.0,,1.372661,,,60.753
Netherlands,1966,12456251.0,,1.305171,,,60.936
Netherlands,1967,12598201.0,,1.133144,,,61.118
Netherlands,1968,12729721.0,,1.038547,,,61.3
Netherlands,1969,12877984.0,,1.157969,,,61.481


In [None]:
# Re-declares index_list with the same six indicator names as the earlier
# definition, which resets it to indicator names only.
index_list = ["Population, total", 
              "International migrant stock (% of population)", 
              "Population growth (annual %)", 
              "GDP per capita, PPP (current international $)",
              "GINI index (World Bank estimate)",
              "Urban population (% of total)"]

In [137]:
# Sequential row number starting at 0; written out under the ':ID' header below.
df['New_ID'] = range(len(df))

In [138]:
df

Unnamed: 0_level_0,#indicator+name,"Population, total",International migrant stock (% of population),Population growth (annual %),"GDP per capita, PPP (current international $)",GINI index (World Bank estimate),Urban population (% of total),New_ID
#country+name,#date+year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Netherlands,1960,11486631,,1.952179,,,59.752,0
Netherlands,1961,11638712,,1.315295,,,60.02,1
Netherlands,1962,11805689,,1.424475,,,60.204,2
Netherlands,1963,11965966,,1.348492,,,60.387,3
Netherlands,1964,12127120,,1.337781,,,60.57,4
Netherlands,1965,12294732,,1.372661,,,60.753,5
Netherlands,1966,12456251,,1.305171,,,60.936,6
Netherlands,1967,12598201,,1.133144,,,61.118,7
Netherlands,1968,12729721,,1.038547,,,61.3,8
Netherlands,1969,12877984,,1.157969,,,61.481,9


In [139]:
# Cast before opening the file so a failed cast does not leave a header-only file behind.
df['Population, total'] = df['Population, total'].astype(int)

# NOTE(review): this is the same path the country-ID list cell earlier in the
# notebook writes to, so this overwrites that file — confirm that is intended.
# newline='' is what the csv module expects for files it writes to.
with open(path_out + 'countries_residing.csv', 'w', newline='') as f:

    first_line_file = ["country:LABEL(Country)",
                       "year:int",
                       "population:int",
                       "int_migrant_stock:float",
                       "pop_growth_percentage:float",
                       "GDP_dollars:float",
                       "GINI:float",
                       "urban_pop_percentage:float",
                       ":ID"]
    csv.writer(f).writerow(first_line_file)
    # The index is written too, so each row starts with the (country, year)
    # index values followed by the data columns.
    df.to_csv(f, header=False)

In [136]:
# Same call as for the residing set, but over the full indicator file list and
# filtered by the countries found in the originating data.
countries_indicators_originating = process_nodes_neo4j(indicators_list, path_in_indicators, file_out_indicators_originating, countries_originating, WRITE_DATA_FLAG)