# Data preprocessing to load into Neo4j

In [1]:
import pandas as pd
import os
import csv

## Custom Functions

In [2]:
def get_clean_list_of_files(paht):
    '''
    List the CSV files found directly inside a directory.

    Hidden entries (names starting with ".") and every name that does not
    end in ".csv" are left out. The result holds bare file names, not full
    paths, in the order produced by ``os.listdir``.

    Parameters
    ----------
    paht : str
        Directory to scan.

    Returns
    -------
    list of str
        Names of the visible ``.csv`` files in the directory.
    '''
    csv_names = []
    for entry in os.listdir(paht):
        if entry.startswith("."):
            continue  # hidden file such as .DS_Store
        if entry.endswith(".csv"):
            csv_names.append(entry)
    return csv_names

In [3]:
def countries_nodes(path_file_out, countries):
    '''
    Write the Country node file for the Neo4j bulk import.

    The file starts with the header ``countryId:ID(Country-ID),:LABEL`` and
    then holds one row per item of ``countries`` with the label "Country".
    An existing file at ``path_file_out`` is overwritten.

    Parameters
    ----------
    path_file_out : str
        Destination CSV path.
    countries : iterable of str
        Country identifiers, written in iteration order.
    '''
    # newline='' lets the csv module control line endings; without it the
    # writer's "\r\n" becomes "\r\r\n" on platforms that translate newlines.
    with open(path_file_out, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["countryId:ID(Country-ID)", ":LABEL"])
        writer.writerows([item, "Country"] for item in countries)

In [4]:
def countries_relationships(path_file_out, countries_years):
    '''
    Write the Country -> CountryYear relationship file for the Neo4j import.

    Each item is split by position: everything but the last four characters
    becomes the Country id, the whole item is the CountryYear id, and the
    last four characters are written to the ``:TYPE`` column. An existing
    file at ``path_file_out`` is overwritten.

    Parameters
    ----------
    path_file_out : str
        Destination CSV path.
    countries_years : iterable of str
        CountryYear identifiers; assumed to end in a four-character year
        (e.g. "Spain2015"), which is what the slicing below relies on.
    '''
    # NOTE(review): the relationship type is the year itself (e.g. "2015");
    # confirm this is the intended graph model.
    # newline='' lets the csv module control line endings on every platform.
    with open(path_file_out, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([":START_ID(Country-ID)", ":END_ID(CountryYear-ID)", ":TYPE"])
        writer.writerows([item[0:-4], item, item[-4:]] for item in countries_years)


In [5]:
def process_relationships_neo4j(file_list, column_list, path_in, path_file_out, type_relationship, WRITE_DATA = False):
    '''
    Turn the raw people-of-concern files into one Neo4j relationship file.

    Every input file is read with its first row skipped, so the second row
    supplies the column names. Rows with a missing value in the residence,
    origin, year or total columns are dropped; "*", "0.0",
    "Various/Unknown" and "Stateless" count as missing. Two key columns,
    "CountryYear+origin" and "CountryYear+residence", are built by
    appending the year to the country name.

    Parameters
    ----------
    file_list : list of str
        Names of the input CSV files inside ``path_in``.
    column_list : list of str
        Columns to export, in output order. The header written is
        ``:START_ID``, ``:END_ID``, ``affected_number:int``, ``:TYPE``, so
        three columns are expected (start key, end key, total).
    path_in : str
        Directory that holds the input files.
    path_file_out : str
        Destination CSV. It is only created (and overwritten) when
        ``WRITE_DATA`` is true.
    type_relationship : str
        Value written in the ``:TYPE`` column of every row.
    WRITE_DATA : bool, default False
        When false the data is only scanned and nothing on disk is touched.

    Returns
    -------
    (set, set)
        The distinct country names and the distinct country-year keys seen
        in the kept rows, origin and residence side combined.
    '''
    countries_years = set()
    countries = set()

    # Open the output only when it is really written, so a dry run does not
    # create or touch the file. Mode 'w' replaces any previous export, and
    # newline='' leaves line endings to the csv module / pandas.
    f = open(path_file_out, 'w', newline='') if WRITE_DATA else None
    try:
        if f is not None:
            header = [":START_ID(CountryYear-ID)", ":END_ID(CountryYear-ID)", "affected_number:int", ":TYPE"]
            csv.writer(f).writerow(header)

        for file_in in file_list:

            df = pd.read_csv(os.path.join(path_in, file_in), skiprows=1,
                             na_values=["*", "0.0", "Various/Unknown", "Stateless"])
            df = df[['#country+residence', '#country+origin', '#date+year', "#affected+total"]]
            df = df.dropna(axis=0, how='any')  # remove rows with missing data

            year = df['#date+year'].astype(str)
            df = df.assign(**{
                "#affected+total": df['#affected+total'].astype(int),
                "CountryYear+origin": df["#country+origin"] + year,
                "CountryYear+residence": df["#country+residence"] + year,
            })

            countries.update(df['#country+residence'].unique())
            countries.update(df['#country+origin'].unique())

            countries_years.update(df["CountryYear+origin"].unique())
            countries_years.update(df["CountryYear+residence"].unique())

            if f is not None:
                # assign() returns a new frame, so the label column is never
                # set on a slice of another frame.
                df[column_list].assign(label=type_relationship).to_csv(f, header=False, index=False)
    finally:
        if f is not None:
            f.close()

    return countries, countries_years

In [6]:
def process_nodes_neo4j(file_list, path_in, path_file_out, countries_set, countries_years_set, first_line, index_list, WRITE_DATA = False):
    '''
    Build the CountryYear node file for the Neo4j import from indicator files.

    Every input file is read with its first row skipped, so the second row
    supplies the column names. The country of a file is taken from the
    first value of its '#country+name' column; files for countries outside
    ``countries_set`` are ignored. The remaining data is pivoted to one row
    per country and year with one column per indicator name, keyed by a
    "CountryYear" column (country name followed by the year).

    Only rows whose key is in ``countries_years_set`` are written. Keys of
    ``countries_years_set`` that no indicator file provides get a
    placeholder row with empty indicator values, so that every key has a
    node in the output.

    Parameters
    ----------
    file_list : list of str
        Names of the indicator CSV files inside ``path_in``.
    path_in : str
        Directory that holds the input files.
    path_file_out : str
        Destination CSV. It is only created (and overwritten) when
        ``WRITE_DATA`` is true.
    countries_set : set of str
        Countries whose indicator files are processed.
    countries_years_set : set of str
        CountryYear keys that must appear in the output; assumed to end in
        a four-character year, which the placeholder rows rely on.
    first_line : list of str
        Header row of the output file.
    index_list : list of str
        Columns to export, in output order. It must contain "CountryYear";
        the placeholder rows assume the first three entries are the key,
        the country and the year.
    WRITE_DATA : bool, default False
        When false the data is only scanned and nothing on disk is touched.

    Returns
    -------
    None
    '''
    countries_years = set()

    # Open the output only when it is really written, so a dry run neither
    # creates the file nor appends to an earlier export. Mode 'w' replaces
    # any previous file; newline='' leaves line endings to csv / pandas.
    f = open(path_file_out, 'w', newline='') if WRITE_DATA else None
    try:
        writer = csv.writer(f) if f is not None else None

        if writer is not None:
            writer.writerow(first_line)

        for file_in in file_list:

            df = pd.read_csv(os.path.join(path_in, file_in), skiprows=1, na_values=[""])

            if df.empty:
                continue  # no data rows, so there is no country to read

            country = df['#country+name'].unique()[0]

            if country not in countries_set:
                continue

            df = df.dropna(axis=0, how='any')  # remove rows with missing data

            if df.empty:
                continue  # nothing usable left to pivot

            df = df.pivot_table(values='#indicator+num',
                                index=['#country+name', '#date+year'],
                                columns='#indicator+name')
            df = df.reset_index()

            df["CountryYear"] = df["#country+name"] + df['#date+year'].astype(str)

            # The label is added before the rows are filtered, so the rows
            # that get written carry the :LABEL value the header announces.
            df = df[index_list].assign(label="CountryYear")

            countries_years.update(df["CountryYear"].unique())

            if f is not None:
                df[df["CountryYear"].isin(countries_years_set)].to_csv(f, header=False, index=False)

        if writer is not None:
            # One empty cell per exported indicator column keeps placeholder
            # rows as wide as the data rows.
            placeholders = [""] * max(len(index_list) - 3, 0)

            # Sorted so that repeated runs produce the same file.
            for node in sorted(countries_years_set - countries_years):
                writer.writerow([node, node[0:-4], node[-4:]] + placeholders + ["CountryYear"])
    finally:
        if f is not None:
            f.close()

## Common Variables

In [7]:
# Directory that receives every generated Neo4j import file.
path_out = "../data/processed/neo4j/"
# Passed as WRITE_DATA to the processing functions and checked before the
# country files are written; set to False to only compute the sets.
WRITE_DATA_FLAG = True

## People of concern. Residing
#### Relationship : **RESIDE_IN**

Define variables

In [8]:
# Folder with the raw "residing" files (one CSV per country).
path_in_residing = "../data/raw/residing/"
# Output: RESIDE_IN relationships between CountryYear nodes.
file_out_residing_relationships = path_out + 'relationships_residing.csv'
# Output: Country nodes found in the residing data.
file_out_residing_countries = path_out + 'countries_residing.csv'

In [9]:
# Column order matters: the first two columns land under :START_ID and :END_ID
# of the relationships file, so here the origin key is the start node and the
# residence key is the end node.
column_list_residing = ["CountryYear+origin", "CountryYear+residence", "#affected+total"]

Define a list with all files located in the residing folder:

In [10]:
# Bare file names (not full paths) of the visible .csv files in the folder.
residing_list = get_clean_list_of_files(path_in_residing)
# Show the first four names as a quick sanity check.
residing_list[0:4]

['refugees-residing-lie.csv',
 'refugees-residing-png.csv',
 'refugees-residing-fin.csv',
 'refugees-residing-cyp.csv']

Call the function that processes the raw data and generates the CSV file to load into Neo4j:

In [11]:
# Build the RESIDE_IN relationship file (when WRITE_DATA_FLAG is set) and keep
# the country and country-year sets for the node files generated further down.
countries_residing, countries_years_residing = process_relationships_neo4j(
    file_list=residing_list,
    column_list=column_list_residing,
    path_in=path_in_residing,
    path_file_out=file_out_residing_relationships,
    type_relationship="RESIDE_IN",
    WRITE_DATA=WRITE_DATA_FLAG,
)

In [12]:
# Distinct country names (residence and origin side combined) in the residing data.
len(countries_residing)

222

In [13]:
# Distinct country-year keys (residence and origin side combined) in the residing data.
len(countries_years_residing)

7154

In [14]:
# Write one Country node per country found in the residing data.
if WRITE_DATA_FLAG:
    countries_nodes(file_out_residing_countries, countries_residing)

In [15]:
# Link every Country node to its CountryYear nodes from the residing data.
if WRITE_DATA_FLAG:
    countries_relationships(path_out + 'countries_years_relationship_residing.csv', countries_years_residing)

## People of concern. Originating
#### Relationship : **ORIGINATE_FROM**

Define variables

In [16]:
# Folder with the raw "originating" files (one CSV per country).
path_in_originating = "../data/raw/originating/"
# Output: ORIGINATE_FROM relationships between CountryYear nodes.
file_out_originating_relationships = path_out + 'relationships_originating.csv'
# Output: Country nodes found in the originating data.
file_out_originating_countries = path_out + 'countries_originating.csv'

In [17]:
# Column order matters: the first two columns land under :START_ID and :END_ID
# of the relationships file, so here the residence key is the start node and
# the origin key is the end node (the reverse of the residing export).
column_list_originating = ["CountryYear+residence", "CountryYear+origin",  "#affected+total"]

Define a list with all files located in the originating folder:

In [18]:
# Bare file names (not full paths) of the visible .csv files in the folder.
originating_list = get_clean_list_of_files(path_in_originating)
# Show the first four names as a quick sanity check.
originating_list[0:4]

['refugees-originating-prt.csv',
 'refugees-originating-nzl.csv',
 'refugees-originating-khm.csv',
 'refugees-originating-isr.csv']

Call the function that processes the raw data and generates the CSV file to load into Neo4j:

In [19]:
# Build the ORIGINATE_FROM relationship file (when WRITE_DATA_FLAG is set) and
# keep the country and country-year sets for the node files generated further down.
countries_originating, countries_years_originating = process_relationships_neo4j(
    file_list=originating_list,
    column_list=column_list_originating,
    path_in=path_in_originating,
    path_file_out=file_out_originating_relationships,
    type_relationship="ORIGINATE_FROM",
    WRITE_DATA=WRITE_DATA_FLAG,
)

In [20]:
# Distinct country names (residence and origin side combined) in the originating data.
len(countries_originating)

198

In [21]:
# Distinct country-year keys (residence and origin side combined) in the originating data.
len(countries_years_originating)

6689

In [22]:
# Write one Country node per country found in the originating data.
if WRITE_DATA_FLAG:
    countries_nodes(file_out_originating_countries, countries_originating)

In [23]:
# Link every Country node to its CountryYear nodes from the originating data.
if WRITE_DATA_FLAG:
    countries_relationships(path_out + 'countries_years_relationship_originating.csv', countries_years_originating)

## World Bank Indicators. Nodes properties

Define variables

In [24]:
# Folder with the raw World Bank indicator files (one CSV per country).
path_in_indicators = "../data/raw/indicators/"
# Output: CountryYear nodes for the keys collected from the residing data.
file_out_indicators_residing = path_out + 'countries_nodes_residing.csv'
# Output: CountryYear nodes for the keys collected from the originating data.
file_out_indicators_originating = path_out + 'countries_nodes_originating.csv'

Define a list with all files located in the indicators folder:

In [25]:
# Bare file names (not full paths) of the visible .csv files in the folder.
indicators_list = get_clean_list_of_files(path_in_indicators)
# Show the first four names as a quick sanity check.
indicators_list[0:4]

['world-bank-indicators-for-netherlands.csv',
 'world-bank-indicators-for-korea-rep.csv',
 'world-bank-indicators-for-antigua-and-barbuda.csv',
 'world-bank-indicators-for-gambia-the.csv']

Define a list with the indicators to use:

In [26]:
# Columns exported for each CountryYear node, in output order.
index_list = [
    # Key columns: the generated CountryYear key, the country name and the year.
    "CountryYear",
    "#country+name",
    "#date+year",
    # Indicator columns, named exactly as in the '#indicator+name' values.
    "Population, total",
    "International migrant stock (% of population)",
    "Population growth (annual %)",
    "Urban population (% of total)",
]

In [27]:
# Header row of the CountryYear node files: one entry per column of
# index_list, followed by the :LABEL column.
# NOTE(review): "countryearId" looks like a typo for "countryYearId"; it is
# left untouched because it becomes the property name in the imported graph.
first_line_file = [
    "countryearId:ID(CountryYear-ID)",
    "country",
    "year:int",
    "population:float",
    "int_migrant_stock:float",
    "pop_growth_percentage:float",
    "urban_pop_percentage:float",
    ":LABEL",
]

Call the function that processes the raw data and generates the CSV files to load into Neo4j:

### Residing

In [28]:
# CountryYear nodes for every country-year key collected from the residing data.
process_nodes_neo4j(
    file_list=indicators_list,
    path_in=path_in_indicators,
    path_file_out=file_out_indicators_residing,
    countries_set=countries_residing,
    countries_years_set=countries_years_residing,
    first_line=first_line_file,
    index_list=index_list,
    WRITE_DATA=WRITE_DATA_FLAG,
)

### Originating

In [29]:
# CountryYear nodes for every country-year key collected from the originating data.
process_nodes_neo4j(
    file_list=indicators_list,
    path_in=path_in_indicators,
    path_file_out=file_out_indicators_originating,
    countries_set=countries_originating,
    countries_years_set=countries_years_originating,
    first_line=first_line_file,
    index_list=index_list,
    WRITE_DATA=WRITE_DATA_FLAG,
)