In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [5]:
# Load the comma-separated population file from the notebook's working directory.
dataset = pd.read_csv('population.csv', sep=',')

In [6]:
#### Data Cleaning ####

### 1. Count the missing values in the columns used by the analysis below:
###    Country of origin (and its ISO code), Country of asylum, and
###    Refugees under UNHCR's mandate.
# NOTE(review): isna() only catches real NaN/None cells. Placeholder labels such as
# 'Unknown' / 'UNK' in the origin columns are ordinary strings and are not counted here.
co_null = dataset['Country of origin'].isna().sum()
iso_null = dataset['Country of origin (ISO)'].isna().sum()
ca_null = dataset['Country of asylum'].isna().sum()
ref_null = dataset['Refugees under UNHCR\'s mandate'].isna().sum()

print("Null Value Exploration")
print()
# Each label ends in a space and print() adds its own separator, hence the two-space gap.
print("Null Values in Country of Origin column: ", co_null)
print("Null Values in Country of Origin (ISO) column: ", iso_null)
print("Null Values in Country of Asylum column: ", ca_null)
print("Null Values in Refugees under UNHCR’s mandate column: ", ref_null)

# Derive the verdict from the counts instead of asserting it unconditionally,
# so the message cannot contradict the numbers printed above.
if co_null + iso_null + ca_null + ref_null == 0:
    print("No NULL values!!")
else:
    print("NULL values found - handle them before aggregating.")

Null Value Exploration

Null Values in Country of Origin column:  0
Null Values in Country of Origin (ISO) column:  0
Null Values in Country of Asylum column:  0
Null Values in Refugees under UNHCR’s mandate column:  0
No NULL values!!


In [7]:
# Number of rows recorded for each country of origin.
print(
    "Country of Origin column value counts:",
    dataset['Country of origin'].value_counts(),
    sep="\n",
)

Country of Origin column value counts:
Country of origin
Unknown                   2888
Somalia                   2607
Iraq                      2465
Dem. Rep. of the Congo    2452
Sudan                     2296
                          ... 
Anguilla                     2
New Caledonia                1
Gibraltar                    1
Martinique                   1
Aruba                        1
Name: count, Length: 214, dtype: int64


In [8]:
# Number of rows recorded for each origin ISO code.
print(
    "Country of Origin (ISO) column value counts:",
    dataset['Country of origin (ISO)'].value_counts(),
    sep="\n",
)

Country of Origin (ISO) column value counts:
Country of origin (ISO)
UNK    2888
SOM    2607
IRQ    2465
COD    2452
SDN    2296
       ... 
AIA       2
NCL       1
GIB       1
MTQ       1
ABW       1
Name: count, Length: 214, dtype: int64


In [9]:
# Number of rows recorded for each country of asylum.
print(
    "Country of Asylum column value counts",
    dataset['Country of asylum'].value_counts(),
    sep="\n",
)

Country of Asylum column value counts
Country of asylum
United States of America                                4730
Canada                                                  4619
Sweden                                                  3541
Germany                                                 3246
United Kingdom of Great Britain and Northern Ireland    3010
                                                        ... 
Anguilla                                                   3
Micronesia (Federated States of)                           3
Montserrat                                                 3
Antigua and Barbuda                                        2
Palau                                                      1
Name: count, Length: 191, dtype: int64


#### Count of Refugees from country (Countries with most refugees produced from 1970-2021):

In [10]:
# Sum the refugee column over every row of each origin country, largest total first.
df_refugeesfrom = (
    dataset
    .groupby('Country of origin')["Refugees under UNHCR's mandate"]
    .sum()
    .sort_values(ascending=False)
)
print(df_refugeesfrom.head(20))

Country of origin
Afghanistan               142337769
Unknown                    63802915
Syrian Arab Rep.           50869931
Iraq                       28275778
Ethiopia                   24109910
Somalia                    23436355
Sudan                      19421091
Viet Nam                   16478660
Angola                     15237166
Rwanda                     14626607
Dem. Rep. of the Congo     14567102
South Sudan                14544648
Burundi                    14495970
Myanmar                    12332654
Eritrea                    10549878
Mozambique                  9054599
Liberia                     8455276
Bosnia and Herzegovina      8447226
Western Sahara              6157725
Central African Rep.        5881611
Name: Refugees under UNHCR's mandate, dtype: int64


In [11]:
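## Afghanistan has the highest cumulative refugee total for 1970 - 2021. A more dynamic implementation will be done in the final project.
## Note: these totals add up the yearly figures, so if each row is an end-of-year count, people who stay refugees for several years are counted once per year.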

In [12]:
# Same per-origin totals as a two-column frame (what px.bar expects), largest first.
df_refugeesfrom = (
    dataset
    .groupby('Country of origin')["Refugees under UNHCR's mandate"]
    .sum()
    .reset_index()
    .sort_values(by="Refugees under UNHCR's mandate", ascending=False)
)

# Bar chart of the ten origin countries with the largest totals.
fig = px.bar(
    df_refugeesfrom.head(10),
    x='Country of origin',
    y="Refugees under UNHCR's mandate",
    title='Refugees from Country',
)
fig.show()

In [13]:
# Sum the refugee column over every row of each asylum country, largest total first.
df_refugeesto = (
    dataset
    .groupby('Country of asylum')["Refugees under UNHCR's mandate"]
    .sum()
    .sort_values(ascending=False)
)
print(df_refugeesto.head(20))

Country of asylum
Pakistan                                                78274450
Iran (Islamic Rep. of)                                  71481903
Germany                                                 32174193
Türkiye                                                 26333350
Sudan                                                   24366490
United States of America                                24286625
Dem. Rep. of the Congo                                  22220718
United Rep. of Tanzania                                 16404252
Uganda                                                  16253893
Ethiopia                                                15161788
China                                                   12579276
Somalia                                                 11609068
Kenya                                                   10833860
Lebanon                                                 10503156
France                                                   9647103
Jordan 

In [14]:
## The most popular countries of asylum are Pakistan and Iran, which may look surprising at first.
## This probably makes sense, as they border Afghanistan and refugees generally don't have the financial means to escape to non-neighboring countries.



In [15]:
# Per-asylum-country totals as a two-column frame (what px.bar expects), largest first.
df_refugeesto = (
    dataset
    .groupby('Country of asylum')["Refugees under UNHCR's mandate"]
    .sum()
    .reset_index()
    .sort_values(by="Refugees under UNHCR's mandate", ascending=False)
)

# Bar chart of the ten asylum countries with the largest totals.
fig = px.bar(
    df_refugeesto.head(10),
    x='Country of asylum',
    y="Refugees under UNHCR's mandate",
    title='Countries with most asylum',
)
fig.show()

### Total Refugees under UNHCR's mandate

In [16]:
# Total of the refugee column per year, as a frame with 'Year' as a regular column.
df_years = (
    dataset
    .groupby('Year')["Refugees under UNHCR's mandate"]
    .sum()
    .reset_index()
)

# Line chart of the yearly totals.
fig = px.line(
    df_years,
    x='Year',
    y="Refugees under UNHCR's mandate",
    title="Refugees under UNHCR's Mandate Over Years",
)
fig.show()


In [None]:
### The total number of refugees has varied over the years.

In [17]:
# Total of the refugee column for every (origin, asylum) pair, summed over all rows.
grouped_data = dataset.groupby(['Country of origin', 'Country of asylum'])['Refugees under UNHCR\'s mandate'].sum().reset_index()

# Keep the 200 largest flows so the diagram stays readable.
# groupby returns the pairs sorted by key, so a bare head(200) would keep only the
# alphabetically-first origin countries rather than the most significant flows.
grouped_data = (
    grouped_data
    .sort_values(by='Refugees under UNHCR\'s mandate', ascending=False)
    .head(200)
    .reset_index(drop=True)
)

# Every country that appears on either side of a kept flow.
# Sorted because iterating a set of strings has no guaranteed order, which would
# otherwise let the node order change from one kernel run to the next.
all_countries = sorted(set(grouped_data['Country of origin']) | set(grouped_data['Country of asylum']))

In [18]:
# Position of each country name within all_countries.
country_index = dict(zip(all_countries, range(len(all_countries))))

# Parallel lists, one entry per row of grouped_data: origin position, asylum position, total.
source = list(map(country_index.__getitem__, grouped_data['Country of origin']))
target = list(map(country_index.__getitem__, grouped_data['Country of asylum']))
value = grouped_data["Refugees under UNHCR's mandate"].tolist()

In [19]:
import plotly.graph_objects as go

In [20]:
# Bind Plotly Express's qualitative "Plotly" colour sequence to a short name.
# NOTE(review): `colors` is not referenced by any cell visible here — confirm it is still needed.
colors = px.colors.qualitative.Plotly

In [21]:
# Sankey diagram: nodes are labelled from all_countries, and each link is described by
# the matching entries of the source / target / value lists.
fig = go.Figure()
fig.add_trace(
    go.Sankey(
        node={
            "pad": 15,
            "thickness": 20,
            "line": {"color": "black", "width": 0.5},
            "label": all_countries,
        },
        link={
            "source": source,
            "target": target,
            "value": value,
        },
    )
)

fig.update_layout(title_text="Refugee Flows", font_size=10)
fig.show()