# Data Preparation

In [73]:
# Import necessary libraries

%matplotlib inline

import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import plotly.express as px

import plotly.io as pio
pio.renderers.default = 'notebook'

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [74]:
# Load data

kidnap_count = pd.read_csv('/home/muhammad/global-kidnap/Data/National Kidnapping.csv')
kidnap_rate = pd.read_csv('./Data/National Kidnap Rate.csv')
population = pd.read_csv('./Data/Population.csv')

# Check the first five rows of the life expectancy dataset
kidnap_count.head()

Unnamed: 0,location,variable,Unit,Date,Value
0,Rwanda,Kidnapping at the National Level Count,Number,2008,8
1,Rwanda,Kidnapping at the National Level Count,Number,2009,8
2,Rwanda,Kidnapping at the National Level Count,Number,2010,12
3,Rwanda,Kidnapping at the National Level Count,Number,2011,12
4,Rwanda,Kidnapping at the National Level Count,Number,2012,10


In [75]:
# Check the number of rows

kidnap_count.shape

(1435, 5)

From the above, we found that there are **266** rows (records) and **66** attributes (columns) in df_lif_exp dataframe

In [76]:
# Check the first 5 rows in the df_popu dataframe

kidnap_rate.head()

Unnamed: 0,location,variable,Unit,Date,Value
0,Rwanda,Kidnapping at the National Level Rate,"rates per 100,000 population",2008,0.084
1,Rwanda,Kidnapping at the National Level Rate,"rates per 100,000 population",2009,0.082
2,Rwanda,Kidnapping at the National Level Rate,"rates per 100,000 population",2010,0.12
3,Rwanda,Kidnapping at the National Level Rate,"rates per 100,000 population",2011,0.117
4,Rwanda,Kidnapping at the National Level Rate,"rates per 100,000 population",2012,0.095


In [77]:
# Check the number of rows

kidnap_rate.shape

(1435, 5)

In [78]:
population.head()

Unnamed: 0,country,series,Unit,Date,Value
0,American Samoa,"Population, total",Number,1960,20127
1,American Samoa,"Population, total",Number,1961,20605
2,American Samoa,"Population, total",Number,1962,21246
3,American Samoa,"Population, total",Number,1963,22029
4,American Samoa,"Population, total",Number,1964,22850


In [79]:
population.shape

(13717, 5)

From the above, we found that there are **266** rows (records) and **66** attributes (columns) in df_popu dataframe

In [80]:
# Check for available datasets in the GeoPandas dataset function

gpd.datasets.available

['naturalearth_lowres', 'naturalearth_cities', 'nybb']

In [81]:
# Read in the 'naturalearth_lowres' dataset

world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world.head()

Unnamed: 0,pop_est,continent,name,iso_a3,gdp_md_est,geometry
0,920938,Oceania,Fiji,FJI,8374.0,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000..."
1,53950935,Africa,Tanzania,TZA,150600.0,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982..."
2,603253,Africa,W. Sahara,ESH,906.5,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948..."
3,35623680,North America,Canada,CAN,1674000.0,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742..."
4,326625791,North America,United States of America,USA,18560000.0,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000..."


In [82]:
# Check the number of rows

world.shape

(177, 6)

From the above, we found that there are **177** rows (records) and **6** attributes (columns) in the 'world' dataframe

In [83]:
# Create a new dataframe from 'world' picking only ['name', 'iso_a3', 'geometry']

df_world = world[['name', 'iso_a3', 'geometry', 'continent']]
df_world.head()

Unnamed: 0,name,iso_a3,geometry,continent
0,Fiji,FJI,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000...",Oceania
1,Tanzania,TZA,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...",Africa
2,W. Sahara,ESH,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948...",Africa
3,Canada,CAN,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742...",North America
4,United States of America,USA,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000...",North America


In [84]:
# Rename 'iso_a3' and 'geometry' columns

df_world.rename(columns={'name':'country', 'iso_a3':'Country Code', 'geometry':'Geometry', 'continent':'Continent'}, inplace=True)

In [85]:
df_world = df_world[['country', 'Continent']]
df_world.head()

Unnamed: 0,country,Continent
0,Fiji,Oceania
1,Tanzania,Africa
2,W. Sahara,Africa
3,Canada,North America
4,United States of America,North America


In [86]:
# Merge the world dataset on life expectany. This picks only countries (in the life expectancy dataframe) 
# that have matching country code (in the world dataframe)

kidnap = kidnap_rate.merge(kidnap_count, on=['location', 'Date'])
kidnap.head()

Unnamed: 0,location,variable_x,Unit_x,Date,Value_x,variable_y,Unit_y,Value_y
0,Rwanda,Kidnapping at the National Level Rate,"rates per 100,000 population",2008,0.084,Kidnapping at the National Level Count,Number,8
1,Rwanda,Kidnapping at the National Level Rate,"rates per 100,000 population",2009,0.082,Kidnapping at the National Level Count,Number,8
2,Rwanda,Kidnapping at the National Level Rate,"rates per 100,000 population",2010,0.12,Kidnapping at the National Level Count,Number,12
3,Rwanda,Kidnapping at the National Level Rate,"rates per 100,000 population",2011,0.117,Kidnapping at the National Level Count,Number,12
4,Rwanda,Kidnapping at the National Level Rate,"rates per 100,000 population",2012,0.095,Kidnapping at the National Level Count,Number,10


In [87]:
kidnap = kidnap[['location', 'Date', 'Value_x', 'Value_y']]
kidnap.rename(columns={'location':'country', 'Value_x':'Kidnapping Rate', 'Value_y':'Kidnapping Count'}, inplace=True)        
kidnap.head()

Unnamed: 0,country,Date,Kidnapping Rate,Kidnapping Count
0,Rwanda,2008,0.084,8
1,Rwanda,2009,0.082,8
2,Rwanda,2010,0.12,12
3,Rwanda,2011,0.117,12
4,Rwanda,2012,0.095,10


In [88]:
kidnap = kidnap.merge(population, on=['country', 'Date'])
kidnap.head()

Unnamed: 0,country,Date,Kidnapping Rate,Kidnapping Count,series,Unit,Value
0,Rwanda,2008,0.084,8,"Population, total",Number,9524532
1,Rwanda,2009,0.082,8,"Population, total",Number,9782770
2,Rwanda,2010,0.12,12,"Population, total",Number,10039338
3,Rwanda,2011,0.117,12,"Population, total",Number,10293333
4,Rwanda,2012,0.095,10,"Population, total",Number,10549668


In [89]:
kidnap = kidnap[['country', 'Date', 'Kidnapping Rate', 'Kidnapping Count', 'Value']]
kidnap.rename(columns={'Value':'Population', 'Date':'Year'}, inplace=True)
kidnap.head()

Unnamed: 0,country,Year,Kidnapping Rate,Kidnapping Count,Population
0,Rwanda,2008,0.084,8,9524532
1,Rwanda,2009,0.082,8,9782770
2,Rwanda,2010,0.12,12,10039338
3,Rwanda,2011,0.117,12,10293333
4,Rwanda,2012,0.095,10,10549668


In [90]:
kidnap.shape

(1212, 5)

In [91]:
# Merge the world dataset on life expectany. This picks only countries (in the population dataframe) 
# that have matching country code (in the world dataframe)

kidnap = df_world.merge(kidnap, on='country')
kidnap.head()

Unnamed: 0,country,Continent,Year,Kidnapping Rate,Kidnapping Count,Population
0,Canada,North America,2003,10.156,3198,31644028
1,Canada,North America,2004,10.866,3457,31940655
2,Canada,North America,2005,12.113,3896,32243753
3,Canada,North America,2006,13.794,4488,32571174
4,Canada,North America,2007,14.151,4660,32889025


In [92]:
kidnap['kidnap_%_Change'] = kidnap['Kidnapping Count'].pct_change()
kidnap['kidnap_rate_%_Change'] = kidnap['Kidnapping Rate'].pct_change()
kidnap.head()

Unnamed: 0,country,Continent,Year,Kidnapping Rate,Kidnapping Count,Population,kidnap_%_Change,kidnap_rate_%_Change
0,Canada,North America,2003,10.156,3198,31644028,,
1,Canada,North America,2004,10.866,3457,31940655,0.080988,0.069909
2,Canada,North America,2005,12.113,3896,32243753,0.126989,0.114762
3,Canada,North America,2006,13.794,4488,32571174,0.151951,0.138777
4,Canada,North America,2007,14.151,4660,32889025,0.038324,0.025881


In [93]:
# Check for the number of rows

kidnap.shape

(1023, 8)

In [94]:
kidnap.isnull().sum()

country                  0
Continent                0
Year                     0
Kidnapping Rate          0
Kidnapping Count         0
Population               0
kidnap_%_Change         19
kidnap_rate_%_Change    19
dtype: int64

In [105]:
kidnap.nunique(dropna=True)

country                   98
Continent                  6
Year                      16
Kidnapping Rate          777
Kidnapping Count         457
Population              1023
kidnap_%_Change          744
kidnap_rate_%_Change     926
dtype: int64

In [119]:
# Create a new dataframe containing the average life expectancy for each country between 2000 and 2010

df_kidnap = kidnap[['country', 'Continent', 'Kidnapping Rate', 'Kidnapping Count', 'Population',
                    'kidnap_%_Change', 'kidnap_rate_%_Change']].groupby('country',
                as_index=False).mean()
df_kidnap.head()

Unnamed: 0,country,Kidnapping Rate,Kidnapping Count,Population,kidnap_%_Change,kidnap_rate_%_Change
0,Albania,0.207231,6.153846,2910527.0,0.207072,0.334082
1,Algeria,0.631154,227.076923,35599470.0,1.660614,1.268127
2,Armenia,1.280333,37.4,2924995.0,0.026419,0.034432
3,Australia,2.399556,562.333333,23477830.0,0.005676,-0.125083
4,Austria,0.06,5.066667,8457161.0,0.225729,0.215815


From the above, we found that there are **98 unique countries**, the countries in the dataset are spread across **6 continents** and the data spans over **16 years**.

In [125]:
fig = px.bar(kidnap, x='kidnap_%_Change', y='country',
             hover_data=['Population', 'Kidnapping Rate', ],  
             color='Kidnapping Count', animation_frame='Year',
             labels={'Percentage Change':'Percentage Change in Life Expectancy'}, height=400)
fig.show()

In [122]:
df_kidnap.nlargest(5, 'Kidnapping Rate')

Unnamed: 0,country,Kidnapping Rate,Kidnapping Count,Population,kidnap_%_Change,kidnap_rate_%_Change
50,Lebanon,33.506833,1925.444444,5632389.0,1.347856,1.652004
90,Turkey,16.96934,16498.428571,72358270.0,7.540374,1.16683
48,Kuwait,12.523167,309.5,2470905.0,7.791248,2.543802
17,Canada,11.495867,3907.133333,34006740.0,0.015365,0.004266
8,Belgium,9.9396,1087.4,10889740.0,1.802403,0.046529


In [123]:
df_kidnap.nsmallest(5, 'Kidnapping Rate')

Unnamed: 0,country,Kidnapping Rate,Kidnapping Count,Population,kidnap_%_Change,kidnap_rate_%_Change
84,Sudan,0.00616,616.0,33060840.0,55.0,-0.706667
78,Senegal,0.014,2.0,14578450.0,0.0,-0.854167
54,Madagascar,0.023167,5.5,22671250.0,,
59,Myanmar,0.024143,12.428571,52281690.0,,
26,Finland,0.030062,1.625,5372450.0,,


In [124]:
# Choropleth Map of Life Expectancy for Countries with Year ranging from 2000 to 2010
fig = px.choropleth(kidnap,locations='country',locationmode='country names', 
                    color='Kidnapping Rate', color_continuous_scale='Viridis', animation_frame='Year',
                    hover_data=['Kidnapping Count', 'Population', 'Continent'])
fig.update_layout(title='Choropleth Map of Life Expectancy for Countries with Year ranging from 2000 to 2010',
                  template="simple_white")
fig.show()

In [61]:
africa = kidnap[kidnap['Continent'] == 'Africa']
africa.head()

Unnamed: 0,country,Continent,Year,Kidnapping Rate,Kidnapping Count,Population
58,Kenya,Africa,2004,0.659,235,35635267
59,Kenya,Africa,2005,0.415,152,36624897
60,Kenya,Africa,2006,0.584,220,37649039
61,Kenya,Africa,2007,0.447,173,38705934
62,Kenya,Africa,2008,0.183,73,39791984


In [62]:
africa.nunique(dropna=True)

country              18
Continent             1
Year                 16
Kidnapping Rate     107
Kidnapping Count     98
Population          112
dtype: int64

In [63]:
# Choropleth Map of Life Expectancy for Countries with Year ranging from 2000 to 2010
fig = px.choropleth(africa,locations='country',locationmode='country names', 
                    color='Kidnapping Count', color_continuous_scale='Viridis', animation_frame='Year',
                    hover_data=['Kidnapping Rate', 'Population', 'Continent'])
fig.update_layout(title='Choropleth Map of Life Expectancy for Countries with Year ranging from 2000 to 2010',
                  template="simple_white")
fig.show()

## Task 1.2.2:

In [64]:
# Create a new dataframe for countires having population greater than 295,516,599

europe = kidnap[kidnap['Continent']=='Europe']
europe.head()

Unnamed: 0,country,Continent,Year,Kidnapping Rate,Kidnapping Count,Population
235,France,Europe,2003,3.371,2031,62244880
236,France,Europe,2004,3.531,2143,62704901
237,France,Europe,2005,3.292,2012,63179356
238,France,Europe,2006,3.72,2288,63621376
239,France,Europe,2007,3.409,2109,64016227


In [65]:
# Choropleth Map of Life Expectancy for Countries with Year ranging from 2000 to 2010
fig = px.choropleth(europe,locations='country',locationmode='country names', 
                    color='Kidnapping Count', color_continuous_scale='Viridis', animation_frame='Year',
                    hover_data=['Kidnapping Rate', 'Population', 'Continent'])
fig.update_layout(title='Choropleth Map of Life Expectancy for Countries with Year ranging from 2000 to 2010',
                  template="simple_white")
fig.show()

## Task 1.2.3: 

In [184]:
population_2005_3 = df_merged_data[df_merged_data['Population'].between(63179356, 1147609924)]
population_2005_3.head()

Unnamed: 0,Country Name,Population,Life Expectancy
3,United States,295516599.0,75.0
7,Indonesia,226289468.0,65.579
17,Russian Federation,143518814.0,58.92
23,Mexico,106005199.0,72.575
25,Brazil,186127108.0,68.166


In [185]:
population_2005_3.shape

(18, 3)

In [225]:
# Choropleth Map of the World
fig = px.choropleth(population_2005_3,locations='Country Name',locationmode='country names',
                    color='Life Expectancy', color_continuous_scale='Viridis',)
fig.update_layout(title='Choropleth Map of Life Expectancy for Countries having population between 63179356 and 1147609924 in the year 2005',
                  template="simple_white")
fig.show()

## Task 1.2.4:

In [187]:
# Check for the country with the highest population in the year 2005

def find_max_country(df_merged_data):
    return df_merged_data[df_merged_data['Population'] == max(df_merged_data['Population'])].index[0]

find_max_country(df_merged_data)

print(df_merged_data.loc[find_max_country(df_merged_data)])

Country Name              China
Population         1303720000.0
Life Expectancy          71.297
Name: 134, dtype: object


In [188]:
# Create a new dataframe for the country with the highest population in the year 2005 
# (taking into consideration records from 2000 to 2010)

df_China = final_data[final_data['Country Name']=='China']
df_China.head()

Unnamed: 0,Country Name,Country Code,Geometry,Year,Life Expectancy,Population
134,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2000,69.595,1262645000.0
301,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2001,69.981,1271850000.0
468,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2002,70.351,1280400000.0
635,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2003,70.697,1288400000.0
802,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2004,71.013,1296075000.0


In [190]:
df_China.tail()

Unnamed: 0,Country Name,Country Code,Geometry,Year,Life Expectancy,Population
1637,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2006,71.551,1311020000.0
1804,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2007,71.788,1317885000.0
1971,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2008,72.017,1324655000.0
2138,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2009,72.244,1331260000.0
2305,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2010,72.476,1337705000.0


In [192]:
df_China.nunique(dropna=True)

Country Name        1
Country Code        1
Geometry            1
Year               11
Life Expectancy    11
Population         11
dtype: int64

In [193]:
df_China['Percentage Change'] = df_China['Life Expectancy'].pct_change()
df_China.head()

Unnamed: 0,Country Name,Country Code,Geometry,Year,Life Expectancy,Population,Percentage Change
134,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2000,69.595,1262645000.0,
301,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2001,69.981,1271850000.0,0.005546
468,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2002,70.351,1280400000.0,0.005287
635,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2003,70.697,1288400000.0,0.004918
802,China,CHN,"MULTIPOLYGON (((109.47521 18.19770, 108.65521 ...",2004,71.013,1296075000.0,0.00447


In [194]:
fig = px.bar(df_China, x='Percentage Change', y='Year',
             hover_data=['Population', 'Life Expectancy'], color='Life Expectancy',
             labels={'Percentage Change':'Percentage Change in Life Expectancy'}, height=400)
fig.show()

## Task 1.2.5: 

In [195]:
# Create a new dataframe containing the average life expectancy for each country between 2000 and 2010

df_Countries = final_data[['Country Name', 'Year', 'Geometry', 'Population', 'Life Expectancy']].groupby('Country Name',
                as_index=False).mean()
df_Countries.head()

Unnamed: 0,Country Name,Population,Life Expectancy
0,Afghanistan,25346330.0,57.090071
1,Albania,3004509.0,72.697929
2,Algeria,33268140.0,71.694
3,Angola,19578290.0,48.144429
4,Argentina,38878950.0,70.999929


In [227]:
# Plot a scatter plot of the average life expectancy for each country between 2000 and 2010 
# (taking the average for each country over the years)

fig = px.scatter(final_data, x='Population', y='Life Expectancy', color='Country Name', animation_frame='Year',)
fig.update_layout(title='Scatter Plot of Population VS Life Expectancy ranging from year 2000 to 2010',
                  template="plotly_dark")
fig.show()