# Life Expectancy Regression Project

## Business Context

## 1. Exploratory Data Analysis 

### 1.1 Importing the Necessary Libraries and Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels

### 1.2 Loading the Population Data
Source: https://worldpopulationreview.com/

In [2]:
# Read the population-by-country CSV into a pandas DataFrame
population_df = pd.read_csv('data/population_by_country_2019.csv')

# Report the dimensions of the loaded data as (rows, columns)
print(population_df.shape)

# Preview the first ten rows (rich display as the cell's final expression)
population_df.head(n=10)

(232, 18)


Unnamed: 0,cca2,name,pop2021,pop2020,pop2050,pop2030,pop2019,pop2015,pop2010,pop2000,pop1990,pop1980,pop1970,area,Density,GrowthRate,WorldPercentage,rank
0,CN,China,1444216.107,1439323.776,1402405.17,1464340.159,1433783.686,1406847.87,1368810.615,1290550.765,1176883.674,1000089.235,827601.394,9706961,148.7815,1.0034,0.1834,1
1,IN,India,1393409.038,1380004.385,1639176.033,1503642.322,1366417.754,1310152.403,1234281.17,1056575.549,873277.798,698952.844,555189.792,3287590,423.8391,1.0097,0.1769,2
2,US,United States,332915.073,331002.651,379419.102,349641.876,329064.917,320878.31,309011.475,281710.909,252120.309,229476.354,209513.341,9372610,35.52,1.0058,0.0423,3
3,ID,Indonesia,276361.783,273523.615,330904.664,299198.43,270625.568,258383.256,241834.215,211513.823,181413.402,147447.836,114793.178,1904569,145.1046,1.0104,0.0351,4
4,PK,Pakistan,225199.937,220892.34,338013.196,262958.794,216565.318,199426.964,179424.641,142343.578,107647.921,78054.343,58142.06,881912,255.3542,1.0195,0.0286,5
5,BR,Brazil,213993.437,212559.417,228980.4,223852.122,211049.527,204471.769,195713.635,174790.34,149003.223,120694.009,95113.265,8515767,25.1291,1.0067,0.0272,6
6,NG,Nigeria,211400.708,206139.589,401315.0,262977.337,200963.599,181137.448,158503.197,122283.85,95212.45,73423.633,55982.144,923768,228.8461,1.0255,0.0268,7
7,BD,Bangladesh,166303.498,164689.383,192567.778,178993.869,163046.161,156256.276,147575.43,127657.854,103171.956,79639.491,64232.482,147570,1126.9465,1.0098,0.0211,8
8,RU,Russia,145912.025,145934.462,135824.481,143347.515,145872.256,144985.057,143479.274,146404.903,147531.561,138053.15,130148.653,17098242,8.5337,0.9998,0.0185,9
9,MX,Mexico,130262.216,128932.753,155150.818,140875.762,127575.529,121858.258,114092.963,98899.845,83943.132,67761.372,51493.565,1964375,66.3123,1.0103,0.0165,10


In [3]:
# Restrict the population data to the columns of interest
columns_of_interest = ['name', 'pop2019', 'Density', 'GrowthRate']
clean_population_df = population_df[columns_of_interest]

# Order the rows alphabetically by the 'name' column
sorted_population_df = clean_population_df.sort_values(by='name')

# Report how many rows (regions) the population data contains
print(f'The population csv lists {len(sorted_population_df)} regions')

# Preview the first five rows of the sorted frame
sorted_population_df.head()

The population csv lists 232 regions


Unnamed: 0,name,pop2019,Density,GrowthRate
36,Afghanistan,38041.754,61.0757,1.0233
139,Albania,2880.917,99.9351,0.9983
33,Algeria,43053.054,18.7328,1.0175
208,American Samoa,55.312,276.8844,0.9984
201,Andorra,77.142,165.2885,1.0012


### 1.3 Loading the Development Index Data

https://worldpopulationreview.com/country-rankings/developed-countries

Somewhat strangely, this CSV pairs development index figures from 2019 with population data from 2021. This is fine, as we will only be using the development index numbers.

In [4]:
# Read the development-index CSV into its own DataFrame
developing_status = pd.read_csv('data/development_index.csv')

# Report how many rows (countries) this dataset contains
print(f'The development_status dataframe lists {len(developing_status)} countries')

# Preview the first ten rows (rich display as the cell's final expression)
developing_status.head(n=10)

The development_status dataframe lists 186 countries


Unnamed: 0,country,hdi2019,pop2021
0,Norway,0.957,5465.63
1,Ireland,0.955,4982.907
2,Switzerland,0.955,8715.494
3,Iceland,0.949,343.353
4,Hong Kong,0.949,7552.81
5,Germany,0.947,83900.473
6,Sweden,0.945,10160.169
7,Netherlands,0.944,17173.099
8,Australia,0.944,25788.215
9,Denmark,0.94,5813.298


In [5]:
# Alphabetically sorted list of the names in the population dataframe
pop_countries = sorted(sorted_population_df['name'])
print(pop_countries)

# Alphabetically sorted list of the names in the development dataframe,
# printed after a blank separator line
dev_countries = sorted(developing_status['country'])
print('\n', dev_countries)

['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Cook Islands', 'Costa Rica', 'Croatia', 'Cuba', 'Curacao', 'Cyprus', 'Czech Republic', 'DR Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Falkland Islands', 'Faroe Islands', 'Fiji', 'Finland', 'France', 'French Guiana', 'French Polynesia', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Gibraltar', 'Gree

In [6]:
# Checking which regions are missing from the population data:
# collect every name in the development list that has no match in the population list.
# A set gives constant-time membership checks instead of rescanning the list per name.
pop_country_lookup = set(pop_countries)
missing_from_population = [
    country for country in dev_countries if country not in pop_country_lookup
]

# Print each unmatched name on its own line, in the order of dev_countries
for country in missing_from_population:
    print(country)

# Number of unmatched names
count = len(missing_from_population)

# Adjacent string literals keep the printed message on a single line, with no
# embedded newline or source indentation in the output
print(f'There are {count} countries/regions present in the development dataframe '
      'not present in the population dataframe')

There are 0 countries/regions present in the development dataframe not present in 
      the population dataframe


In [7]:
# Checking which regions are missing from the development data:
# collect every name in the population list that has no match in the development list.
# A set gives constant-time membership checks instead of rescanning the list per name.
dev_country_lookup = set(dev_countries)
missing_from_development = [
    country for country in pop_countries if country not in dev_country_lookup
]

# Print each unmatched name on its own line, in the order of pop_countries
for country in missing_from_development:
    print(country)

# Number of unmatched names
count = len(missing_from_development)

# Adjacent string literals keep the printed message on a single line, with no
# embedded newline or source indentation in the output
print(f'There are {count} countries/regions present in the population dataframe '
      'not present in the development dataframe')

American Samoa
Anguilla
Aruba
Bermuda
British Virgin Islands
Brunei
Cape Verde
Cayman Islands
Cook Islands
Curacao
Czech Republic
Falkland Islands
Faroe Islands
French Guiana
French Polynesia
Gibraltar
Greenland
Guadeloupe
Guam
Isle of Man
Macau
Martinique
Mayotte
Monaco
Montserrat
Nauru
New Caledonia
Niue
North Korea
Northern Mariana Islands
Puerto Rico
Reunion
Saint Barthelemy
Saint Martin
Saint Pierre and Miquelon
San Marino
Sint Maarten
Somalia
Taiwan
Tokelau
Turks and Caicos Islands
Tuvalu
United States Virgin Islands
Vatican City
Wallis and Futuna
Western Sahara
There are 46 countries/regions present in the population dataframe not present in 
      the development dataframe


In [47]:
# Build the main dataframe in one non-mutating chain:
#   1. inner-join the development data to the population data on the country name
#      (only names present in both frames are kept; sort=True orders rows by the join key),
#   2. keep only the columns we want,
#   3. give those columns presentation-friendly names.
selected_columns = ['country', 'cca2', 'pop2019', 'Density', 'GrowthRate', 'hdi2019']
display_names = {
    'country': 'Country',
    'cca2': 'Code',
    'pop2019': 'Population 2019',
    'GrowthRate': 'Growth Rate',
    'hdi2019': 'HDI 2019',
}

main_df = (
    developing_status
    .merge(population_df, how='inner', left_on='country', right_on='name', sort=True)
    .loc[:, selected_columns]
    .rename(columns=display_names)
)

main_df

Unnamed: 0,Country,Code,Population 2019,Density,Growth Rate,HDI 2019
0,Afghanistan,AF,38041.754,61.0757,1.0233,0.511
1,Albania,AL,2880.917,99.9351,0.9983,0.795
2,Algeria,DZ,43053.054,18.7328,1.0175,0.748
3,Andorra,AD,77.142,165.2885,1.0012,0.868
4,Angola,AO,31825.295,27.2187,1.0325,0.581
...,...,...,...,...,...,...
181,Venezuela,VE,28515.829,31.3221,1.0095,0.711
182,Vietnam,VN,96462.106,296.3927,1.0085,0.704
183,Yemen,YE,29161.922,57.7509,1.0223,0.470
184,Zambia,ZM,17861.030,25.1400,1.0292,0.584
