# Life expectancy and urban population

In [None]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

The first part is the same as the example where I explore the data. 

In [None]:
# Read the full indicator table from the local CSV export
data = pd.read_csv(filepath_or_buffer='./Indikatorer/Indicators.csv')
# (rows, columns) of the raw table
data.shape

In [None]:
# Peek at the last five rows of the raw table
data.tail(n=5)

I have to find the right indicators, and I'll make a list of countries

In [None]:
# Distinct country names and indicator names, in order of first appearance
countries = data['CountryName'].drop_duplicates().tolist()
indicators = data['IndicatorName'].drop_duplicates().tolist()


This is a very long list, and I prefer to look at the indicators in an Excel file

In [None]:
# Export the indicator names to a spreadsheet so they can be browsed outside the notebook.
# The writer is used as a context manager, which saves and closes the workbook on exit;
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in pandas 2.0.
df = pd.DataFrame(indicators)
with pd.ExcelWriter('indicators.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='welcome', index=False)

Life expectancy at birth and Rural population seem to fit the bill. Using the setup from the course, I'll be looking into the figures for the USA

In [None]:
# Boolean row masks used to slice the indicator table.
# regex=False makes each search a literal substring match; with the default
# regex=True the '.' characters in the indicator code would act as wildcards
# and could match unrelated codes.
Life = data['IndicatorName'].str.contains('Life expectancy at birth, total', regex=False)
Rural = data['IndicatorCode'].str.contains('SP.RUR.TOTL.ZS', regex=False)
country = data['CountryCode'].str.contains('USA', regex=False)

In [None]:
# Rows selected by the masks: `stage` holds the rural-population indicator,
# `stage2` the life-expectancy indicator
stage = data.loc[Rural & country]
stage2 = data.loc[Life & country]
stage.head()

In [None]:
# (rows, columns) of the rural-population slice
stage.shape

In [None]:
# (rows, columns) of the life-expectancy slice
stage2.shape

In [None]:

# Bar chart of the rural share of the population per year
years = stage['Year'].values
countryside = stage['Value'].values

fig, axis = plt.subplots()
axis.bar(years, countryside)

# Title and axis labels; the y-label is the indicator's own name from the data
axis.set_title('Rural population, % of total population, USA', fontsize=12)
axis.set_xlabel('Year', fontsize=10)
axis.set_ylabel(stage['IndicatorName'].iloc[0], fontsize=10)

plt.show()

In [None]:
# Bar chart of life expectancy per year
years = stage2['Year'].values
longlife = stage2['Value'].values

fig, axis = plt.subplots()
axis.bar(years, longlife)

# Title and axis labels; the y-label is the indicator's own name from the data
axis.set_title('Life expectancy at birth, USA', fontsize=12)
axis.set_xlabel('Year', fontsize=10)
axis.set_ylabel(stage2['IndicatorName'].iloc[0], fontsize=10)

plt.show()

So looking at the bar charts for the USA, we immediately see that the two series develop in opposite directions, so we should expect a negative correlation. 

I'll set up a scatterplot of life expectancy and rate of rural living

In [None]:
# Compare the year coverage of the two series before pairing them up
for label, frame in (("rural Min Year = ", stage), ("life Min Year = ", stage2)):
    print(label, frame['Year'].min(), "max: ", frame['Year'].max())


In [None]:
# Keep only the rural rows whose year also appears in the life-expectancy
# series, so the two series cover the same time frame (the original cut-off
# was a hard-coded "before 2014"). Deriving the cut-off from the data avoids a
# magic year that would silently go stale if the dataset is updated.
stage_trunc = stage[stage['Year'].isin(stage2['Year'])]

# Both lengths should now match
print(len(stage_trunc))
print(len(stage2))

Now I'm ready to draw a scatterplot:

In [None]:
# Scatter plot of life expectancy against rural share, with both axes fixed to 0-100
fig, axis = plt.subplots()

X = stage_trunc['Value']
Y = stage2['Value']
axis.scatter(X, Y)

# Horizontal grid lines, title and axis labels (labels come from the indicator names)
axis.yaxis.grid(True)
axis.set_title('Life expectancy vs share of rural population', fontsize=15)
axis.set_xlabel(stage_trunc['IndicatorName'].iloc[0], fontsize=10)
axis.set_ylabel(stage2['IndicatorName'].iloc[0], fontsize=10)

# Fixed axis ranges
axis.set_ylim(0, 100)
axis.set_xlim(0, 100)

plt.show()

I think it's easier just to use the automatically generated axes

In [None]:
# Same scatter plot, but with the axis ranges left to matplotlib's autoscaling
fig, axis = plt.subplots()

XUSA = stage_trunc['Value']
YUSA = stage2['Value']
axis.scatter(XUSA, YUSA)

# Horizontal grid lines, title and axis labels (labels come from the indicator names)
axis.yaxis.grid(True)
axis.set_title('Life expectancy vs share of rural population', fontsize=15)
axis.set_xlabel(stage_trunc['IndicatorName'].iloc[0], fontsize=10)
axis.set_ylabel(stage2['IndicatorName'].iloc[0], fontsize=10)

plt.show()

In [None]:
# 2x2 correlation matrix between the rural share and life expectancy
np.corrcoef(x=stage_trunc['Value'], y=stage2['Value'])

The data from USA shows a very strong negative correlation between life expectancy and share of rural population, and does not support the thesis. 
Of course, lack of correlation is not evidence. Let's try with another country. I used a random country finder from the dataset, and Mexico turned up. 

In [None]:
# Draw one country name at random from the dataset.
# No seed is set in this cell, so the drawn country can differ between runs.
countryFilter = random.sample(data['CountryName'].unique().tolist(), k=1)
[countryFilter1] = countryFilter
countryFilter1

It's not really practical to work with randomly generated datasets based on random countries when you are making a presentation, so it's easier to fix the country. That way you can reproduce the graphs. 

In order to replace the scatterplots with the randomly generated countries, you have to replace the countryMexico variable with the variable countryRandom given by the code below:

In [None]:
# Row mask for the fixed second country
countryMexico = data['CountryName'].eq('Mexico')

# Alternative mask for the randomly drawn country:
# countryRandom=data['CountryName']==countryFilter1
# countryRandom

In [None]:
# Rows for the second country: rural share and life expectancy
country2stage = data.loc[Rural & countryMexico]
country2stage2 = data.loc[Life & countryMexico]

# To build the graphs for the random country instead, use these two lines
# and re-run the following cells:

# country2stage = data[Rural & countryRandom]
# country2stage2= data[Life & countryRandom]

In [None]:
# Compare the year coverage of the two series before pairing them up
for label, frame in (("rural Min Year = ", country2stage), ("life Min Year = ", country2stage2)):
    print(label, frame['Year'].min(), "max: ", frame['Year'].max())


In [None]:
# Keep only the years for which BOTH indicators have a row, so the two series
# pair up one-to-one in the scatter plot and the correlation. The original
# hard-coded cut-off ("before 2014") only fits a country whose rural series is
# exactly one year longer; when the random country is swapped in, the year
# ranges may differ and the two series would end up with unequal lengths.
# Both masks are computed before either frame is reassigned, and re-running
# the cell leaves the frames unchanged.
rural_has_life = country2stage['Year'].isin(country2stage2['Year'])
life_has_rural = country2stage2['Year'].isin(country2stage['Year'])
country2stage = country2stage[rural_has_life]
country2stage2 = country2stage2[life_has_rural]

# Both lengths should now match
print(len(country2stage))
print(len(country2stage2))

In [None]:
# Scatter plot for the second country (Mexico, unless the random-country
# lines were swapped in above)

fig, axis = plt.subplots()

# Take the country name from the rows actually being plotted. The title used
# to read countryFilter1 (the random draw), which labels the plot with the
# wrong country whenever the draw differs from the country in country2stage.
plotted_country = country2stage['CountryName'].iloc[0]

# Grid lines, title, x-label, y-label
axis.yaxis.grid(True)
axis.set_title('Life expectancy vs share of rural population for '+plotted_country,fontsize=10)
axis.set_xlabel(stage_trunc['IndicatorName'].iloc[0],fontsize=10)
axis.set_ylabel(stage2['IndicatorName'].iloc[0],fontsize=10)

X = country2stage['Value']
Y = country2stage2['Value']

# Axis ranges are left to matplotlib's autoscaling

axis.scatter(X, Y, color="orange")
plt.show()

In [None]:
# 2x2 correlation matrix between rural share and life expectancy for the second country
np.corrcoef(x=country2stage['Value'], y=country2stage2['Value'])

For Mexico the data shows an almost linear negative relationship between the two indicators. In fact the correlation is almost -1, which would be perfectly linear. 

Now to plot the two figures together: 

In [None]:
# Both countries in one scatter plot.
fig, axis = plt.subplots()

# Read the values straight from the per-country frames instead of the generic
# X / Y (and XUSA / YUSA) globals: X and Y are reassigned by other plotting
# cells, so re-running this cell out of order would otherwise plot unrelated
# data under the 'Mexico' label.
axis.scatter(stage_trunc['Value'], stage2['Value'], color='blue', label='USA')
axis.scatter(country2stage['Value'], country2stage2['Value'], color="orange", label='Mexico')
axis.legend(loc='upper right')

axis.yaxis.grid(True)
axis.set_title('Life expectancy vs share of rural population for USA and Mexico',fontsize=10)
axis.set_xlabel(stage_trunc['IndicatorName'].iloc[0],fontsize=10)
axis.set_ylabel(stage2['IndicatorName'].iloc[0],fontsize=10)
plt.show()

Now I also want to test my question across countries in a given year

First I'll check for missing data, and choose only the countries that have data for both indicators in 2011

In [None]:
# Cross-country selection for a single year
indicatorRural = 'SP.RUR.TOTL.ZS'
indicatorLife = 'SP.DYN.LE00.IN'
year = 2011

# Row masks: one per indicator code, one for the chosen year
mask1 = data['IndicatorCode'].eq(indicatorRural)
mask2 = data['IndicatorCode'].eq(indicatorLife)
mask3 = data['Year'].eq(year)

# Combine the masks to get one frame per indicator for that year
Rural2011 = data.loc[mask1 & mask3]
Life2011 = data.loc[mask2 & mask3]
Rural2011

In [None]:
# Number of observations per indicator, one per line
print(len(Rural2011), len(Life2011), sep='\n')
# print(len(RuralLife))

There's not the same number of observations so I would need to trim the observations to countries where we have observations both of rural populace and life expectancy. I have to filter the countries

In [None]:
# Countries that have an observation for both indicators
countries1 = Rural2011['CountryName'].drop_duplicates().tolist()
countries2 = Life2011['CountryName'].drop_duplicates().tolist()
countryfilter = [*set(countries1).intersection(countries2)]
len(countryfilter)
# countryfilter

In [None]:
# Restrict both frames to the countries present in both; the two printed
# shapes should then be identical.
rural_in_both = Rural2011['CountryName'].isin(countryfilter)
life_in_both = Life2011['CountryName'].isin(countryfilter)

Rural2011filterd = Rural2011[rural_in_both]
Life2011filterd = Life2011[life_in_both]
print(Rural2011filterd.shape)
print(Life2011filterd.shape)

In [None]:
# Join the two frames on country name; overlapping columns get the default
# _x (rural) and _y (life expectancy) suffixes.
RuralLife = pd.merge(Rural2011filterd, Life2011filterd, how='inner', on='CountryName')
RuralLife.head()

In [None]:
# The merged frame carries a lot of unnecessary information, so drop the
# redundant code and year columns.
# Only columns are dropped: the original call also passed index=1, which
# silently removed the row labelled 1 (one country's observation) from the data.
# Assigning the result instead of inplace=True, together with errors='ignore',
# lets this cell be re-run without raising KeyError for already-dropped columns.
RuralLife = RuralLife.drop(
    columns=['CountryCode_x', 'IndicatorCode_x', 'IndicatorCode_y', 'CountryCode_y', 'Year_y'],
    errors='ignore',
)
RuralLife.head()

In [None]:
# Give the two value columns readable names
value_column_names = {"Value_x": "ValueRural", "Value_y": "ValueLife"}
RuralLife.rename(columns=value_column_names, inplace=True)

In [None]:
# Pearson correlation between the two indicators across countries.
# Only the two value columns are passed in: the merged frame still contains
# text columns (country and indicator names), and DataFrame.corr raises on
# non-numeric columns in pandas >= 2.0 instead of silently skipping them.
RuralLife[['ValueRural', 'ValueLife']].corr(method='pearson')

-0.65 is also a clear negative correlation. 

Time to look at a scatterplot of the values:

In [None]:
# Scatter plot of the merged 2011 observations, one point per country
fig, axis = plt.subplots()

X = RuralLife['ValueRural']
Y = RuralLife['ValueLife']
axis.scatter(X, Y)

# Horizontal grid lines, title and axis labels (labels come from the indicator names)
axis.yaxis.grid(True)
axis.set_title('Life expectancy and rate of rural population 2011', fontsize=12)
axis.set_xlabel(stage_trunc['IndicatorName'].iloc[0], fontsize=10)
axis.set_ylabel(stage2['IndicatorName'].iloc[0], fontsize=10)

# Fixed y-axis range
axis.set_ylim(30, 90)

plt.show()


The scatterplot, as expected, shows a negative correlation, where the countries with higher rates of rural populace tend to have lower life expectancy at birth. 

### I see now that I could have done this without merging the datasets, but I realised it a bit late

In [None]:
# The same 2011 scatter plot, drawn straight from the two filtered frames
# (no merge needed).
fig, axis = plt.subplots()

# Grid lines, title, x-label, y-label. The labels are read from the frames
# being plotted, so this cell does not depend on the per-country frames.
axis.yaxis.grid(True)
axis.set_title('Life expectancy vs share of rural population',fontsize=15)
axis.set_xlabel(Rural2011filterd['IndicatorName'].iloc[0],fontsize=10)
axis.set_ylabel(Life2011filterd['IndicatorName'].iloc[0],fontsize=10)

# scatter() pairs the two series by position, not by country, so both frames
# are put into the same country order first. Without this the pairing relies
# on the two frames happening to list the countries in the same order.
X2011 = Rural2011filterd.sort_values('CountryName')['Value']
Y2011 = Life2011filterd.sort_values('CountryName')['Value']

# Axis ranges are left to matplotlib's autoscaling

axis.scatter(X2011, Y2011)
plt.show()