In [1]:
# Clean the 'life_expectancy.csv' and export it for later use in code.
#
# Note: the filepath will be: '../data/clean_life_expectancy.csv'

In [2]:
# Dependencies.
import pandas as pd


In [3]:
# Declare a path to the file.
filepath = 'life_expectancy.csv'

# Load file into a DataFrame.
df = pd.read_csv(filepath)

# Preview the DataFrame.
display(df.shape)
display(df.info())
display(df.head())

(1365, 19)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1365 entries, 0 to 1364
Data columns (total 19 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country                           1365 non-null   object 
 1   Year                              1365 non-null   int64  
 2   Life expectancy                   1365 non-null   float64
 3   Polio incidence                   1365 non-null   int64  
 4   Tuberculosis deaths               1365 non-null   float64
 5   Tuberculosis incidence            1365 non-null   float64
 6   Malaria deaths                    1365 non-null   int64  
 7   Malaria incidence                 1365 non-null   float64
 8   Alcohol deaths                    1365 non-null   float64
 9   Smoking deaths                    1365 non-null   float64
 10  Obesity deaths                    1365 non-null   float64
 11  Cardiovascular disease incidence  1365 non-null   float64
 12  Cardio

None

Unnamed: 0,Country,Year,Life expectancy,Polio incidence,Tuberculosis deaths,Tuberculosis incidence,Malaria deaths,Malaria incidence,Alcohol deaths,Smoking deaths,Obesity deaths,Cardiovascular disease incidence,Cardiovascular disease deaths,Deaths by suicide,Mean years of schooling,Population,GDP,Gov health expenditure,Undernourishment
0,Afghanistan,2002,56.4538,10,57.0,189.0,2883,79.38579,0.88,4.484723,109.050644,1.181678,761.305,7.85,1.522516,21000258,1280.4631,1.21,44.1
1,Afghanistan,2003,57.3445,8,58.0,189.0,2188,68.09156,0.86,4.593383,110.69831,1.178583,725.36395,7.72,1.579871,22645136,1292.3335,5.46,39.0
2,Afghanistan,2004,57.9436,4,52.0,189.0,773,37.616467,0.85,4.665421,113.26059,1.174824,714.998,7.77,1.637226,23553554,1260.0605,3.6,36.3
3,Afghanistan,2005,58.3608,9,47.0,189.0,545,27.079071,0.83,4.708809,115.482414,1.171161,689.6265,7.64,1.694581,24411196,1352.3207,3.37,34.5
4,Afghanistan,2006,58.6844,31,43.0,189.0,414,20.526075,0.81,4.746228,119.30124,1.16376,682.08246,7.56,1.776703,25442946,1366.9932,2.72,31.9


In [4]:
# THOUGHTS:
#
# Column names are bad. We should rename those.
#
# No null values: nothing needs to be dropped or changed with regards to that.
#
# Data types are looking good for all columns.
#
# We are going to drop the columns that we do not plan on using:
# Polio incidence, Tuberculosis deaths, Tuberculosis incidence, Malaria deaths, Malaria incidence, 
# Alcohol deaths, Cardiovascular disease incidence, Cardiovascular disease deaths, Undernourishment
#

# Grab the columns that we want and place them in a new DataFrame.
clean_df = df[['Country', 'Year', 'Life expectancy', 'Smoking deaths',
       'Obesity deaths', 'Deaths by suicide',
       'Mean years of schooling', 'Population', 'GDP',
       'Gov health expenditure']].copy()

# Rename the columns to be more usable later in the project.
clean_df.columns = [x.lower().replace(' ', '_') for x in clean_df.columns]

# Preview DataFrame.
display(clean_df.shape)
display(clean_df.info())
display(clean_df.head())

(1365, 10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1365 entries, 0 to 1364
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   country                  1365 non-null   object 
 1   year                     1365 non-null   int64  
 2   life_expectancy          1365 non-null   float64
 3   smoking_deaths           1365 non-null   float64
 4   obesity_deaths           1365 non-null   float64
 5   deaths_by_suicide        1365 non-null   float64
 6   mean_years_of_schooling  1365 non-null   float64
 7   population               1365 non-null   int64  
 8   gdp                      1365 non-null   float64
 9   gov_health_expenditure   1365 non-null   float64
dtypes: float64(7), int64(2), object(1)
memory usage: 106.8+ KB


None

Unnamed: 0,country,year,life_expectancy,smoking_deaths,obesity_deaths,deaths_by_suicide,mean_years_of_schooling,population,gdp,gov_health_expenditure
0,Afghanistan,2002,56.4538,4.484723,109.050644,7.85,1.522516,21000258,1280.4631,1.21
1,Afghanistan,2003,57.3445,4.593383,110.69831,7.72,1.579871,22645136,1292.3335,5.46
2,Afghanistan,2004,57.9436,4.665421,113.26059,7.77,1.637226,23553554,1260.0605,3.6
3,Afghanistan,2005,58.3608,4.708809,115.482414,7.64,1.694581,24411196,1352.3207,3.37
4,Afghanistan,2006,58.6844,4.746228,119.30124,7.56,1.776703,25442946,1366.9932,2.72


In [5]:
# Export the clean DataFrame as a CSV to be read in later.
clean_df.to_csv('clean_life_expectancy.csv')