# Life Expectancy by Sex Team 1

By: Megan Alicea, Chrisoula Manguravdos, Auzma Minhas, Ashley Rodriguez, Gyalbu Sherpa

## Import Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import csv

## Load U.S. Life Expectancy Dataset

In [None]:
# Load the U.S. Life Expectancy dataset.
# The path is relative to the working directory, like the global dataset and
# the cleaned CSV outputs elsewhere in this notebook, instead of a hard-coded
# absolute "/content" directory.
USLife_by_Sex_df = pd.read_csv(
    "U.S._State_Life_Expectancy_by_Sex__2020.csv",
    encoding="ISO-8859-1",
)

Previewing Dataset

In [None]:
# Preview the first rows of the raw U.S. life-expectancy frame to check that
# the columns (State, Sex, LE, SE, Quartile) were parsed as expected.
USLife_by_Sex_df.head()

Unnamed: 0,State,Sex,LE,SE,Quartile
0,Alabama,Total,73.2,0.067,71.9 - 75.3
1,Alaska,Total,76.6,0.176,75.4 - 76.8
2,Arizona,Total,76.3,0.055,75.4 - 76.8
3,Arkansas,Total,73.8,0.086,71.9 - 75.3
4,California,Total,79.0,0.022,78.1 - 80.7


In [None]:
# Report (rows, columns) of the raw U.S. frame before any cleaning.
USLife_by_Sex_df.shape

(156, 5)

In [None]:
# List each column's dtype; LE and SE are expected to be numeric (float64),
# the remaining columns plain strings (object).
USLife_by_Sex_df.dtypes

State        object
Sex          object
LE          float64
SE          float64
Quartile     object
dtype: object

## Cleaning Dataset

In [None]:
# Remove the SE and Quartile columns, keeping State, Sex and LE.
# The columns are dropped by name rather than by positional slice so the cell
# does not depend on column order; errors="ignore" keeps a re-run (when the
# columns are already gone) a harmless no-op.
USLife_by_Sex_df = USLife_by_Sex_df.drop(columns=["SE", "Quartile"], errors="ignore")
USLife_by_Sex_df.head()

Unnamed: 0,State,Sex,LE
0,Alabama,Total,73.2
1,Alaska,Total,76.6
2,Arizona,Total,76.3
3,Arkansas,Total,73.8
4,California,Total,79.0


In [None]:
# Remove the aggregate rows whose Sex is 'Total', keeping only Male and Female.
# Filtering on the value (instead of dropping the first 52 rows by position,
# in place) makes the cell idempotent: re-running it can no longer delete the
# Male rows that would then occupy the first 52 positions.
USLife_by_Sex_df = USLife_by_Sex_df[USLife_by_Sex_df["Sex"] != "Total"]
print(USLife_by_Sex_df)

             State     Sex    LE
52         Alabama    Male  70.1
53          Alaska    Male  74.3
54         Arizona    Male  73.4
55        Arkansas    Male  71.1
56      California    Male  76.2
..             ...     ...   ...
151       Virginia  Female  80.1
152     Washington  Female  81.6
153  West Virginia  Female  76.1
154      Wisconsin  Female  80.3
155        Wyoming  Female  78.9

[104 rows x 3 columns]


Checking the dataset's new shape

In [None]:
# Row/column count of the U.S. frame after the column and row removals above.
USLife_by_Sex_df.shape

(104, 3)

Saving the cleaned Dataset

In [None]:
# Write the cleaned U.S. frame to CSV in the working directory, leaving the
# DataFrame index out of the file.
file_path = 'cleaned_U.S._State_Life_Expectancy_by_Sex__2020.csv'
USLife_by_Sex_df.to_csv(path_or_buf=file_path, index=False)

## Loading the Global Life Expectancy Dataset

In [None]:
# Read the global life-expectancy-by-sex CSV, decoding it as ISO-8859-1.
GlobalLife_by_Sex_df = pd.read_csv(
    "life-expectation-at-birth-by-sex.csv",
    encoding="ISO-8859-1",
)

Previewing the Dataset

In [None]:
# Preview the first rows of the raw global frame: Entity, Code, Year and the
# female/male life-expectancy columns.
GlobalLife_by_Sex_df.head()

Unnamed: 0,Entity,Code,Year,Life expectancy (period) - Sex: female - Age: 0,Life expectancy (period) - Sex: male - Age: 0
0,Afghanistan,AFG,1950,28.3905,27.1286
1,Afghanistan,AFG,1951,28.6345,27.3639
2,Afghanistan,AFG,1952,29.126,27.8377
3,Afghanistan,AFG,1953,29.6212,28.3146
4,Afghanistan,AFG,1954,29.9004,28.6239


In [None]:
# Report (rows, columns) of the raw global frame before any cleaning.
GlobalLife_by_Sex_df.shape

(19922, 5)

In [None]:
# List each column's dtype; Year is expected to be an integer and both
# life-expectancy columns floats.
GlobalLife_by_Sex_df.dtypes

Entity                                              object
Code                                                object
Year                                                 int64
Life expectancy (period) - Sex: female - Age: 0    float64
Life expectancy (period) - Sex: male - Age: 0      float64
dtype: object

## Cleaning the Dataset

In [None]:
# Keep only the rows from 1940 onward, i.e. discard every row with an earlier Year.
from_1940 = GlobalLife_by_Sex_df['Year'] >= 1940
GlobalLife_by_Sex_df = GlobalLife_by_Sex_df[from_1940]

Checking Shape

In [None]:
# Row/column count after the pre-1940 rows were removed.
GlobalLife_by_Sex_df.shape

(18633, 5)

In [None]:
# Remove the Code column.
# It is dropped by name: dropping "whatever column is at position 1" would
# remove Year instead if this cell were executed a second time.
# errors="ignore" turns such a re-run into a no-op.
GlobalLife_by_Sex_df = GlobalLife_by_Sex_df.drop(columns=["Code"], errors="ignore")
GlobalLife_by_Sex_df.head()

Unnamed: 0,Entity,Year,Life expectancy (period) - Sex: female - Age: 0,Life expectancy (period) - Sex: male - Age: 0
0,Afghanistan,1950,28.3905,27.1286
1,Afghanistan,1951,28.6345,27.3639
2,Afghanistan,1952,29.126,27.8377
3,Afghanistan,1953,29.6212,28.3146
4,Afghanistan,1954,29.9004,28.6239


In [None]:
# Replace the verbose source headers with shorter titles; 'Entity' becomes
# 'Country' for this country-level dataset.
new_columns = {
    'Entity': 'Country',
    'Life expectancy (period) - Sex: female - Age: 0': 'Life Expectancy of Females',
    'Life expectancy (period) - Sex: male - Age: 0': 'Life Expectancy for Males',
}
GlobalLife_by_Sex_df = GlobalLife_by_Sex_df.rename(columns=new_columns)
GlobalLife_by_Sex_df.head()

Unnamed: 0,Country,Year,Life Expectancy of Females,Life Expectancy for Males
0,Afghanistan,1950,28.3905,27.1286
1,Afghanistan,1951,28.6345,27.3639
2,Afghanistan,1952,29.126,27.8377
3,Afghanistan,1953,29.6212,28.3146
4,Afghanistan,1954,29.9004,28.6239


In [None]:
# Removing continents
# One membership test replaces five separate drop-by-index passes over the
# frame; the set of names removed is unchanged.
# NOTE(review): the shapes recorded before and after this cell differ by 288
# rows - confirm that every name below actually occurs in the Country column.
continent_names = ['Africa', 'Asia', 'Northern America', 'South America', 'Europe']
is_continent = GlobalLife_by_Sex_df['Country'].isin(continent_names)
GlobalLife_by_Sex_df = GlobalLife_by_Sex_df[~is_continent]

Checking shape

In [None]:
# Row/column count after the continent-level rows were removed.
GlobalLife_by_Sex_df.shape

(18345, 4)

Saving Cleaned Dataset

In [None]:
# Write the cleaned country-level frame to CSV in the working directory,
# leaving the DataFrame index out of the file.
file_path = 'cleaned_Country_Life_by_Sex.csv'
GlobalLife_by_Sex_df.to_csv(path_or_buf=file_path, index=False)

## Load Dataset

### Creating new dataset from the Global Dataset

In [None]:
# Reload the raw global CSV from scratch. This rebinds GlobalLife_by_Sex_df,
# replacing the cleaned country-level frame built earlier in the notebook.
GlobalLife_by_Sex_df = pd.read_csv(
    "life-expectation-at-birth-by-sex.csv",
    encoding="ISO-8859-1",
)

In [None]:
# Report (rows, columns) of the freshly reloaded, uncleaned global frame.
GlobalLife_by_Sex_df.shape

(19922, 5)

## Cleaning Data

In [None]:
# Remove the Code column.
# It is dropped by name: dropping "whatever column is at position 1" would
# remove Year instead if this cell were executed a second time.
# errors="ignore" turns such a re-run into a no-op.
GlobalLife_by_Sex_df = GlobalLife_by_Sex_df.drop(columns=["Code"], errors="ignore")
GlobalLife_by_Sex_df.head()

Unnamed: 0,Entity,Year,Life expectancy (period) - Sex: female - Age: 0,Life expectancy (period) - Sex: male - Age: 0
0,Afghanistan,1950,28.3905,27.1286
1,Afghanistan,1951,28.6345,27.3639
2,Afghanistan,1952,29.126,27.8377
3,Afghanistan,1953,29.6212,28.3146
4,Afghanistan,1954,29.9004,28.6239


In [None]:
# Replace the verbose source headers with shorter titles; 'Entity' becomes
# 'Continent' for this continent-level dataset.
new_columns = {
    'Entity': 'Continent',
    'Life expectancy (period) - Sex: female - Age: 0': 'Life Expectancy of Females',
    'Life expectancy (period) - Sex: male - Age: 0': 'Life Expectancy for Males',
}
GlobalLife_by_Sex_df = GlobalLife_by_Sex_df.rename(columns=new_columns)
GlobalLife_by_Sex_df.head()

Unnamed: 0,Continent,Year,Life Expectancy of Females,Life Expectancy for Males
0,Afghanistan,1950,28.3905,27.1286
1,Afghanistan,1951,28.6345,27.3639
2,Afghanistan,1952,29.126,27.8377
3,Afghanistan,1953,29.6212,28.3146
4,Afghanistan,1954,29.9004,28.6239


Removing Countries from the Dataset and keeping the Continents.

In [None]:
# Keep only the rows whose Continent value matches one of the listed regions.
# NOTE(review): 'Australia' is also a country name - confirm the source really
# carries a continent-level entry under that label.
continent_filter = "Continent == 'Africa' | Continent == 'Asia' | Continent == 'Northern America' | Continent == 'South America' | Continent == 'Europe' | Continent == 'Australia' "
GlobalLife_by_Sex_df = GlobalLife_by_Sex_df.query(continent_filter)

Checking the new shape

In [None]:
# Row/column count after keeping only the continent-level rows.
GlobalLife_by_Sex_df.shape

(389, 4)

Saving Cleaned Dataset

In [None]:
# Write the continent-level frame to CSV in the working directory, leaving
# the DataFrame index out of the file.
file_path = 'cleaned_Continents_Life_by_Sex.csv'
GlobalLife_by_Sex_df.to_csv(path_or_buf=file_path, index=False)

## Altair Visuals

In [None]:
import altair as alt
from vega_datasets import data

In [None]:
# Load the cleaned continents file from the same relative location this
# notebook wrote it to, rather than from a hard-coded absolute "/content" path.
continents = pd.read_csv("cleaned_Continents_Life_by_Sex.csv", encoding="ISO-8859-1")

In [None]:
# Female life expectancy over time, one colour/shape per continent.
# Year is placed on the x-axis and neither axis is forced to include zero, so
# the points are not squeezed into a corner of the plot; format='d' renders
# years as plain integers (1950, not 1,950).
alt.Chart(continents).mark_point().encode(
    x=alt.X('Year:Q', scale=alt.Scale(zero=False), axis=alt.Axis(format='d')),
    y=alt.Y('Life Expectancy of Females:Q', scale=alt.Scale(zero=False)),
    shape='Continent',
    color='Continent'
).properties(
    title='Life Expectancy of Females by Continent'
).interactive()

In [None]:
# Male life expectancy over time, one colour/shape per continent.
# Year is placed on the x-axis and neither axis is forced to include zero, so
# the points are not squeezed into a corner of the plot; format='d' renders
# years as plain integers (1950, not 1,950).
alt.Chart(continents).mark_point().encode(
    x=alt.X('Year:Q', scale=alt.Scale(zero=False), axis=alt.Axis(format='d')),
    y=alt.Y('Life Expectancy for Males:Q', scale=alt.Scale(zero=False)),
    shape='Continent',
    color='Continent'
).properties(
    title='Life Expectancy for Males by Continent'
).interactive()