# Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#required libraries to get an overview of dataset with Python

In [2]:
def extract(filename, data_dir="/files/Macroeconometrics/Temperature/dat/"):
    """Parse a whitespace-delimited data file into a list of rows.

    The following lines are skipped:
      * comment lines (first character '#'),
      * blank / whitespace-only lines,
      * the dataset description line (first character 'S', e.g. "Switzerland ..."),
      * the column-header line (third character 'Y', e.g. "  Year  Age ...").

    Parameters
    ----------
    filename : str
        Name of the text file inside ``data_dir``.
    data_dir : str, optional
        Directory that contains the data files. Defaults to the location the
        notebook was originally written against, so existing calls are unchanged.

    Returns
    -------
    list of list
        One ``[year, age, female, male, total]`` entry per data line. ``year`` is
        converted to ``int``; the other four fields are kept as the strings read
        from the file (they are converted to float later in the notebook).

    Raises
    ------
    ValueError
        If the first field of a data line is not an integer.
    IndexError
        If a data line has fewer than five fields.
    """
    import os  # local import keeps the function self-contained

    path = os.path.join(data_dir, filename)

    table = []  # rows collected from the data lines
    # `with` guarantees the file is closed even if a malformed line raises below.
    with open(path, 'r') as text:
        for line in text:
            fields = line.split()  # whitespace-separated columns
            if not fields:  # blank or whitespace-only line
                continue
            if line[0] in ('#', 'S'):  # comment line or dataset description line
                continue
            # Slice instead of indexing so that a line shorter than three
            # characters cannot raise IndexError here.
            if line[2:3] == 'Y':  # column-name line ("  Year ...")
                continue
            table.append([int(fields[0]), fields[1], fields[2], fields[3], fields[4]])
    return table

# Mortality (absolute)

In [3]:
# Parse the raw deaths text file into a list of rows, ready to be wrapped in a pandas DataFrame.
x = extract(filename="Deaths_5x1.txt")

In [4]:
# Wrap the parsed rows in a DataFrame, naming the five columns as they appear in the file.
death_columns = ['Year', 'Age', 'Female', 'Male', 'Total']
death = pd.DataFrame(data=x, columns=death_columns)

In [5]:
# Suffix the count columns with "_death" so they stay distinguishable from the population columns.
death_column_names = {
    "Female": "Fem_death",
    "Male": "Male_death",
    "Total": "Total_death",
}
death = death.rename(columns=death_column_names)

In [6]:
# Display the deaths table to check the load and the column renaming.
death

Unnamed: 0,Year,Age,Fem_death,Male_death,Total_death
0,1876,0,7982.00,10059.00,18041.00
1,1876,1-4,2566.00,2562.00,5128.00
2,1876,5-9,787.00,839.00,1626.00
3,1876,10-14,560.00,477.00,1037.00
4,1876,15-19,754.00,731.00,1485.00
...,...,...,...,...,...
3523,2022,90-94,8305.00,4836.00,13141.00
3524,2022,95-99,4148.00,1602.00,5750.00
3525,2022,100-104,764.00,199.00,963.00
3526,2022,105-109,50.00,11.00,61.00


# Population

In [23]:
# Parse the raw population text file into a list of rows.
y = extract(filename="Population5.txt")

In [24]:
# Wrap the parsed rows in a DataFrame, naming the five columns as they appear in the file.
pop_columns = ['Year', 'Age', 'Female', 'Male', 'Total']
pop = pd.DataFrame(data=y, columns=pop_columns)

In [9]:
# The population data runs one year past the deaths data (2023 vs 2022), so keep
# only the years that are also covered by `death`. Filtering on Year, rather than
# dropping a fixed number of trailing rows in place, removes the magic number and
# makes the cell safe to re-run: running it twice no longer drops a second year.
pop = pop[pop['Year'] <= death['Year'].max()]

In [10]:
# Age and Year already exist in the deaths frame that this table is joined to,
# so they are removed here to avoid duplicate columns after the join.
pop = pop.drop(columns=["Age", "Year"])

In [11]:
# Suffix the count columns with "_pop" so they stay distinguishable from the death columns.
pop_column_names = {
    "Female": "Fem_pop",
    "Male": "Male_pop",
    "Total": "Total_pop",
}
pop = pop.rename(columns=pop_column_names)

In [12]:
# Display the population table to check the trimming and the column renaming.
pop

Unnamed: 0,Fem_pop,Male_pop,Total_pop
0,33781.00,34056.00,67837.00
1,129479.00,131343.00,260822.00
2,143808.00,141747.00,285555.00
3,137624.00,135164.00,272788.00
4,125369.00,121460.00,246829.00
...,...,...,...
3523,46441.25,21421.94,67863.19
3524,13034.00,4232.88,17266.88
3525,1544.50,357.02,1901.52
3526,71.73,14.50,86.23


# Mortality Rate

*You will notice that some column names start with a capital letter while others are entirely lower case. This is deliberate: it distinguishes string values from float values. Because the data was imported from a .txt file, most values were originally stored as strings, and the columns holding them have capitalised names. After converting those values to floats, I stored them in new columns with lower-case names so the two versions can be told apart.*

In [13]:
# pd.concat(axis=1) pairs rows purely by index label, and `pop` no longer carries
# Year/Age to verify the pairing. A differing row count or index would silently
# attach deaths to the wrong population (or pad with NaN), so fail loudly instead.
if not death.index.equals(pop.index):
    raise ValueError(
        "death and pop are not row-aligned "
        f"({len(death)} vs {len(pop)} rows); cannot join them side by side"
    )
df_merged = pd.concat([death, pop], ignore_index=False, sort=False, axis=1)

In [14]:
# Display the combined deaths/population frame.
df_merged

Unnamed: 0,Year,Age,Fem_death,Male_death,Total_death,Fem_pop,Male_pop,Total_pop
0,1876,0,7982.00,10059.00,18041.00,33781.00,34056.00,67837.00
1,1876,1-4,2566.00,2562.00,5128.00,129479.00,131343.00,260822.00
2,1876,5-9,787.00,839.00,1626.00,143808.00,141747.00,285555.00
3,1876,10-14,560.00,477.00,1037.00,137624.00,135164.00,272788.00
4,1876,15-19,754.00,731.00,1485.00,125369.00,121460.00,246829.00
...,...,...,...,...,...,...,...,...
3523,2022,90-94,8305.00,4836.00,13141.00,46441.25,21421.94,67863.19
3524,2022,95-99,4148.00,1602.00,5750.00,13034.00,4232.88,17266.88
3525,2022,100-104,764.00,199.00,963.00,1544.50,357.02,1901.52
3526,2022,105-109,50.00,11.00,61.00,71.73,14.50,86.23


In [15]:
# For every capitalised (string) column, add a float copy under the lower-case name.
# The tuple order fixes the order in which the new columns are appended.
for source_col in ('Total_pop', 'Fem_pop', 'Male_pop',
                   'Total_death', 'Fem_death', 'Male_death'):
    df_merged[source_col.lower()] = df_merged[source_col].astype(float)

In [16]:
# Display the frame again to check the new lower-case float columns.
df_merged

Unnamed: 0,Year,Age,Fem_death,Male_death,Total_death,Fem_pop,Male_pop,Total_pop,total_pop,fem_pop,male_pop,total_death,fem_death,male_death
0,1876,0,7982.00,10059.00,18041.00,33781.00,34056.00,67837.00,67837.00,33781.00,34056.00,18041.0,7982.0,10059.0
1,1876,1-4,2566.00,2562.00,5128.00,129479.00,131343.00,260822.00,260822.00,129479.00,131343.00,5128.0,2566.0,2562.0
2,1876,5-9,787.00,839.00,1626.00,143808.00,141747.00,285555.00,285555.00,143808.00,141747.00,1626.0,787.0,839.0
3,1876,10-14,560.00,477.00,1037.00,137624.00,135164.00,272788.00,272788.00,137624.00,135164.00,1037.0,560.0,477.0
4,1876,15-19,754.00,731.00,1485.00,125369.00,121460.00,246829.00,246829.00,125369.00,121460.00,1485.0,754.0,731.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3523,2022,90-94,8305.00,4836.00,13141.00,46441.25,21421.94,67863.19,67863.19,46441.25,21421.94,13141.0,8305.0,4836.0
3524,2022,95-99,4148.00,1602.00,5750.00,13034.00,4232.88,17266.88,17266.88,13034.00,4232.88,5750.0,4148.0,1602.0
3525,2022,100-104,764.00,199.00,963.00,1544.50,357.02,1901.52,1901.52,1544.50,357.02,963.0,764.0,199.0
3526,2022,105-109,50.00,11.00,61.00,71.73,14.50,86.23,86.23,71.73,14.50,61.0,50.0,11.0


In [26]:
# Mortality rate = deaths / population, computed for the total, female and male
# groups in that order and rounded to 3 decimal places.
for group in ('total', 'fem', 'male'):
    rate = df_merged[group + '_death'] / df_merged[group + '_pop']
    df_merged[group + '_mortality'] = rate.round(3)

In [18]:
# The capitalised columns still hold the original string values; the float copies
# replace them, so the string versions are removed.
string_columns = [
    'Fem_death', 'Male_death', 'Total_death',
    'Fem_pop', 'Male_pop', 'Total_pop',
]
df_merged = df_merged.drop(columns=string_columns)

In [20]:
# Create one dummy column per age group for use in a regression; the original Age
# column is replaced by the dummies. drop_first=True omits the first category so it
# serves as the baseline (presumably the '0' age group - confirm against the output).
# dtype=int pins the dummies to 0/1: pandas >= 2.0 otherwise returns booleans, which
# would be written to the CSV as True/False instead of the 0/1 values shown below.
df_merged = pd.get_dummies(df_merged, columns=['Age'], drop_first=True, dtype=int)

In [21]:
# Display the final frame, now including the age-group dummy columns.
df_merged

Unnamed: 0,Year,total_pop,fem_pop,male_pop,total_death,fem_death,male_death,total_mortality,fem_mortality,male_mortality,...,Age_50-54,Age_55-59,Age_60-64,Age_65-69,Age_70-74,Age_75-79,Age_80-84,Age_85-89,Age_90-94,Age_95-99
0,1876,67837.00,33781.00,34056.00,18041.0,7982.0,10059.0,0.266,0.236,0.295,...,0,0,0,0,0,0,0,0,0,0
1,1876,260822.00,129479.00,131343.00,5128.0,2566.0,2562.0,0.020,0.020,0.020,...,0,0,0,0,0,0,0,0,0,0
2,1876,285555.00,143808.00,141747.00,1626.0,787.0,839.0,0.006,0.005,0.006,...,0,0,0,0,0,0,0,0,0,0
3,1876,272788.00,137624.00,135164.00,1037.0,560.0,477.0,0.004,0.004,0.004,...,0,0,0,0,0,0,0,0,0,0
4,1876,246829.00,125369.00,121460.00,1485.0,754.0,731.0,0.006,0.006,0.006,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3523,2022,67863.19,46441.25,21421.94,13141.0,8305.0,4836.0,0.194,0.179,0.226,...,0,0,0,0,0,0,0,0,1,0
3524,2022,17266.88,13034.00,4232.88,5750.0,4148.0,1602.0,0.333,0.318,0.378,...,0,0,0,0,0,0,0,0,0,1
3525,2022,1901.52,1544.50,357.02,963.0,764.0,199.0,0.506,0.495,0.557,...,0,0,0,0,0,0,0,0,0,0
3526,2022,86.23,71.73,14.50,61.0,50.0,11.0,0.707,0.697,0.759,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Write the final table to a comma-separated UTF-8 file, without the index column.
df_merged.to_csv(
    'Mortality.csv',
    sep=',',
    index=False,
    encoding='utf-8',
)