In [47]:
import pandas as pd

In [48]:
# Read the raw death-count table from the CSV in the working directory,
# then echo the frame so the notebook shows a preview of it.
deaths_csv = 'FRATNPdeath.csv'
df = pd.read_csv(deaths_csv)
df

Unnamed: 0,PopName,Area,Year,YearReg,YearInterval,Sex,Age,AgeInterval,Lexis,RefCode,Access,Deaths,NoteCode1,NoteCode2,NoteCode3,LDB
0,FRATNP,20,1816,1816,1,f,TOT,.,.,52.0,O,355467,.,.,.,1
1,FRATNP,20,1816,1816,1,f,UNK,.,.,52.0,O,2623,.,.,.,1
2,FRATNP,20,1816,1816,1,f,0,1,RR,52.0,O,75769,.,.,.,1
3,FRATNP,20,1816,1816,1,f,1,4,RR,52.0,O,42511,.,.,.,1
4,FRATNP,20,1816,1816,1,f,5,5,RR,52.0,O,15919,.,.,.,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59139,FRATNP,511,2020,2020,1,m,104,1,TL,130.0,O,37,.,.,.,1
59140,FRATNP,512,2020,2020,1,m,104,1,TU,130.0,O,20,.,.,.,1
59141,FRATNP,513,2020,2020,1,m,105,1,TL,130.0,O,45,.,.,.,1
59142,FRATNP,514,2020,2020,1,m,105,+,TU,130.0,O,48,.,.,.,1


In [49]:
# Check the dataset: list the dtype pandas inferred for every column, to see
# which columns were parsed as numbers and which were left as `object`.
df.dtypes

PopName          object
Area              int64
Year              int64
YearReg           int64
YearInterval      int64
Sex              object
Age              object
AgeInterval      object
Lexis            object
RefCode         float64
Access           object
Deaths            int64
NoteCode1        object
NoteCode2        object
NoteCode3        object
LDB               int64
dtype: object

In [50]:
# Make sure Year is stored as int64 (the year filtering and decade binning
# further down compare it against integer bounds), then re-check the dtypes.
# Note: Age is NOT converted by this cell and stays `object`; the column holds
# non-numeric markers such as 'TOT' and 'UNK', so a plain integer cast would fail.
df = df.astype({'Year':'int64'})
df.dtypes

PopName          object
Area              int64
Year              int64
YearReg           int64
YearInterval      int64
Sex              object
Age              object
AgeInterval      object
Lexis            object
RefCode         float64
Access           object
Deaths            int64
NoteCode1        object
NoteCode2        object
NoteCode3        object
LDB               int64
dtype: object

### 2. Treemap: total deaths per decade (1940–2019)

In [51]:
#2.1 Keep only the records for 1940-2019 (both ends included) and the three
# columns needed for the per-decade totals
year_in_range = df['Year'].between(1940, 2019)
# Explicit copy: df2 gets new columns later, so it must not alias df
df2 = df.loc[year_in_range, ['PopName', 'Year', 'Deaths']].copy()
df2

Unnamed: 0,PopName,Year,Deaths
22156,FRATNP,1940,990
22157,FRATNP,1940,14584
22158,FRATNP,1940,7342
22159,FRATNP,1940,2131
22160,FRATNP,1940,1767
...,...,...,...
58713,FRATNP,2019,33
58714,FRATNP,2019,43
58715,FRATNP,2019,60
58716,FRATNP,2019,38


In [52]:
#2.2 Assign every record to its decade (8 groups, 1940s-2010s)
decade_starts = range(1940, 2020, 10)
# Bin edges: the opening year, followed by the last year of each decade
bins = [1940] + [start + 9 for start in decade_starts]
labels = [str(start) + 's' for start in decade_starts]
# include_lowest=True closes the first bin on the left so 1940 itself is kept
df2['YearGroup'] = pd.cut(
    df2['Year'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)
df2

Unnamed: 0,PopName,Year,Deaths,YearGroup
22156,FRATNP,1940,990,1940s
22157,FRATNP,1940,14584,1940s
22158,FRATNP,1940,7342,1940s
22159,FRATNP,1940,2131,1940s
22160,FRATNP,1940,1767,1940s
...,...,...,...,...
58713,FRATNP,2019,33,2010s
58714,FRATNP,2019,43,2010s
58715,FRATNP,2019,60,2010s
58716,FRATNP,2019,38,2010s


In [53]:
#2.3 Total deaths per decade (Treemap), broadcast back onto every row of df2
# NOTE(review): every remaining row is summed, whatever its Age/Lexis/LDB value.
# The raw file has rows marked Age == 'TOT' (presumably totals); if such rows or
# overlapping tabulations exist for 1940-2019, these sums are inflated - confirm.
# YearGroup is categorical, so `observed` is passed explicitly: newer pandas
# warns when a categorical grouper relies on the (changing) default.
# transform() returns one value per input row, so the choice does not alter the result.
group = df2.groupby('YearGroup', observed=True)
df2['Total_Deaths'] = group['Deaths'].transform('sum')
df2

Unnamed: 0,PopName,Year,Deaths,YearGroup,Total_Deaths
22156,FRATNP,1940,990,1940s,6563161
22157,FRATNP,1940,14584,1940s,6563161
22158,FRATNP,1940,7342,1940s,6563161
22159,FRATNP,1940,2131,1940s,6563161
22160,FRATNP,1940,1767,1940s,6563161
...,...,...,...,...,...
58713,FRATNP,2019,33,2010s,11384622
58714,FRATNP,2019,43,2010s,11384622
58715,FRATNP,2019,60,2010s,11384622
58716,FRATNP,2019,38,2010s,11384622


In [54]:
# Sanity check: show any rows of df2 that contain a missing value
# (an empty result means there are none)
rows_with_na = df2.isna().any(axis='columns')
df2[rows_with_na]

Unnamed: 0,PopName,Year,Deaths,YearGroup,Total_Deaths


In [55]:
#2.4 Collapse to one row per decade: keep the first record of each YearGroup,
# discard the per-record columns, and give the remaining columns their final names
df2 = (
    df2
    .drop_duplicates(subset='YearGroup')
    .drop(columns=['Year', 'Deaths'])
    .rename(columns={'PopName': 'Country', 'YearGroup': 'Year'})
)
df2

Unnamed: 0,Country,Year,Total_Deaths
22156,FRATNP,1940s,6563161
26964,FRATNP,1950s,5314828
31676,FRATNP,1960s,5382588
36136,FRATNP,1970s,5500070
40516,FRATNP,1980s,5427950
44976,FRATNP,1990s,5294022
49828,FRATNP,2000s,7929273
54458,FRATNP,2010s,11384622


In [56]:
#3.1 Population dataset: load it, then keep the 1940-2019 window (both ends
# included) and only the columns needed for the rate calculation
df_pop = pd.read_csv('FRATNPpop.csv')
pop_in_range = df_pop['Year'].between(1940, 2019)
# Explicit copy: new columns are added to df_pop in the following step
df_pop = df_pop.loc[pop_in_range, ['PopName', 'Year', 'Population']].copy()
df_pop

Unnamed: 0,PopName,Year,Population
8222,FRATNP,1940,277500
8223,FRATNP,1940,269700
8224,FRATNP,1940,272300
8225,FRATNP,1940,276300
8226,FRATNP,1940,283600
...,...,...,...
24097,FRATNP,2019,3276
24098,FRATNP,2019,1919
24099,FRATNP,2019,1362
24100,FRATNP,2019,1249


In [57]:
#3.2 Categorize the year into 8 decade groups (1940s-2010s)
bins = [1940, 1949, 1959, 1969, 1979, 1989, 1999, 2009, 2019]
labels = ['1940s', '1950s', '1960s', '1970s','1980s','1990s','2000s', '2010s']
# include_lowest=True closes the first bin on the left so 1940 itself is kept
df_pop['YearGroup'] = pd.cut(df_pop['Year'],bins=bins,labels=labels, include_lowest=True)

#3.3 Total population per decade (Treemap), broadcast back onto every row.
# Every Population record that falls in a decade is added up (all rows of all
# its years), so Total_Pop is a sum across the decade, not a single-year count.
# YearGroup is categorical, so `observed` is passed explicitly: newer pandas
# warns when a categorical grouper relies on the (changing) default.
# transform() returns one value per input row, so the choice does not alter the result.
group = df_pop.groupby('YearGroup', observed=True)
df_pop['Total_Pop'] = group['Population'].transform('sum')
df_pop

Unnamed: 0,PopName,Year,Population,YearGroup,Total_Pop
8222,FRATNP,1940,277500,1940s,392933052
8223,FRATNP,1940,269700,1940s,392933052
8224,FRATNP,1940,272300,1940s,392933052
8225,FRATNP,1940,276300,1940s,392933052
8226,FRATNP,1940,283600,1940s,392933052
...,...,...,...,...,...
24097,FRATNP,2019,3276,2010s,640286924
24098,FRATNP,2019,1919,2010s,640286924
24099,FRATNP,2019,1362,2010s,640286924
24100,FRATNP,2019,1249,2010s,640286924


In [58]:
# Sanity check on the population table built in steps 3.1-3.3: show any rows
# that contain a missing value (an empty result means there are none).
# This previously re-checked df2, which had already been validated and collapsed.
df_pop[df_pop.isna().any(axis=1)]

Unnamed: 0,Country,Year,Total_Deaths


In [59]:
#3.4 Collapse to one row per decade and keep only the decade label and its total:
# first record of each YearGroup, per-record columns removed, YearGroup renamed to Year
df_pop = (
    df_pop
    .drop_duplicates(subset='YearGroup')
    .drop(columns=['PopName', 'Year', 'Population'])
    .rename(columns={'YearGroup': 'Year'})
)
df_pop

Unnamed: 0,Year,Total_Pop
8222,1940s,392933052
10042,1950s,431953616
11922,1960s,480142824
13842,1970s,522436986
15812,1980s,550126128
17832,1990s,576062450
19862,2000s,607176741
21982,2010s,640286924


In [62]:
#4. Join the per-decade death totals with the per-decade population totals;
# the inner join keeps only decade labels present in both tables
fr = df2.merge(df_pop, how='inner', on='Year')
fr

Unnamed: 0,Country,Year,Total_Deaths,Total_Pop
0,FRATNP,1940s,6563161,392933052
1,FRATNP,1950s,5314828,431953616
2,FRATNP,1960s,5382588,480142824
3,FRATNP,1970s,5500070,522436986
4,FRATNP,1980s,5427950,550126128
5,FRATNP,1990s,5294022,576062450
6,FRATNP,2000s,7929273,607176741
7,FRATNP,2010s,11384622,640286924


In [63]:
# Deaths per 1,000 population for each decade.
# Plain column arithmetic replaces the row-wise apply: it yields the same
# values without calling a Python lambda once per row.
fr['Death_Rate'] = fr['Total_Deaths'] / fr['Total_Pop'] * 1000
fr

Unnamed: 0,Country,Year,Total_Deaths,Total_Pop,Death_Rate
0,FRATNP,1940s,6563161,392933052,16.703001
1,FRATNP,1950s,5314828,431953616,12.304164
2,FRATNP,1960s,5382588,480142824,11.210389
3,FRATNP,1970s,5500070,522436986,10.527719
4,FRATNP,1980s,5427950,550126128,9.866737
5,FRATNP,1990s,5294022,576062450,9.190014
6,FRATNP,2000s,7929273,607176741,13.05925
7,FRATNP,2010s,11384622,640286924,17.780501


In [64]:
# Export the merged table to CSV without writing the index column.
# NOTE(review): this is an absolute, machine-specific Windows path; the write
# fails on any machine that lacks this folder. Consider a path relative to the
# notebook, like the input CSVs.
output_csv = 'D:/Users/Lenovo/Desktop/COMP4037 Research Methods/CW2_Data Visualization/Data/France_death rate.csv'
fr.to_csv(output_csv, index=False)