In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('RUSdeath.csv')
df

Unnamed: 0,PopName,Area,Year,YearReg,YearInterval,Sex,Age,AgeInterval,Lexis,RefCode,Access,Deaths,NoteCode1,NoteCode2,NoteCode3,LDB
0,RUS,1,1946,1946,1,m,0,1,RR,1,O,106450.000000,1,2,.,0
1,RUS,1,1946,1946,1,m,1,1,RR,1,O,21379.000000,1,.,.,0
2,RUS,1,1946,1946,1,m,2,1,RR,1,O,8662.000000,1,.,.,0
3,RUS,1,1946,1946,1,m,3,1,RR,1,O,5016.000000,1,.,.,0
4,RUS,1,1946,1946,1,m,4,1,RR,1,O,5297.000000,1,.,.,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17209,RUS,1,2014,2014,1,f,99,1,TL,42,O,805.619469,.,.,.,1
17210,RUS,1,2014,2014,1,f,99,1,TU,42,O,741.380531,.,.,.,1
17211,RUS,1,2014,2014,1,f,100,+,RR,42,O,2828.000000,.,.,.,1
17212,RUS,1,2014,2014,1,f,UNK,.,.,42,O,912.000000,.,.,.,1


In [3]:
# Check the dataset
df.dtypes

PopName          object
Area              int64
Year              int64
YearReg           int64
YearInterval      int64
Sex              object
Age              object
AgeInterval      object
Lexis            object
RefCode           int64
Access           object
Deaths          float64
NoteCode1        object
NoteCode2        object
NoteCode3        object
LDB               int64
dtype: object

In [4]:
df['Age'].unique()

array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34',
       '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45',
       '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56',
       '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67',
       '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78',
       '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89',
       '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100',
       'UNK', 'TOT'], dtype=object)

In [5]:
# Delete the bad data ('UNK')
df = df.drop(df[df['Age'] == 'UNK'].index, axis=0)
df = df.drop(df[df['Age'] == 'TOT'].index, axis=0)

In [6]:
# Transfer the data type of Age to int
df = df.astype({'Year':'int64'})
df.dtypes

PopName          object
Area              int64
Year              int64
YearReg           int64
YearInterval      int64
Sex              object
Age              object
AgeInterval      object
Lexis            object
RefCode           int64
Access           object
Deaths          float64
NoteCode1        object
NoteCode2        object
NoteCode3        object
LDB               int64
dtype: object

### 2.Treemap

In [7]:
#2.1 filter the data we need (Year 1940-2019)
df2 = df[(df['Year'] >= 1940) & (df['Year'] <= 2019)][['PopName', 'Year', 'Deaths']]
df2

Unnamed: 0,PopName,Year,Deaths
0,RUS,1946,106450.000000
1,RUS,1946,21379.000000
2,RUS,1946,8662.000000
3,RUS,1946,5016.000000
4,RUS,1946,5297.000000
...,...,...,...
17207,RUS,2014,1020.916080
17208,RUS,2014,931.083920
17209,RUS,2014,805.619469
17210,RUS,2014,741.380531


In [8]:
#2.2 Categorize the year into 8 groups
bins = [1940, 1949, 1959, 1969, 1979, 1989, 1999, 2009, 2019]
labels = ['1940s', '1950s', '1960s', '1970s','1980s','1990s','2000s', '2010s']
df2['YearGroup'] = pd.cut(df2['Year'],bins=bins,labels=labels, include_lowest=True)
df2

Unnamed: 0,PopName,Year,Deaths,YearGroup
0,RUS,1946,106450.000000,1940s
1,RUS,1946,21379.000000,1940s
2,RUS,1946,8662.000000,1940s
3,RUS,1946,5016.000000,1940s
4,RUS,1946,5297.000000,1940s
...,...,...,...,...
17207,RUS,2014,1020.916080,2010s
17208,RUS,2014,931.083920,2010s
17209,RUS,2014,805.619469,2010s
17210,RUS,2014,741.380531,2010s


In [9]:
#2.3 Group and calculate the total deaths in each group of year (Treemap)
group = df2.groupby(['YearGroup'])
df2['Total_Deaths'] = group['Deaths'].transform('sum')
df2

Unnamed: 0,PopName,Year,Deaths,YearGroup,Total_Deaths
0,RUS,1946,106450.000000,1940s,4.677599e+06
1,RUS,1946,21379.000000,1940s,4.677599e+06
2,RUS,1946,8662.000000,1940s,4.677599e+06
3,RUS,1946,5016.000000,1940s,4.677599e+06
4,RUS,1946,5297.000000,1940s,4.677599e+06
...,...,...,...,...,...
17207,RUS,2014,1020.916080,2010s,9.579022e+06
17208,RUS,2014,931.083920,2010s,9.579022e+06
17209,RUS,2014,805.619469,2010s,9.579022e+06
17210,RUS,2014,741.380531,2010s,9.579022e+06


In [10]:
df2[df2.isna().any(axis=1)]

Unnamed: 0,PopName,Year,Deaths,YearGroup,Total_Deaths


In [11]:
#2.4 Drop duplicates
df2 = df2.drop_duplicates('YearGroup')
df2 = df2.drop(['Year','Deaths'],axis=1)
df2 = df2.rename({'PopName':'Country', 'YearGroup':'Year'}, axis=1)
df2

Unnamed: 0,Country,Year,Total_Deaths
0,RUS,1940s,4677599.0
824,RUS,1950s,9634787.0
2884,RUS,1960s,9663970.0
4944,RUS,1970s,12845530.0
7004,RUS,1980s,15564780.0
9064,RUS,1990s,19927450.0
11124,RUS,2000s,21965570.0
15184,RUS,2010s,9579022.0


In [13]:
#3.1 Population dataset
df_pop = pd.read_csv('RUSpop.csv')
df_pop = df_pop[(df_pop['Year'] >= 1940) & (df_pop['Year'] <= 2019)][['PopName', 'Year', 'Population']]
df_pop

Unnamed: 0,PopName,Year,Population
0,RUS,1946,703018
1,RUS,1946,495443
2,RUS,1946,442311
3,RUS,1946,654199
4,RUS,1946,1079548
...,...,...,...
14858,RUS,2015,13281
14859,RUS,2015,6897
14860,RUS,2015,4994
14861,RUS,2015,4263


In [14]:
#3.2 Categorize the year into 8 groups
bins = [1940, 1949, 1959, 1969, 1979, 1989, 1999, 2009, 2019]
labels = ['1940s', '1950s', '1960s', '1970s','1980s','1990s','2000s', '2010s']
df_pop['YearGroup'] = pd.cut(df_pop['Year'],bins=bins,labels=labels, include_lowest=True)

#3.3 Group and calculate the total population in each group of year (Treemap)
group = df_pop.groupby(['YearGroup'])
df_pop['Total_Pop'] = group['Population'].transform('sum')
df_pop

Unnamed: 0,PopName,Year,Population,YearGroup,Total_Pop
0,RUS,1946,703018,1940s,790932680
1,RUS,1946,495443,1940s,790932680
2,RUS,1946,442311,1940s,790932680
3,RUS,1946,654199,1940s,790932680
4,RUS,1946,1079548,1940s,790932680
...,...,...,...,...,...
14858,RUS,2015,13281,2010s,1719483416
14859,RUS,2015,6897,2010s,1719483416
14860,RUS,2015,4994,2010s,1719483416
14861,RUS,2015,4263,2010s,1719483416


In [15]:
df2[df2.isna().any(axis=1)]

Unnamed: 0,Country,Year,Total_Deaths


In [16]:
#3.4 Drop duplicates
df_pop = df_pop.drop_duplicates('YearGroup')
df_pop = df_pop.drop(['PopName','Year','Population'],axis=1)
df_pop = df_pop.rename({'YearGroup':'Year'}, axis=1)
df_pop

Unnamed: 0,Year,Total_Pop
0,1940s,790932680
816,1950s,2429924948
3062,1970s,3203912971
5274,1980s,3139841144
7520,1990s,5019199950
10987,2000s,3758377804
13639,2010s,1719483416


In [17]:
#4. merge 2 datasets
ru = pd.merge(df2, df_pop, on='Year', how='inner')
ru

Unnamed: 0,Country,Year,Total_Deaths,Total_Pop
0,RUS,1940s,4677599.0,790932680
1,RUS,1950s,9634787.0,2429924948
2,RUS,1970s,12845530.0,3203912971
3,RUS,1980s,15564780.0,3139841144
4,RUS,1990s,19927450.0,5019199950
5,RUS,2000s,21965570.0,3758377804
6,RUS,2010s,9579022.0,1719483416


In [18]:
ru['Death_Rate'] = ru[['Total_Deaths','Total_Pop']].apply(lambda x: x['Total_Deaths'] / x['Total_Pop']* 1000 ,axis=1)
ru

Unnamed: 0,Country,Year,Total_Deaths,Total_Pop,Death_Rate
0,RUS,1940s,4677599.0,790932680,5.914029
1,RUS,1950s,9634787.0,2429924948,3.965055
2,RUS,1970s,12845530.0,3203912971,4.009325
3,RUS,1980s,15564780.0,3139841144,4.957186
4,RUS,1990s,19927450.0,5019199950,3.970244
5,RUS,2000s,21965570.0,3758377804,5.844428
6,RUS,2010s,9579022.0,1719483416,5.570872


In [19]:
ru.to_csv('D:/Users/Lenovo/Desktop/COMP4037 Research Methods/CW2_Data Visualization/Data/Russia rate.csv', index = False)