# scrap RKI data
Incidence RKI 
https://www.krebsdaten.de/Krebs/SiteGlobals/Forms/Datenbankabfrage/datenbankabfrage_stufe2_form.html

Survival
https://www.krebsdaten.de/Krebs/SiteGlobals/Forms/Datenbankabfrage/datenbankabfrage_stufe2_form.html

In [1]:
import pandas as pd
import numpy as np

In [77]:
mapping = {
    'Krebs gesamt (C00-C97 ohne C44)': 'All Sites',
    'Verdauungsorgane (C15-C26)': 'Digestive System',
    'Darm (C18-C20)': 'Colon and Rectum', # bowel intestine, gut
    'Bauchspeicheldrüse (C25)': 'Pancreas',
    'Atmungs- und Brustorgane (C30-C39)': 'Respiratory System',
    'Leber (C22)': 'Liver', # missig in RKI incidence
    'Lunge (C33-C34)': 'Lung and Bronchus',
    'Malignes Melanom der Haut (C43)': 'Melanoma of the Skin',
    'Brustdrüse (C50)': 'Breast',
    'weibliche Geschlechtsorgane (C51-C58)': 'Female Genital System',
    'Gebärmutterhals (C53)': 'Cervix Uteri',
    'Gebärmutterkörper (C54-C55)': 'Corpus Uteri',
    'männliche Geschlechtsorgane (C60-C63)': 'Male Genital System',
    'Prostata (C61)': 'Prostate',
    'Harnorgane (C64-C68)': 'Urinary System',
    'Niere (C64)': 'Kidney and Renal Pelvis',
    'Harnblase (C67)': 'Urinary Bladder',
    'Non-Hodgkin-Lymphome (C82-C88)': 'Non-Hodgkin Lymphoma',
}
mapping_incidence_age = {
    '0 - 4' :'00-14',
    '5 - 9': '00-14',
    '10 - 14':'00-14',
    
    '15 - 19':'15-34',
    '20 - 24':'15-34',
    '25 - 29':'15-34',
    '30 - 34':'15-34',
    
    '35 - 39':'35-44',
    '40 - 44':'35-44',
    
    '45 - 49':'45-54',
    '50 - 54':'45-54',
    
    '55 - 59':'55-64', 
    '60 - 64':'55-64', 
    '65 - 69':'65-74',
    '70 - 74':'65-74',
    '75 - 79':'75+',
    '80 - 84':'75+',
    '85+':'75+',
}

mapping_survival_age = {
    '15 - 44':'00-44',
    '45 - 54':'45-54',
    '55 - 64':'55-64',
    '65 - 74':'65-74',
    '75 und älter':'75+',
}

mapping_sex = {
    'weiblich':'Female',
    'männlich':'Male',
}

In [79]:
RKI_incidence = pd.read_csv('Incidence_Krebsdaten.csv', delimiter=';')
print(df.columns)
RKI_incidence.rename(columns={'Unnamed: 0':'Cancer_site', 
                   'Unnamed: 1': 'Sex', 
                   'Unnamed: 2': 'Age_group',
                  }, inplace=True)
RKI_incidence['cancer_site_en'] = RKI_incidence['Cancer_site'].map(mapping)
RKI_incidence['Age_group_SEER'] = RKI_incidence['Age_group'].map(mapping_incidence_age)
RKI_incidence['Sex_en'] = RKI_incidence['Sex'].map(mapping_sex)
RKI_incidence.drop('Unnamed: 24', axis=1, inplace=True)
for i in range(1999, 2020):
    RKI_incidence[str(i)].replace(',','.', regex=True, inplace=True)
    RKI_incidence[str(i)] = pd.to_numeric(RKI_incidence[str(i)], downcast='float', errors='coerce')
print(np.unique(RKI_incidence.Age_group))
RKI_incidence

Index(['age_at_diagnosis', 'survival_interval', 'statistic_type', 'site',
       'year_of_diagnosis', 'sex', 'survival_prob'],
      dtype='object')
['0 - 4' '10 - 14' '15 - 19' '20 - 24' '25 - 29' '30 - 34' '35 - 39'
 '40 - 44' '45 - 49' '5 - 9' '50 - 54' '55 - 59' '60 - 64' '65 - 69'
 '70 - 74' '75 - 79' '80 - 84' '85+']


Unnamed: 0,Cancer_site,Sex,Age_group,1999,2000,2001,2002,2003,2004,2005,...,2013,2014,2015,2016,2017,2018,2019,cancer_site_en,Age_group_SEER,Sex_en
0,Krebs gesamt (C00-C97 ohne C44),weiblich,0 - 4,18.500000,20.000000,18.100000,18.600000,15.500000,17.799999,19.900000,...,19.700001,19.100000,19.900000,19.799999,18.500000,19.600000,19.900000,All Sites,00-14,Female
1,Krebs gesamt (C00-C97 ohne C44),weiblich,5 - 9,8.400000,9.400000,10.100000,8.200000,8.900000,9.900000,9.200000,...,7.700000,8.800000,10.400000,9.000000,10.200000,8.800000,8.600000,All Sites,00-14,Female
2,Krebs gesamt (C00-C97 ohne C44),weiblich,10 - 14,9.500000,9.600000,8.900000,10.200000,9.200000,10.300000,9.700000,...,11.200000,10.700000,10.200000,10.700000,10.600000,12.100000,11.300000,All Sites,00-14,Female
3,Krebs gesamt (C00-C97 ohne C44),weiblich,15 - 19,18.100000,18.500000,19.200001,19.100000,19.100000,19.600000,20.900000,...,21.799999,21.100000,20.400000,20.299999,19.799999,19.200001,18.700001,All Sites,15-34,Female
4,Krebs gesamt (C00-C97 ohne C44),weiblich,20 - 24,34.400002,33.400002,34.500000,37.900002,32.099998,32.400002,33.000000,...,37.599998,35.200001,35.000000,35.400002,30.200001,32.000000,29.799999,All Sites,15-34,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,Non-Hodgkin-Lymphome (C82-C88),männlich,65 - 69,41.700001,51.900002,49.000000,41.700001,45.700001,49.500000,48.000000,...,60.400002,53.200001,55.599998,55.400002,56.400002,49.500000,55.099998,Non-Hodgkin Lymphoma,65-74,Male
608,Non-Hodgkin-Lymphome (C82-C88),männlich,70 - 74,63.299999,56.799999,55.099998,61.400002,69.900002,68.000000,69.599998,...,74.099998,78.599998,80.099998,77.400002,77.000000,78.000000,75.300003,Non-Hodgkin Lymphoma,65-74,Male
609,Non-Hodgkin-Lymphome (C82-C88),männlich,75 - 79,89.500000,81.000000,76.599998,80.000000,87.000000,80.199997,89.199997,...,95.000000,94.900002,99.500000,100.199997,102.199997,98.199997,102.500000,Non-Hodgkin Lymphoma,75+,Male
610,Non-Hodgkin-Lymphome (C82-C88),männlich,80 - 84,101.699997,72.199997,87.300003,68.800003,89.800003,93.699997,100.400002,...,108.500000,112.599998,114.199997,114.800003,117.000000,111.300003,114.300003,Non-Hodgkin Lymphoma,75+,Male


In [92]:
RKI_incidence2 = RKI_incidence.groupby(by=['cancer_site_en', 'Age_group_SEER', 'Sex_en']).sum().reset_index()
RKI_incidence2.drop('Cancer_site', axis=1, inplace=True)
RKI_incidence2.drop('Sex', axis=1, inplace=True)
RKI_incidence2.drop('Age_group', axis=1, inplace=True)
RKI_incidence2

Unnamed: 0,cancer_site_en,Age_group_SEER,Sex_en,1999,2000,2001,2002,2003,2004,2005,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,All Sites,00-14,Female,36.400002,39.000000,37.099998,37.000000,33.599998,38.000000,38.799999,...,37.599998,39.599998,39.500000,38.599998,38.599998,40.500000,39.500000,39.299999,40.500000,39.799999
1,All Sites,00-14,Male,42.599998,44.799999,42.599998,41.700001,45.200001,47.299999,44.900002,...,46.200001,44.699997,46.700001,48.900002,46.099998,48.000000,45.000000,50.000000,45.300003,44.199997
2,All Sites,15-34,Female,196.399994,204.300003,209.500000,216.399994,210.399994,207.199997,220.600006,...,238.299988,234.000000,237.000000,234.899994,232.099991,232.700012,231.000000,221.600006,218.500000,206.300003
3,All Sites,15-34,Male,164.500000,162.600006,166.200012,171.500000,169.800003,176.000000,188.299988,...,187.200012,194.699997,190.399994,200.699997,193.100006,194.100006,186.000000,176.899994,184.399994,178.399994
4,All Sites,35-44,Female,390.000000,384.899994,390.399994,386.400024,389.200012,398.899994,406.900024,...,443.100006,449.100006,444.700012,446.299988,443.700012,439.299988,437.700012,436.500000,433.799988,437.400024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,Urinary System,55-64,Male,204.300003,200.399994,199.199997,192.399994,187.500000,186.000000,183.500000,...,178.899994,176.500000,169.000000,168.000000,167.800003,168.399994,161.500000,160.800003,164.000000,155.799988
234,Urinary System,65-74,Female,138.700012,145.700012,142.500000,146.500000,150.500000,139.800003,140.799988,...,135.399994,126.699997,132.000000,124.800003,126.699997,122.399994,118.699997,121.800003,123.900002,117.599998
235,Urinary System,65-74,Male,438.799988,452.500000,438.100006,432.400024,436.200012,417.700012,396.399994,...,366.299988,355.100006,358.700012,362.700012,340.799988,344.500000,346.299988,335.900024,343.100006,341.099976
236,Urinary System,75+,Female,339.400024,310.799988,329.899994,322.600006,343.600006,357.700012,346.600006,...,359.799988,339.500000,342.200012,333.899994,316.100006,330.500000,314.100006,294.700012,291.299988,287.899994


In [93]:

RKI_incidence3 = RKI_incidence2.melt(id_vars=['cancer_site_en', 'Age_group_SEER', 'Sex_en',],
                  var_name='Year',
                   value_name='Incidence'
                  )
RKI_incidence3

Unnamed: 0,cancer_site_en,Age_group_SEER,Sex_en,Year,Incidence
0,All Sites,00-14,Female,1999,36.400002
1,All Sites,00-14,Male,1999,42.599998
2,All Sites,15-34,Female,1999,196.399994
3,All Sites,15-34,Male,1999,164.500000
4,All Sites,35-44,Female,1999,390.000000
...,...,...,...,...,...
4993,Urinary System,55-64,Male,2019,155.799988
4994,Urinary System,65-74,Female,2019,117.599998
4995,Urinary System,65-74,Male,2019,341.099976
4996,Urinary System,75+,Female,2019,287.899994


In [24]:
df.iloc[1]['1999']

'8,4'

In [37]:
df['2019'].replace(',','.', regex=True, inplace=True)

In [42]:
df['2019'] = pd.to_numeric(df['2019'], downcast='float', errors='coerce')
df['2019']

0       19.900000
1        8.600000
2       11.300000
3       18.700001
4       29.799999
          ...    
607     55.099998
608     75.300003
609    102.500000
610    114.300003
611    111.699997
Name: 2019, Length: 612, dtype: float32

# survival

In [85]:
RKI_survival = pd.read_csv('Survival_Krebsdaten.csv', delimiter=';')
print(RKI_survival.columns)
RKI_survival.rename(columns={'Unnamed: 0':'Cancer_site', 
                   'Unnamed: 1': 'Sex', 
                   'Unnamed: 2': 'Age_group',
                   'Unnamed: 3': 'Survival_interval'
                  }, inplace=True)
RKI_survival['cancer_site_en'] = RKI_survival['Cancer_site'].map(mapping)
RKI_survival.drop('Unnamed: 10', axis=1, inplace=True)
for y in ['2007-2008','2009-2010', '2011-2012', '2013-2014', '2015-2016', '2017-2018',]:
    RKI_survival[y].replace(',','.', regex=True, inplace=True)
    RKI_survival[y] = pd.to_numeric(RKI_survival[y], downcast='float', errors='coerce')
print(np.unique(RKI_survival.Age_group))
RKI_survival['Age_group_SEER'] = RKI_survival['Age_group'].map(mapping_survival_age)
RKI_survival['Sex_en'] = RKI_survival['Sex'].map(mapping_sex)
RKI_survival

Index(['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', '2007-2008',
       '2009-2010', '2011-2012', '2013-2014', '2015-2016', '2017-2018',
       'Unnamed: 10'],
      dtype='object')
['15 - 44' '45 - 54' '55 - 64' '65 - 74' '75 und älter'
 'Rohe Rate - 15 und Älter']


Unnamed: 0,Cancer_site,Sex,Age_group,Survival_interval,2007-2008,2009-2010,2011-2012,2013-2014,2015-2016,2017-2018,cancer_site_en,Age_group_SEER,Sex_en
0,Krebs gesamt (C00-C97 ohne C44),weiblich,15 - 44,1,95.0,95.0,96.0,96.0,96.0,96.0,All Sites,00-44,Female
1,Krebs gesamt (C00-C97 ohne C44),weiblich,15 - 44,2,90.0,91.0,92.0,92.0,93.0,93.0,All Sites,00-44,Female
2,Krebs gesamt (C00-C97 ohne C44),weiblich,15 - 44,3,87.0,88.0,89.0,89.0,91.0,91.0,All Sites,00-44,Female
3,Krebs gesamt (C00-C97 ohne C44),weiblich,15 - 44,4,85.0,86.0,87.0,87.0,89.0,89.0,All Sites,00-44,Female
4,Krebs gesamt (C00-C97 ohne C44),weiblich,15 - 44,5,83.0,84.0,86.0,86.0,87.0,87.0,All Sites,00-44,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1675,Non-Hodgkin-Lymphome (C82-C88),männlich,Rohe Rate - 15 und Älter,6,,,54.0,55.0,55.0,56.0,Non-Hodgkin Lymphoma,,Male
1676,Non-Hodgkin-Lymphome (C82-C88),männlich,Rohe Rate - 15 und Älter,7,,,51.0,51.0,52.0,53.0,Non-Hodgkin Lymphoma,,Male
1677,Non-Hodgkin-Lymphome (C82-C88),männlich,Rohe Rate - 15 und Älter,8,,,48.0,48.0,50.0,50.0,Non-Hodgkin Lymphoma,,Male
1678,Non-Hodgkin-Lymphome (C82-C88),männlich,Rohe Rate - 15 und Älter,9,,,46.0,45.0,47.0,47.0,Non-Hodgkin Lymphoma,,Male


In [87]:
RKI_survival2 = RKI_survival.groupby(by=['cancer_site_en', 'Age_group_SEER', 'Sex', 'Survival_interval']).sum().reset_index()
RKI_survival2.drop('Cancer_site', axis=1, inplace=True)
RKI_survival2.drop('Sex', axis=1, inplace=True)
RKI_survival2.drop('Age_group', axis=1, inplace=True)
RKI_survival2

Unnamed: 0,cancer_site_en,Age_group_SEER,Survival_interval,2007-2008,2009-2010,2011-2012,2013-2014,2015-2016,2017-2018,Sex_en
0,All Sites,00-44,1,90.0,91.0,92.0,92.0,92.0,94.0,Male
1,All Sites,00-44,2,84.0,85.0,86.0,87.0,88.0,90.0,Male
2,All Sites,00-44,3,80.0,82.0,83.0,84.0,85.0,87.0,Male
3,All Sites,00-44,4,78.0,79.0,81.0,83.0,83.0,85.0,Male
4,All Sites,00-44,5,77.0,78.0,80.0,81.0,82.0,84.0,Male
...,...,...,...,...,...,...,...,...,...,...
1295,Urinary Bladder,75+,6,0.0,0.0,21.0,25.0,22.0,22.0,Female
1296,Urinary Bladder,75+,7,0.0,0.0,19.0,22.0,20.0,19.0,Female
1297,Urinary Bladder,75+,8,0.0,0.0,16.0,20.0,18.0,17.0,Female
1298,Urinary Bladder,75+,9,0.0,0.0,15.0,18.0,16.0,15.0,Female


In [140]:
RKI_survival3 = RKI_survival2.melt(id_vars=['cancer_site_en', 'Age_group_SEER', 'Sex_en', 'Survival_interval'],
                  var_name='Year',
                   value_name='Survival'
                  )
RKI_survival3.rename(columns={'cancer_site_en': 'site',
                               'Age_group_SEER':'age_at_diagnosis',
                               'Sex_en':'sex',
                               'Survival':'Survival_RKI',
                               'Survival_interval':'survival_interval',
                               'Year':'year_of_diagnosis',
                              }, inplace=True)
print(np.unique(RKI_survival3.year_of_diagnosis))
RKI_survival3

['2007-2008' '2009-2010' '2011-2012' '2013-2014' '2015-2016' '2017-2018']


Unnamed: 0,site,age_at_diagnosis,sex,survival_interval,year_of_diagnosis,Survival_RKI
0,All Sites,00-44,Male,1,2007-2008,90.0
1,All Sites,00-44,Male,2,2007-2008,84.0
2,All Sites,00-44,Male,3,2007-2008,80.0
3,All Sites,00-44,Male,4,2007-2008,78.0
4,All Sites,00-44,Male,5,2007-2008,77.0
...,...,...,...,...,...,...
7795,Urinary Bladder,75+,Female,6,2017-2018,22.0
7796,Urinary Bladder,75+,Female,7,2017-2018,19.0
7797,Urinary Bladder,75+,Female,8,2017-2018,17.0
7798,Urinary Bladder,75+,Female,9,2017-2018,15.0


In [141]:
mapping_diagnosis_year = {'2007-2008':[2007,2008],
                          '2009-2010':[2009,2010],
                          '2011-2012':[2011,2012],
                          '2013-2014':[2013,2014],
                          '2015-2016':[2015,2016],
                          '2017-2018':[2017,2018],}
RKI_survival3['year_of_diagnosis'] = RKI_survival3['year_of_diagnosis'].map(mapping_diagnosis_year)
RKI_survival4 = RKI_survival3.explode('year_of_diagnosis')
RKI_survival4

Unnamed: 0,site,age_at_diagnosis,sex,survival_interval,year_of_diagnosis,Survival_RKI
0,All Sites,00-44,Male,1,2007,90.0
0,All Sites,00-44,Male,1,2008,90.0
1,All Sites,00-44,Male,2,2007,84.0
1,All Sites,00-44,Male,2,2008,84.0
2,All Sites,00-44,Male,3,2007,80.0
...,...,...,...,...,...,...
7797,Urinary Bladder,75+,Female,8,2018,17.0
7798,Urinary Bladder,75+,Female,9,2017,15.0
7798,Urinary Bladder,75+,Female,9,2018,15.0
7799,Urinary Bladder,75+,Female,10,2017,13.0


In [124]:
RKI_survival3.year_of_diagnosis

0       2007-2008
1       2007-2008
2       2007-2008
3       2007-2008
4       2007-2008
          ...    
7795    2017-2018
7796    2017-2018
7797    2017-2018
7798    2017-2018
7799    2017-2018
Name: year_of_diagnosis, Length: 7800, dtype: object

# SEER data survival

In [138]:
seer_survival = pd.read_csv('../scraping/AllSurvival_mapped.csv',)

seer_survival = seer_survival[seer_survival.age_at_diagnosis != 'All ages']
seer_survival = seer_survival[seer_survival.sex != 'Male and female']
seer_survival = seer_survival[seer_survival.statistic_type == 'Relative survival']

print(np.unique(seer_survival.age_at_diagnosis))
print(np.unique(seer_survival.year_of_diagnosis))
print(np.unique(seer_survival.survival_prob))
print(np.unique(seer_survival.survival_interval))

seer_survival

['00-44' '45-54' '55-64' '65-74' '75+']
['1975-1977' '1978-1980' '1981-1983' '1984-1986' '1987-1989' '1990-1992'
 '1993-1995' '1996-1998' '1999-2001' '2002-2004' '2005-2007' '2008-2012'
 '2013-2019']
[  0.      0.437   0.521 ...  99.995 100.        nan]
[ 0  1  2  3  4  5  6  7  8  9 10]


Unnamed: 0,age_at_diagnosis,survival_interval,statistic_type,site,year_of_diagnosis,sex,survival_prob
47620,00-44,0,Relative survival,All Sites,1975-1977,Male,
47621,00-44,0,Relative survival,All Sites,1975-1977,Female,
47623,00-44,0,Relative survival,All Sites,1978-1980,Male,
47624,00-44,0,Relative survival,All Sites,1978-1980,Female,
47626,00-44,0,Relative survival,All Sites,1981-1983,Male,
...,...,...,...,...,...,...,...
282821,75+,10,Relative survival,Kaposi Sarcoma,2005-2007,Female,
282823,75+,10,Relative survival,Kaposi Sarcoma,2008-2012,Male,
282824,75+,10,Relative survival,Kaposi Sarcoma,2008-2012,Female,
282826,75+,10,Relative survival,Kaposi Sarcoma,2013-2019,Male,


In [139]:
mapping_diagnosis_year = {
    '1975-1977':np.arange(1975,1978), 
    '1978-1980':np.arange(1978,1981),
    '1981-1983':np.arange(1981,1984),
    '1984-1986':np.arange(1984,1987),
    '1987-1989':np.arange(1987,1990),
    '1990-1992':np.arange(1990,1993),
    '1993-1995':np.arange(1993,1996),
    '1996-1998':np.arange(1996,1999),
    '1999-2001':np.arange(1999,2002),
    '2002-2004':np.arange(2002,2005),
    '2005-2007':np.arange(2005,2008),
    '2008-2012':np.arange(2008,2013),
    '2013-2019':np.arange(2013,2020),
}
seer_survival['year_of_diagnosis'] = seer_survival['year_of_diagnosis'].map(mapping_diagnosis_year)
seer_survival = seer_survival.explode('year_of_diagnosis')
seer_survival[seer_survival.year_of_diagnosis >= 2007]
seer_survival

Unnamed: 0,age_at_diagnosis,survival_interval,statistic_type,site,year_of_diagnosis,sex,survival_prob
47620,00-44,0,Relative survival,All Sites,1975,Male,
47620,00-44,0,Relative survival,All Sites,1976,Male,
47620,00-44,0,Relative survival,All Sites,1977,Male,
47621,00-44,0,Relative survival,All Sites,1975,Female,
47621,00-44,0,Relative survival,All Sites,1976,Female,
...,...,...,...,...,...,...,...
282827,75+,10,Relative survival,Kaposi Sarcoma,2015,Female,
282827,75+,10,Relative survival,Kaposi Sarcoma,2016,Female,
282827,75+,10,Relative survival,Kaposi Sarcoma,2017,Female,
282827,75+,10,Relative survival,Kaposi Sarcoma,2018,Female,


In [145]:
survival = pd.merge(seer_survival, RKI_survival4, 
                     on=['site', 'age_at_diagnosis', 'sex', 'year_of_diagnosis', 'survival_interval'])
survival.to_csv('Survival_RKI_SEER.csv')
survival

Unnamed: 0,age_at_diagnosis,survival_interval,statistic_type,site,year_of_diagnosis,sex,survival_prob,Survival_RKI
0,00-44,1,Relative survival,All Sites,2007,Male,91.688,90.0
1,00-44,1,Relative survival,All Sites,2007,Female,95.574,95.0
2,00-44,1,Relative survival,All Sites,2008,Male,92.715,90.0
3,00-44,1,Relative survival,All Sites,2009,Male,92.715,91.0
4,00-44,1,Relative survival,All Sites,2010,Male,92.715,91.0
...,...,...,...,...,...,...,...,...
13195,75+,10,Relative survival,Non-Hodgkin Lymphoma,2014,Female,,19.0
13196,75+,10,Relative survival,Non-Hodgkin Lymphoma,2015,Female,,20.0
13197,75+,10,Relative survival,Non-Hodgkin Lymphoma,2016,Female,,20.0
13198,75+,10,Relative survival,Non-Hodgkin Lymphoma,2017,Female,,20.0


# SEER Indicence

In [104]:
seer_incidence = pd.read_csv('../scraping/AllIncidence_mapped.csv',)
print(np.unique(seer_incidence.age_at_diagnosis))
print(np.unique(seer_incidence.year_of_diagnosis))
seer_incidence = seer_incidence[seer_incidence.sex != 'Male and female']
seer_incidence = seer_incidence[seer_incidence.year_of_diagnosis >= 1999]
seer_incidence

['00-14' '15-34' '35-44' '45-54' '55-64' '65-74' '75+']
[1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988
 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002
 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
 2017 2018 2019 2020]


Unnamed: 0,sex,age_at_diagnosis,site,year_of_diagnosis,incidence
34800,Male,00-14,All Sites,1999,15.8
34801,Male,00-14,All Sites,2000,16.9
34802,Male,00-14,All Sites,2001,16.7
34803,Male,00-14,All Sites,2002,18.3
34804,Male,00-14,All Sites,2003,15.4
...,...,...,...,...,...
104323,Female,75+,Non-Small Cell Lung and Bronchus,2016,295.5
104324,Female,75+,Non-Small Cell Lung and Bronchus,2017,293.6
104325,Female,75+,Non-Small Cell Lung and Bronchus,2018,276.4
104326,Female,75+,Non-Small Cell Lung and Bronchus,2019,284.6


In [106]:
RKI_incidence3.rename(columns={'cancer_site_en': 'site',
                               'Age_group_SEER':'age_at_diagnosis',
                               'Sex_en':'sex',
                               'Incidence':'Incidence_RKI',
                               'Year':'year_of_diagnosis',
                              }, inplace=True)
RKI_incidence3.year_of_diagnosis = pd.to_numeric(RKI_incidence3.year_of_diagnosis)
RKI_incidence3.year_of_diagnosis

0       1999
1       1999
2       1999
3       1999
4       1999
        ... 
4993    2019
4994    2019
4995    2019
4996    2019
4997    2019
Name: year_of_diagnosis, Length: 4998, dtype: int64

In [144]:
incidence = pd.merge(seer_incidence, RKI_incidence3, 
                     on=['site', 'age_at_diagnosis', 'sex', 'year_of_diagnosis'])
incidence.to_csv('Incidence_RKI_SEER.csv')
incidence

Unnamed: 0,sex,age_at_diagnosis,site,year_of_diagnosis,incidence,Incidence_RKI
0,Male,00-14,All Sites,1999,15.8,42.599998
1,Male,00-14,All Sites,2000,16.9,44.799999
2,Male,00-14,All Sites,2001,16.7,42.599998
3,Male,00-14,All Sites,2002,18.3,41.700001
4,Male,00-14,All Sites,2003,15.4,45.200001
...,...,...,...,...,...,...
4993,Female,75+,Non-Hodgkin Lymphoma,2015,98.6,199.500000
4994,Female,75+,Non-Hodgkin Lymphoma,2016,97.0,205.899994
4995,Female,75+,Non-Hodgkin Lymphoma,2017,95.4,203.100006
4996,Female,75+,Non-Hodgkin Lymphoma,2018,95.2,195.600006
