In [1]:
import pandas as pd
import numpy as np

## Importing the Cancer Data

In [2]:
# Load the zipped, comma-separated cancer data set into a DataFrame.
CancerData = pd.read_csv(
    "../data/canada_cancer_data.zip",
    sep=',',
    compression='zip',
)

In [3]:
# Show the column labels of the loaded DataFrame.
CancerData.columns

Index(['Unnamed: 0', 'REF_DATE', 'GEO', 'Age Group', 'Sex',
       'Primary types of cancer (ICD-O-3)', 'Prevalence duration',
       'Characteristics', 'VALUE'],
      dtype='object')

## Choosing the columns to drop
### DGUID, UOM, UOM_ID, SCALAR_FACTOR, SCALAR_ID, VECTOR, COORDINATE, STATUS, SYMBOL, TERMINATED, DECIMALS

In [4]:
# Metadata columns that are not needed for the analysis.
DropCols = ['DGUID', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']

# errors='ignore' skips any listed column that is absent, so this cell does not
# raise KeyError when the loaded file no longer contains them (or when the cell
# is run a second time). Re-binding the name instead of using inplace=True
# keeps the step explicit.
CancerData = CancerData.drop(columns=DropCols, errors='ignore')

In [5]:
# Preview the first five rows of the frame.
CancerData.head()

Unnamed: 0,REF_DATE,GEO,Age Group,Sex,Primary types of cancer (ICD-O-3),Prevalence duration,Characteristics,VALUE
0,1994,Canada,"Total, all ages",Both sexes,"Total, all primary sites of cancer [C00.0-C80.9]",2-year duration,Total number of cancer cases,165090.0
1,1994,Canada,"Total, all ages",Both sexes,"Total, all primary sites of cancer [C00.0-C80.9]",2-year duration,Cancer prevalence proportion,572.4
2,1994,Canada,"Total, all ages",Both sexes,"Total, all primary sites of cancer [C00.0-C80.9]",2-year duration,"Low 95% confidence interval, cancer prevalence",569.6
3,1994,Canada,"Total, all ages",Both sexes,"Total, all primary sites of cancer [C00.0-C80.9]",2-year duration,"High 95% confidence interval, cancer prevalence",575.1
4,1994,Canada,"Total, all ages",Both sexes,Lip [C00.0-C00.9],2-year duration,Total number of cancer cases,1165.0


## Convert the Age Group labels to integer codes, e.g. 'Total, all ages' -> 0 and '0-19 years' -> 1

In [6]:
# Map every distinct "Age Group" label to an integer code, numbered in the
# order in which the labels are returned by unique().
AgeGroup = {
    label: code
    for code, label in enumerate(CancerData["Age Group"].unique())
}
AgeGroup

{'Total, all ages': 0,
 '0-19 years': 1,
 '20-29 years': 2,
 '30-39 years': 3,
 '40-49 years': 4,
 '50-59 years': 5,
 '60-69 years': 6,
 '70-79 years': 7,
 '80-89 years': 8,
 '90-99 years': 9}

In [7]:
# Swap the age-group labels for their integer codes. The result is assigned
# back to the column: calling replace(..., inplace=True) on a column selection
# is a chained assignment, which pandas warns about and which does not update
# the parent frame when Copy-on-Write is enabled.
CancerData["Age Group"] = CancerData["Age Group"].replace(AgeGroup)

In [8]:
# Preview the first five rows to check the recoded "Age Group" column.
CancerData.head()

Unnamed: 0,REF_DATE,GEO,Age Group,Sex,Primary types of cancer (ICD-O-3),Prevalence duration,Characteristics,VALUE
0,1994,Canada,0,Both sexes,"Total, all primary sites of cancer [C00.0-C80.9]",2-year duration,Total number of cancer cases,165090.0
1,1994,Canada,0,Both sexes,"Total, all primary sites of cancer [C00.0-C80.9]",2-year duration,Cancer prevalence proportion,572.4
2,1994,Canada,0,Both sexes,"Total, all primary sites of cancer [C00.0-C80.9]",2-year duration,"Low 95% confidence interval, cancer prevalence",569.6
3,1994,Canada,0,Both sexes,"Total, all primary sites of cancer [C00.0-C80.9]",2-year duration,"High 95% confidence interval, cancer prevalence",575.1
4,1994,Canada,0,Both sexes,Lip [C00.0-C00.9],2-year duration,Total number of cancer cases,1165.0


## Now we can replace Sex with single letters, e.g. Both sexes -> B, Males -> M and Females -> F

In [9]:
# Map every distinct "Sex" label to its first character.
gender = {
    label: label[0]
    for label in CancerData["Sex"].unique()
}
gender

{'Both sexes': 'B', 'Males': 'M', 'Females': 'F'}

In [10]:
# Swap the sex labels for their one-letter codes. Assigning the result back
# avoids the chained inplace=True call on a column selection, which pandas
# warns about and which leaves the frame unchanged under Copy-on-Write.
CancerData["Sex"] = CancerData["Sex"].replace(gender)
CancerData.head()

Unnamed: 0,REF_DATE,GEO,Age Group,Sex,Primary types of cancer (ICD-O-3),Prevalence duration,Characteristics,VALUE
0,1994,Canada,0,B,"Total, all primary sites of cancer [C00.0-C80.9]",2-year duration,Total number of cancer cases,165090.0
1,1994,Canada,0,B,"Total, all primary sites of cancer [C00.0-C80.9]",2-year duration,Cancer prevalence proportion,572.4
2,1994,Canada,0,B,"Total, all primary sites of cancer [C00.0-C80.9]",2-year duration,"Low 95% confidence interval, cancer prevalence",569.6
3,1994,Canada,0,B,"Total, all primary sites of cancer [C00.0-C80.9]",2-year duration,"High 95% confidence interval, cancer prevalence",575.1
4,1994,Canada,0,B,Lip [C00.0-C00.9],2-year duration,Total number of cancer cases,1165.0


## For the cancer names, let us replace each name with an integer code (0, 1, 2, ... in order of first appearance)

In [13]:
# Map every distinct cancer-type label to an integer code, numbered in the
# order in which the labels are returned by unique().
CancerNames = {
    name: code
    for code, name in enumerate(CancerData["Primary types of cancer (ICD-O-3)"].unique())
}
# Show only the assigned codes.
CancerNames.values()

dict_values([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57])

In [14]:
# Swap the cancer-type labels for their integer codes. Assigning the result
# back avoids the chained inplace=True call on a column selection, which
# pandas warns about and which leaves the frame unchanged under Copy-on-Write.
CancerData["Primary types of cancer (ICD-O-3)"] = (
    CancerData["Primary types of cancer (ICD-O-3)"].replace(CancerNames)
)
CancerData.head()

Unnamed: 0,REF_DATE,GEO,Age Group,Sex,Primary types of cancer (ICD-O-3),Prevalence duration,Characteristics,VALUE
0,1994,Canada,0,B,0,2-year duration,Total number of cancer cases,165090.0
1,1994,Canada,0,B,0,2-year duration,Cancer prevalence proportion,572.4
2,1994,Canada,0,B,0,2-year duration,"Low 95% confidence interval, cancer prevalence",569.6
3,1994,Canada,0,B,0,2-year duration,"High 95% confidence interval, cancer prevalence",575.1
4,1994,Canada,0,B,1,2-year duration,Total number of cancer cases,1165.0


In [15]:
# Map every distinct "Prevalence duration" label to the text that precedes its
# first hyphen (e.g. '2-year duration' -> '2'); the codes stay strings.
PrevalenceDuration = {
    label: label.partition('-')[0]
    for label in CancerData["Prevalence duration"].unique()
}
PrevalenceDuration

{'2-year duration': '2',
 '5-year duration': '5',
 '10-year duration': '10',
 '20-year duration': '20'}

In [16]:
# Swap the duration labels for their short codes. Assigning the result back
# avoids the chained inplace=True call on a column selection, which pandas
# warns about and which leaves the frame unchanged under Copy-on-Write.
CancerData["Prevalence duration"] = CancerData["Prevalence duration"].replace(PrevalenceDuration)
CancerData.head()

Unnamed: 0,REF_DATE,GEO,Age Group,Sex,Primary types of cancer (ICD-O-3),Prevalence duration,Characteristics,VALUE
0,1994,Canada,0,B,0,2,Total number of cancer cases,165090.0
1,1994,Canada,0,B,0,2,Cancer prevalence proportion,572.4
2,1994,Canada,0,B,0,2,"Low 95% confidence interval, cancer prevalence",569.6
3,1994,Canada,0,B,0,2,"High 95% confidence interval, cancer prevalence",575.1
4,1994,Canada,0,B,1,2,Total number of cancer cases,1165.0


In [17]:
# One-letter codes for the four "Characteristics" labels.
Characteristics = {
    'Total number of cancer cases': 'T',
    'Cancer prevalence proportion': 'P',
    'Low 95% confidence interval, cancer prevalence': 'L',
    'High 95% confidence interval, cancer prevalence': 'H',
}
Characteristics

{'Total number of cancer cases': 'T',
 'Cancer prevalence proportion': 'P',
 'Low 95% confidence interval, cancer prevalence': 'L',
 'High 95% confidence interval, cancer prevalence': 'H'}

In [18]:
# Swap the characteristic labels for their one-letter codes. Assigning the
# result back avoids the chained inplace=True call on a column selection,
# which pandas warns about and which leaves the frame unchanged under
# Copy-on-Write.
CancerData["Characteristics"] = CancerData["Characteristics"].replace(Characteristics)
CancerData.head()

Unnamed: 0,REF_DATE,GEO,Age Group,Sex,Primary types of cancer (ICD-O-3),Prevalence duration,Characteristics,VALUE
0,1994,Canada,0,B,0,2,T,165090.0
1,1994,Canada,0,B,0,2,P,572.4
2,1994,Canada,0,B,0,2,L,569.6
3,1994,Canada,0,B,0,2,H,575.1
4,1994,Canada,0,B,1,2,T,1165.0


In [19]:
# Write the recoded frame to CSV. index=False keeps the row index out of the
# file; otherwise it is stored as an unnamed first column and comes back as
# 'Unnamed: 0' (then 'Unnamed: 0.1', ...) each time the file is reloaded and
# saved again.
CancerData.to_csv("Cancer_Data_PreProcessed.csv", sep=',', index=False)

In [42]:
import zipfile

# Pack the CSV into a zip archive. The with-block closes the archive, which is
# what writes its central directory; without an explicit close the file may be
# left incomplete. ZIP_DEFLATED compresses the member (the default only stores
# it uncompressed).
with zipfile.ZipFile("Cancer_Data_PreProcessed.zip", 'w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write("Cancer_Data_PreProcessed.csv")

# Now, for convenience, we can save all the dictionaries into a pickle file, which can also serve as a resource
## The format is as follows: Dict({ColumnName : {OriginalValue : ModifiedValue}})

In [21]:
# Show the column labels of the recoded DataFrame.
CancerData.columns

Index(['REF_DATE', 'GEO', 'Age Group', 'Sex',
       'Primary types of cancer (ICD-O-3)', 'Prevalence duration',
       'Characteristics', 'VALUE'],
      dtype='object')

In [24]:
# Identity mapping for the reference years: each distinct REF_DATE value maps
# to itself, so this column is left uncoded.
RefDate = {
    year: year
    for year in CancerData["REF_DATE"].unique()
}
RefDate.items()

dict_items([(1994, 1994), (1995, 1995), (1996, 1996), (1997, 1997), (1998, 1998), (1999, 1999), (2000, 2000), (2001, 2001), (2002, 2002), (2003, 2003), (2004, 2004), (2005, 2005), (2006, 2006), (2007, 2007), (2008, 2008), (2009, 2009), (2010, 2010), (2011, 2011), (2012, 2012), (2013, 2013), (2014, 2014), (2015, 2015)])

In [25]:
# Identity mapping for the geography names: each distinct GEO value maps to
# itself, so this column is left uncoded.
GeoName = {
    region: region
    for region in CancerData["GEO"].unique()
}
GeoName.items()

dict_items([('Canada', 'Canada'), ('Canada (excluding Quebec)', 'Canada (excluding Quebec)'), ('Newfoundland and Labrador', 'Newfoundland and Labrador'), ('Prince Edward Island', 'Prince Edward Island'), ('Nova Scotia', 'Nova Scotia'), ('New Brunswick', 'New Brunswick'), ('Quebec', 'Quebec'), ('Ontario', 'Ontario'), ('Manitoba', 'Manitoba'), ('Saskatchewan', 'Saskatchewan'), ('Alberta', 'Alberta'), ('British Columbia', 'British Columbia'), ('Yukon', 'Yukon'), ('Northwest Territories', 'Northwest Territories'), ('Nunavut', 'Nunavut')])

In [28]:
# Show the (label, code) pairs of the age-group mapping.
AgeGroup.items()

dict_items([('Total, all ages', 0), ('0-19 years', 1), ('20-29 years', 2), ('30-39 years', 3), ('40-49 years', 4), ('50-59 years', 5), ('60-69 years', 6), ('70-79 years', 7), ('80-89 years', 8), ('90-99 years', 9)])

In [29]:
# Show the (label, code) pairs of the sex mapping.
gender.items()

dict_items([('Both sexes', 'B'), ('Males', 'M'), ('Females', 'F')])

In [30]:
# Show the (label, code) pairs of the cancer-name mapping.
CancerNames.items()

dict_items([('Total, all primary sites of cancer ', 0), ('Lip ', 1), ('Tongue ', 2), ('Salivary gland ', 3), ('Floor of mouth ', 4), ('Gum and other mouth ', 5), ('Nasopharynx ', 6), ('Oropharynx ', 7), ('Hypopharynx ', 8), ('Other oral cavity and pharynx ', 9), ('Esophagus ', 10), ('Stomach ', 11), ('Small intestine ', 12), ('Colon and rectum ', 13), ('Colon excluding rectum ', 14), ('Rectum and rectosigmoid ', 15), ('Anus, anal canal and anorectum ', 16), ('Liver ', 17), ('Gallbladder ', 18), ('Pancreas ', 19), ('Other digestive system ', 20), ('Larynx ', 21), ('Lung and bronchus ', 22), ('Other respiratory system ', 23), ('Bones and joints ', 24), ('Soft tissue (including heart) ', 25), ('Melanomas of the skin ', 26), ('Other non-epithelial skin ', 27), ('Breast ', 28), ('Cervix uteri ', 29), ('Corpus uteri ', 30), ('Uterus, not otherwise specified ', 31), ('Ovary ', 32), ('Other female genital system ', 33), ('Prostate ', 34), ('Testis ', 35), ('Penis ', 36), ('Other male genital o

In [31]:
# Show the (label, code) pairs of the prevalence-duration mapping.
PrevalenceDuration.items()

dict_items([('2-year duration', '2'), ('5-year duration', '5'), ('10-year duration', '10'), ('20-year duration', '20')])

In [32]:
# Show the (label, code) pairs of the characteristics mapping.
Characteristics.items()

dict_items([('Total number of cancer cases', 'T'), ('Cancer prevalence proportion', 'P'), ('Low 95% confidence interval, cancer prevalence', 'L'), ('High 95% confidence interval, cancer prevalence', 'H')])

In [33]:
# Collect the per-column mappings under one dictionary, keyed by a short
# upper-case name for each column.
ColumnNamesDict = {
    'YEAR': RefDate,
    'GEO': GeoName,
    'AGE': AgeGroup,
    'SEX': gender,
    'CANCER_NAMES': CancerNames,
    'PREVALENCE_DURATION': PrevalenceDuration,
    'CHARACTERISTICS': Characteristics,
}


In [39]:
import pickle as pkl

# Serialise the collected mappings. The with-block closes the file even if
# pickling raises, which the manual open()/close() pair did not guarantee.
with open("MasterDict.pickle", 'wb') as file:
    pkl.dump(ColumnNamesDict, file)

In [41]:
# Read the pickle back and display it to check the round trip. The with-block
# closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("MasterDict.pickle", 'rb') as out:
    Dict = pkl.load(out)
Dict

{'YEAR': {1994: 1994,
  1995: 1995,
  1996: 1996,
  1997: 1997,
  1998: 1998,
  1999: 1999,
  2000: 2000,
  2001: 2001,
  2002: 2002,
  2003: 2003,
  2004: 2004,
  2005: 2005,
  2006: 2006,
  2007: 2007,
  2008: 2008,
  2009: 2009,
  2010: 2010,
  2011: 2011,
  2012: 2012,
  2013: 2013,
  2014: 2014,
  2015: 2015},
 'GEO': {'Canada': 'Canada',
  'Canada (excluding Quebec)': 'Canada (excluding Quebec)',
  'Newfoundland and Labrador': 'Newfoundland and Labrador',
  'Prince Edward Island': 'Prince Edward Island',
  'Nova Scotia': 'Nova Scotia',
  'New Brunswick': 'New Brunswick',
  'Quebec': 'Quebec',
  'Ontario': 'Ontario',
  'Manitoba': 'Manitoba',
  'Saskatchewan': 'Saskatchewan',
  'Alberta': 'Alberta',
  'British Columbia': 'British Columbia',
  'Yukon': 'Yukon',
  'Northwest Territories': 'Northwest Territories',
  'Nunavut': 'Nunavut'},
 'AGE': {'Total, all ages': 0,
  '0-19 years': 1,
  '20-29 years': 2,
  '30-39 years': 3,
  '40-49 years': 4,
  '50-59 years': 5,
  '60-69 years': 

In [5]:
# Preview the first five rows of the frame.
CancerData.head()

Unnamed: 0.1,Unnamed: 0,REF_DATE,GEO,Age Group,Sex,Primary types of cancer (ICD-O-3),Prevalence duration,Characteristics,VALUE
0,0,1994,Canada,0,B,0,2,T,165090.0
1,1,1994,Canada,0,B,0,2,P,572.4
2,2,1994,Canada,0,B,0,2,L,569.6
3,3,1994,Canada,0,B,0,2,H,575.1
4,4,1994,Canada,0,B,1,2,T,1165.0


## More formatting on the data

In [10]:
# Keep only the rows whose "Characteristics" code is 'T' or 'P'.
df = CancerData.loc[CancerData['Characteristics'].isin(['T', 'P'])]

In [11]:
# Preview the first five rows of the filtered frame.
df.head()

Unnamed: 0.1,Unnamed: 0,REF_DATE,GEO,Age Group,Sex,Primary types of cancer (ICD-O-3),Prevalence duration,Characteristics,VALUE
0,0,1994,Canada,0,B,0,2,T,165090.0
1,1,1994,Canada,0,B,0,2,P,572.4
4,4,1994,Canada,0,B,1,2,T,1165.0
5,5,1994,Canada,0,B,1,2,P,4.0
8,8,1994,Canada,0,B,2,2,T,820.0


In [15]:
import pickle as pkl

In [16]:
# Load the saved mappings from the data directory. The with-block closes the
# file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("../data/MasterDict.pickle", 'rb') as pickle_file:
    MasterDict = pkl.load(pickle_file)

In [37]:
# Overwrite the characteristics mapping with only the 'T' and 'P' entries.
MasterDict['CHARACTERISTICS'] = {
    'Total number of cancer cases': 'T',
    'Cancer prevalence proportion': 'P',
}

In [38]:
# Build the reverse lookup: for every column, swap each mapping's keys and
# values so that a code leads back to its original label.
MasterDictInverse = {
    column: {code: label for label, code in mapping.items()}
    for column, mapping in MasterDict.items()
}



In [33]:
# Display the reverse (code -> label) mappings.
MasterDictInverse

{'YEAR': {1994: 1994,
  1995: 1995,
  1996: 1996,
  1997: 1997,
  1998: 1998,
  1999: 1999,
  2000: 2000,
  2001: 2001,
  2002: 2002,
  2003: 2003,
  2004: 2004,
  2005: 2005,
  2006: 2006,
  2007: 2007,
  2008: 2008,
  2009: 2009,
  2010: 2010,
  2011: 2011,
  2012: 2012,
  2013: 2013,
  2014: 2014,
  2015: 2015},
 'GEO': {'Canada': 'Canada',
  'Canada (excluding Quebec)': 'Canada (excluding Quebec)',
  'Newfoundland and Labrador': 'Newfoundland and Labrador',
  'Prince Edward Island': 'Prince Edward Island',
  'Nova Scotia': 'Nova Scotia',
  'New Brunswick': 'New Brunswick',
  'Quebec': 'Quebec',
  'Ontario': 'Ontario',
  'Manitoba': 'Manitoba',
  'Saskatchewan': 'Saskatchewan',
  'Alberta': 'Alberta',
  'British Columbia': 'British Columbia',
  'Yukon': 'Yukon',
  'Northwest Territories': 'Northwest Territories',
  'Nunavut': 'Nunavut'},
 'AGE': {0: 'Total, all ages',
  1: '0-19 years',
  2: '20-29 years',
  3: '30-39 years',
  4: '40-49 years',
  5: '50-59 years',
  6: '60-69 years

In [45]:
# Serialise the reverse mappings. The with-block closes the file even if
# pickling raises, which the manual open()/close() pair did not guarantee.
with open("MasterDictInverse.pickle", 'wb') as file:
    pkl.dump(MasterDictInverse, file)

In [31]:
# Write the filtered frame to CSV. index=False keeps the row index out of the
# file; otherwise it is stored as yet another unnamed column that shows up as
# 'Unnamed: 0' / 'Unnamed: 0.1' when the file is read back.
df.to_csv("Cancer_Data_PreProcessed_v2.csv", index=False)

In [46]:
# Save the updated mappings. The with-block closes the file even if pickling
# raises, which the manual open()/close() pair did not guarantee.
# NOTE(review): this writes to the working directory, while the dictionary was
# loaded from ../data/MasterDict.pickle; confirm which location is intended.
with open("MasterDict.pickle", 'wb') as file:
    pkl.dump(MasterDict, file)

In [32]:
import zipfile

# Pack the filtered CSV into a zip archive in the data directory. The
# with-block closes the archive, which is what writes its central directory;
# without an explicit close the file may be left incomplete. ZIP_DEFLATED
# compresses the member (the default only stores it uncompressed).
with zipfile.ZipFile("../data/Cancer_Data_PreProcessed_v2.zip", 'w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write("Cancer_Data_PreProcessed_v2.csv")

In [30]:
# Show the current working directory; the relative paths used in this notebook
# are resolved against it. (Presumably relies on IPython's automagic `pwd`.)
pwd

'/home/siro/ML/Dash/DeepRegina/Canada_Cancer_Data/notebooks'

In [44]:
# Display the reverse mapping for the characteristics codes.
MasterDictInverse['CHARACTERISTICS']

{'T': 'Total number of cancer cases', 'P': 'Cancer prevalence proportion'}