In [1]:
import json
import os

import matplotlib
import pandas
import tensorflow as tf
from pandas.io.json import json_normalize

In [2]:
# Location of the cleaned case-study export. The default is the original
# absolute path; set the CASE_STUDIES_JSON environment variable to point the
# notebook at a copy stored elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    'CASE_STUDIES_JSON',
    '/mnt/c/Code/cancer-survival-rates/data/processed/case_studies_clean.json',
)
df = pandas.read_json(DATA_PATH)

In [3]:
def values(key, frame=None):
    """Count how often each distinct value occurs in one column.

    Args:
        key: Label of the column to summarise.
        frame: DataFrame to read from. When omitted, the notebook-level
            ``df`` is used, so existing ``values(key)`` calls are unaffected.

    Returns:
        The ``Series.value_counts()`` result for the selected column.
    """
    source = df if frame is None else frame
    return source[key].value_counts()
    
def desc(key, frame=None):
    """Return summary statistics for one column.

    Args:
        key: Label of the column to summarise.
        frame: DataFrame to read from. When omitted, the notebook-level
            ``df`` is used, so existing ``desc(key)`` calls are unaffected.

    Returns:
        The ``Series.describe()`` result for the selected column.
    """
    source = df if frame is None else frame
    return source[key].describe()

def nulls(key, frame=None):
    """Count missing versus present entries in one column.

    Args:
        key: Label of the column to inspect.
        frame: DataFrame to read from. When omitted, the notebook-level
            ``df`` is used, so existing ``nulls(key)`` calls are unaffected.

    Returns:
        A Series indexed by boolean: the ``True`` entry is the number of null
        cells and the ``False`` entry the number of non-null cells. A label is
        absent when no cell falls into that group.
    """
    source = df if frame is None else frame
    return source[key].isnull().value_counts()

In [45]:
# Display the column labels of the loaded frame.
df.columns

Index(['disease_type', 'primary_site', 'gender', 'race', 'vital_status',
       'ethnicity', 'days_to_death', 'days_to_birth', 'year_of_birth',
       'cause_of_death', 'year_of_diagnosis', 'age_at_diagnosis',
       'days_to_last_follow_up', 'tumor_grade', 'days_to_recurrence',
       'prior_malignancy'],
      dtype='object')

In [42]:
# Rows where both days_to_death and days_to_last_follow_up are null.
missing_death = df['days_to_death'].isnull()
missing_follow_up = df['days_to_last_follow_up'].isnull()
df.loc[missing_death & missing_follow_up]

Unnamed: 0,disease_type,primary_site,gender,state,race,vital_status,ethnicity,age_is_obfuscated,days_to_death,days_to_birth,year_of_birth,year_of_diagnosis,age_at_diagnosis,days_to_last_follow_up,tumor_grade,days_to_diagnosis,days_to_recurrence,prior_malignancy
0,"Epithelial Neoplasms, NOS",Breast,female,released,not reported,Not Reported,not reported,,,,,,14656.0,,Not Reported,,,
1,Adenomas and Adenocarcinomas,Bronchus and lung,male,released,white,Not Reported,not hispanic or latino,,,-18993.0,,,,,Not Reported,,,
2,Meningiomas,Meninges,male,released,white,Not Reported,not hispanic or latino,,,-21915.0,,,,,Not Reported,,,
3,Ductal and Lobular Neoplasms,Breast,female,released,other,Not Reported,hispanic or latino,,,-22645.0,,,,,Not Reported,,,
4,Adenomas and Adenocarcinomas,Colon,male,released,white,Not Reported,not hispanic or latino,,,-21915.0,,,,,Not Reported,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83479,Complex Mixed and Stromal Neoplasms,"Uterus, NOS",,,,,,,,,,,,,,,,
83480,Adenomas and Adenocarcinomas,Thyroid gland,,,,,,,,,,,,,,,,
83481,Ductal and Lobular Neoplasms,Breast,,,,,,,,,,,,,,,,
83482,Adenomas and Adenocarcinomas,"Uterus, NOS",,,,,,,,,,,,,,,,


In [43]:
# Frequency of each vital_status value, via the values() helper.
values('vital_status')

Not Reported    62937
Alive           13274
Dead             6118
Unknown           358
Name: vital_status, dtype: int64

In [46]:
# For every column, print its label, then its null / non-null tally from
# nulls(), then a blank separator line.
for column in df.columns:
    print(column, nulls(column), '', sep='\n')

disease_type
False    83085
Name: disease_type, dtype: int64

primary_site
False    83085
Name: primary_site, dtype: int64

gender
False    82687
True       398
Name: gender, dtype: int64

race
False    82687
True       398
Name: race, dtype: int64

vital_status
False    82687
True       398
Name: vital_status, dtype: int64

ethnicity
False    82687
True       398
Name: ethnicity, dtype: int64

days_to_death
True     78315
False     4770
Name: days_to_death, dtype: int64

days_to_birth
False    55988
True     27097
Name: days_to_birth, dtype: int64

year_of_birth
True     71684
False    11401
Name: year_of_birth, dtype: int64

cause_of_death
True     82885
False      200
Name: cause_of_death, dtype: int64

year_of_diagnosis
True     71566
False    11519
Name: year_of_diagnosis, dtype: int64

age_at_diagnosis
True     45584
False    37501
Name: age_at_diagnosis, dtype: int64

days_to_last_follow_up
True     66400
False    16685
Name: days_to_last_follow_up, dtype: int64

tumor_grade
Fal

In [49]:
# Frequency of each primary_site value, via the values() helper.
values('primary_site')

Bronchus and lung                                                         11852
Hematopoietic and reticuloendothelial systems                              8899
Breast                                                                     8896
Colon                                                                      6885
Spinal cord, cranial nerves, and other parts of central nervous system     3850
                                                                          ...  
Trachea                                                                       7
Palate                                                                        5
Other and unspecified male genital organs                                     1
Renal pelvis                                                                  1
Blood                                                                         1
Name: primary_site, Length: 68, dtype: int64

In [58]:
# Tally the primary_site values that are not already one of the major-site
# display labels, i.e. the raw site names that still need a mapping.
# The comparison is made against the dict's values (labels such as 'Lung');
# passing the dict itself to isin() would test against its keys (such as
# 'lung') instead of the labels.
# NOTE(review): major_sites is assigned in a later cell, so that cell must be
# run before this one on a fresh kernel.
needs_mapping = ~df['primary_site'].isin(list(major_sites.values()))
df.loc[needs_mapping, 'primary_site'].value_counts()

Bronchus and lung                                                         11852
Hematopoietic and reticuloendothelial systems                              8899
Colon                                                                      6885
Spinal cord, cranial nerves, and other parts of central nervous system     3850
Unknown                                                                    3116
Prostate gland                                                             2225
Uterus, NOS                                                                1882
Liver and intrahepatic bile ducts                                          1606
Connective, subcutaneous and other soft tissues                            1583
Thyroid gland                                                              1440
Rectum                                                                     1104
Other and ill-defined sites                                                1074
Corpus uteri                            

In [64]:
# Major-site categories: short identifier -> display label.
# NOTE(review): 'colorectoral' / 'Colorectoral' looks like a misspelling of
# "colorectal"; it is kept as-is because sites_dict looks entries up by this key.
major_sites = dict(
    adrenal_gland='Adrenal gland',
    bile_duct='Bile duct',
    bladder='Bladder',
    blood='Blood',
    bone='Bone',
    bone_marrow='Bone Marrow',
    brain='Brain',
    breast='Breast',
    cervix='Cervix',
    colorectoral='Colorectoral',
    esophagus='Esophagus',
    eye='Eye',
    head_and_neck='Head and neck',
    kidney='Kidney',
    liver='Liver',
    lung='Lung',
    lymph_nodes='Lymph nodes',
    nervous_system='Nervous system',
    ovary='Ovary',
    pancreas='Pancreas',
    pleura='Pleura',
    prostate='Prostate',
    skin='Skin',
    soft_tissue='Soft tissue',
    stomach='Stomach',
    testis='Testis',
    thymus='Thymus',
    thyroid='Thyroid',
    uterus='Uterus',
    other='Other',
)

In [66]:
# Spot-check: look up the display label stored under the 'lung' key.
major_sites['lung']

'Lung'

In [68]:
# Raw primary_site value -> key of the major_sites entry it is folded into.
_site_to_major_key = {
    'Bronchus and lung': 'lung',
    'Hematopoietic and reticuloendothelial systems': 'other',  # maybe bone marrow? partially lymph related
    'Colon': 'colorectoral',
    'Spinal cord, cranial nerves, and other parts of central nervous system': 'nervous_system',
    'Unknown': 'other',
    'Prostate gland': 'prostate',
    'Uterus, NOS': 'uterus',
    'Liver and intrahepatic bile ducts': 'liver',
    'Connective, subcutaneous and other soft tissues': 'soft_tissue',
    'Thyroid gland': 'thyroid',
    'Rectum': 'colorectoral',
    'Other and ill-defined sites': 'other',
    'Corpus uteri': 'uterus',
    'Other and ill-defined digestive organs': 'other',
    'Heart, mediastinum, and pleura': 'other',
    'Cervix uteri': 'cervix',
    'Other and unspecified major salivary glands': 'other',
    'Lymph Nodes': 'lymph_nodes',
    'Bones, joints and articular cartilage of other and unspecified sites': 'bone',
    'Retroperitoneum and peritoneum': 'other',  # abdominal
    'Other and ill-defined sites in lip, oral cavity and pharynx': 'head_and_neck',
    'Peripheral nerves and autonomic nervous system': 'nervous_system',
    'Bones, joints and articular cartilage of limbs': 'bone',
    'Small intestine': 'other',
    'Gallbladder': 'other',
    'Meninges': 'other',
    'Not Reported': 'other',
    'Anus and anal canal': 'other',
    'Eye and adnexa': 'eye',
    'Other and unspecified parts of biliary tract': 'other',  # liver? bile duct?
    'Other and unspecified urinary organs': 'other',
    'Oropharynx': 'head_and_neck',
    'Other endocrine glands and related structures': 'other',
    'Larynx': 'head_and_neck',
    'Other and unspecified female genital organs': 'other',
    'Other and unspecified parts of tongue': 'head_and_neck',
    'Nasopharynx': 'head_and_neck',
    'Rectosigmoid junction': 'colorectoral',
    'Vagina': 'other',
    'Floor of mouth': 'head_and_neck',
    'Tonsil': 'head_and_neck',
    'Other and unspecified parts of mouth': 'head_and_neck',
    'Nasal cavity and middle ear': 'head_and_neck',
    'Penis': 'other',
    'Hypopharynx': 'head_and_neck',
    'Base of tongue': 'head_and_neck',
    'Ureter': 'other',
    'Gum': 'head_and_neck',
    'Vulva': 'other',
    'Lip': 'other',
    'Trachea': 'other',
    'Palate': 'other',
    'Other and unspecified male genital organs': 'other',
    'Renal pelvis': 'kidney',
}

# Resolve each key to its display label so that sites_dict maps a raw
# primary_site value straight to the label stored in major_sites.
sites_dict = {
    raw_site: major_sites[major_key]
    for raw_site, major_key in _site_to_major_key.items()
}

In [70]:
def convert_to_major(value):
    """Translate a raw site name into its major-site label.

    Looks ``value`` up in ``sites_dict``; a name with no entry there is
    handed back unchanged.
    """
    return sites_dict.get(value, value)

In [72]:
# Derive the major-site label for every row from its primary_site value and
# store it on the frame as the 'major_site' column.
major_site_labels = df['primary_site'].apply(convert_to_major)
df['major_site'] = major_site_labels

In [75]:
# Show any rows whose major_site is not one of the labels in major_sites;
# an empty result means every primary_site value was mapped.
has_known_label = df['major_site'].isin(major_sites.values())
df.loc[~has_known_label]

Unnamed: 0,disease_type,primary_site,gender,race,vital_status,ethnicity,days_to_death,days_to_birth,year_of_birth,cause_of_death,year_of_diagnosis,age_at_diagnosis,days_to_last_follow_up,tumor_grade,days_to_recurrence,prior_malignancy,major_site
