In [1]:
import numpy as np
import pandas as pd

In [2]:
tumors_oslo = pd.read_csv('/data/projects/TMOR/num_tumors_oslo.csv')
tumors_standford = pd.read_csv('/data/projects/TMOR/num_tumors_standford.csv')
oslo = pd.read_csv("/data/projects/TMOR/OsloMetsDemographics.csv")
standford = pd.read_csv("/data/projects/TMOR/stanford_labels.csv")

## Oslo Data

In [3]:
tumors_oslo

Unnamed: 0,SubjectNumber,Number_tumors
0,Subject01,2
1,Subject02,6
2,Subject03,4
3,Subject04,1
4,Subject05,2
...,...,...
62,Subject63,4
63,Subject64,2
64,Subject65,4
65,Subject66,1


In [4]:
tumors_oslo.dtypes

SubjectNumber    object
Number_tumors     int64
dtype: object

In [5]:
tumors_oslo[tumors_oslo['Number_tumors'].isna() == True] ## check for any possible missing values

Unnamed: 0,SubjectNumber,Number_tumors


In [6]:
oslo

Unnamed: 0,SubjectNumber,StudyID,Gender,Age,PrimaryCancer,# mets,ROI status,Pre treatment,Number of BMs
0,Subject01,,Male,58.0,mal.mel,2,,"WB, SRS",2.0
1,Subject02,,Female,50.0,ca.pulm,7,,WB,6.0
2,Subject03,,Male,64.0,mal.mel,4,,"WB,SRS",4.0
3,Subject04,,Male,56.0,mal.mel,1,,No,1.0
4,Subject05,,Female,54.0,mal.mel,1,?,"SRS,PO",2.0
...,...,...,...,...,...,...,...,...,...
67,,,,,,,,,
68,,ROI complete,,,,,,,
69,,Missing ROI outside VAI slab,,,,,,,
70,,Missing all ROIs,,,,,,,


In [7]:
oslo.dtypes

SubjectNumber     object
StudyID           object
Gender            object
Age              float64
PrimaryCancer     object
# mets            object
ROI status        object
Pre treatment     object
Number of BMs    float64
dtype: object

In [8]:
oslo[oslo['Gender'].isna() == True]

Unnamed: 0,SubjectNumber,StudyID,Gender,Age,PrimaryCancer,# mets,ROI status,Pre treatment,Number of BMs
66,Subject67,,,,,?,,,3.0
67,,,,,,,,,
68,,ROI complete,,,,,,,
69,,Missing ROI outside VAI slab,,,,,,,
70,,Missing all ROIs,,,,,,,
71,,Yet to be included,,,,,,,


In [9]:
oslo[oslo['Age'].isna() == True]

Unnamed: 0,SubjectNumber,StudyID,Gender,Age,PrimaryCancer,# mets,ROI status,Pre treatment,Number of BMs
66,Subject67,,,,,?,,,3.0
67,,,,,,,,,
68,,ROI complete,,,,,,,
69,,Missing ROI outside VAI slab,,,,,,,
70,,Missing all ROIs,,,,,,,
71,,Yet to be included,,,,,,,


In [10]:
oslo = oslo.iloc[:66] # we are missing info on sex and age of the Subject 67. Onwards that patient we cannot compute any other value since there are not subjects

In [11]:
oslo['Age'] = oslo['Age'].astype(int)
oslo['Number of BMs'] = oslo['Number of BMs'].astype(int)
oslo.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oslo['Age'] = oslo['Age'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oslo['Number of BMs'] = oslo['Number of BMs'].astype(int)


SubjectNumber    object
StudyID          object
Gender           object
Age               int64
PrimaryCancer    object
# mets           object
ROI status       object
Pre treatment    object
Number of BMs     int64
dtype: object

In [12]:
oslo_labels = pd.merge(oslo, tumors_oslo, how="left", on=["SubjectNumber"])
oslo_labels

Unnamed: 0,SubjectNumber,StudyID,Gender,Age,PrimaryCancer,# mets,ROI status,Pre treatment,Number of BMs,Number_tumors
0,Subject01,,Male,58,mal.mel,2,,"WB, SRS",2,2.0
1,Subject02,,Female,50,ca.pulm,7,,WB,6,6.0
2,Subject03,,Male,64,mal.mel,4,,"WB,SRS",4,4.0
3,Subject04,,Male,56,mal.mel,1,,No,1,1.0
4,Subject05,,Female,54,mal.mel,1,?,"SRS,PO",2,2.0
...,...,...,...,...,...,...,...,...,...,...
61,Subject62,,Male,77,ca.pulm,?,,,1,1.0
62,Subject63,,Male,32,mal.mel,?,,,4,4.0
63,Subject64,,Female,63,ca.pulm,?,,,2,2.0
64,Subject65,,Female,65,ca.pulm,?,,,4,4.0


In [13]:
oslo_labels.dtypes

SubjectNumber     object
StudyID           object
Gender            object
Age                int64
PrimaryCancer     object
# mets            object
ROI status        object
Pre treatment     object
Number of BMs      int64
Number_tumors    float64
dtype: object

In [14]:
oslo_labels[oslo_labels['Number_tumors'].isna() == True] #due to problems of the SubjectId for Subject57 and Subject58 therefore we will drop it and so the other columns we dont need

Unnamed: 0,SubjectNumber,StudyID,Gender,Age,PrimaryCancer,# mets,ROI status,Pre treatment,Number of BMs,Number_tumors
56,Subject57,,Female,52,ca.pulm,?,,,1,
57,Subject58,,Male,74,ca.pulm,?,,,1,


In [15]:
oslo_labels = oslo_labels.drop(['Number_tumors','StudyID','# mets', 'ROI status', 'Pre treatment'], axis=1)
oslo_labels

Unnamed: 0,SubjectNumber,Gender,Age,PrimaryCancer,Number of BMs
0,Subject01,Male,58,mal.mel,2
1,Subject02,Female,50,ca.pulm,6
2,Subject03,Male,64,mal.mel,4
3,Subject04,Male,56,mal.mel,1
4,Subject05,Female,54,mal.mel,2
...,...,...,...,...,...
61,Subject62,Male,77,ca.pulm,1
62,Subject63,Male,32,mal.mel,4
63,Subject64,Female,63,ca.pulm,2
64,Subject65,Female,65,ca.pulm,4


## lets create some dummy variables for these table, dummy variables based on the primary cancer site(mal.mel and ca.pulm) and sex (Male and Female)

In [16]:
oslo_labels.loc[oslo_labels['Gender'] == 'Male', 'Gender'] = 'M'
oslo_labels.loc[oslo_labels['Gender'] == 'Female', 'Gender'] = 'F'

oslo_labels.loc[oslo_labels['PrimaryCancer'] == 'mal.mel', 'PrimaryCancer'] = 'melanoma'
oslo_labels.loc[oslo_labels['PrimaryCancer'] == 'ca.pulm', 'PrimaryCancer'] = 'lung'
oslo_labels['Gender_Labels'] = oslo_labels["Gender"]
oslo_labels['PrimaryCancer_Labels'] = oslo_labels["PrimaryCancer"]
oslo_labels

Unnamed: 0,SubjectNumber,Gender,Age,PrimaryCancer,Number of BMs,Gender_Labels,PrimaryCancer_Labels
0,Subject01,M,58,melanoma,2,M,melanoma
1,Subject02,F,50,lung,6,F,lung
2,Subject03,M,64,melanoma,4,M,melanoma
3,Subject04,M,56,melanoma,1,M,melanoma
4,Subject05,F,54,melanoma,2,F,melanoma
...,...,...,...,...,...,...,...
61,Subject62,M,77,lung,1,M,lung
62,Subject63,M,32,melanoma,4,M,melanoma
63,Subject64,F,63,lung,2,F,lung
64,Subject65,F,65,lung,4,F,lung


In [17]:
oslo_labels = pd.get_dummies(oslo_labels, columns=['PrimaryCancer_Labels'])
oslo_labels = pd.get_dummies(oslo_labels, columns=['Gender_Labels'])
oslo_labels['PrimaryCancer_Labels_lung'] = oslo_labels['PrimaryCancer_Labels_lung'].astype(int)
oslo_labels['PrimaryCancer_Labels_melanoma'] = oslo_labels['PrimaryCancer_Labels_melanoma'].astype(int)
oslo_labels['Gender_Labels_F'] = oslo_labels['Gender_Labels_F'].astype(int)
oslo_labels['Gender_Labels_M'] = oslo_labels['Gender_Labels_M'].astype(int)
oslo_labels

Unnamed: 0,SubjectNumber,Gender,Age,PrimaryCancer,Number of BMs,PrimaryCancer_Labels_lung,PrimaryCancer_Labels_melanoma,Gender_Labels_F,Gender_Labels_M
0,Subject01,M,58,melanoma,2,0,1,0,1
1,Subject02,F,50,lung,6,1,0,1,0
2,Subject03,M,64,melanoma,4,0,1,0,1
3,Subject04,M,56,melanoma,1,0,1,0,1
4,Subject05,F,54,melanoma,2,0,1,1,0
...,...,...,...,...,...,...,...,...,...
61,Subject62,M,77,lung,1,1,0,0,1
62,Subject63,M,32,melanoma,4,0,1,0,1
63,Subject64,F,63,lung,2,1,0,1,0
64,Subject65,F,65,lung,4,1,0,1,0


In [18]:
oslo_labels = oslo_labels.rename(columns={'PrimaryCancer_Labels_melanoma': 'Melanoma', 'PrimaryCancer_Labels_lung': 'Lung', 'Gender_Labels_F': 'F', 'Gender_Labels_M': 'M'})

In [19]:
def create_annotations(row):
    annotations = row['PrimaryCancer'].split(',')
    return len(annotations)

oslo_labels['number_annotations'] = oslo_labels.apply(create_annotations, axis=1)
oslo_labels

Unnamed: 0,SubjectNumber,Gender,Age,PrimaryCancer,Number of BMs,Lung,Melanoma,F,M,number_annotations
0,Subject01,M,58,melanoma,2,0,1,0,1,1
1,Subject02,F,50,lung,6,1,0,1,0,1
2,Subject03,M,64,melanoma,4,0,1,0,1,1
3,Subject04,M,56,melanoma,1,0,1,0,1,1
4,Subject05,F,54,melanoma,2,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
61,Subject62,M,77,lung,1,1,0,0,1,1
62,Subject63,M,32,melanoma,4,0,1,0,1,1
63,Subject64,F,63,lung,2,1,0,1,0,1
64,Subject65,F,65,lung,4,1,0,1,0,1


# Lets try move the columns a bit and create new columns that make up for 'breast' and 'other' in the standford dataset

In [20]:
oslo_labels['other'] = np.zeros(len(oslo_labels))
oslo_labels['breast'] = np.zeros(len(oslo_labels))
oslo_labels['other'] = oslo_labels['other'].astype(int)
oslo_labels['breast'] = oslo_labels['breast'].astype(int)
oslo_labels

Unnamed: 0,SubjectNumber,Gender,Age,PrimaryCancer,Number of BMs,Lung,Melanoma,F,M,number_annotations,other,breast
0,Subject01,M,58,melanoma,2,0,1,0,1,1,0,0
1,Subject02,F,50,lung,6,1,0,1,0,1,0,0
2,Subject03,M,64,melanoma,4,0,1,0,1,1,0,0
3,Subject04,M,56,melanoma,1,0,1,0,1,1,0,0
4,Subject05,F,54,melanoma,2,0,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
61,Subject62,M,77,lung,1,1,0,0,1,1,0,0
62,Subject63,M,32,melanoma,4,0,1,0,1,1,0,0
63,Subject64,F,63,lung,2,1,0,1,0,1,0,0
64,Subject65,F,65,lung,4,1,0,1,0,1,0,0


In [21]:
temp_cols = oslo_labels.columns.tolist()
index_numberofbms = oslo_labels.columns.get_loc('Number of BMs') # obtained index from the column 'NUmber of BMs'
index_F = oslo_labels.columns.get_loc('F')
index_M = oslo_labels.columns.get_loc('M')
index_Gender = oslo_labels.columns.get_loc('Gender')
index_primcancer = oslo_labels.columns.get_loc('PrimaryCancer')
index_numannotations = oslo_labels.columns.get_loc('number_annotations')
index_melanoma = oslo_labels.columns.get_loc('Melanoma')
index_age = oslo_labels.columns.get_loc('Age')
index_other = oslo_labels.columns.get_loc('other')
index_breast = oslo_labels.columns.get_loc('breast')
index_lung = oslo_labels.columns.get_loc('Lung')


new_cols = temp_cols[0:1] + temp_cols[index_numberofbms:index_numberofbms + 1]  + temp_cols[index_age:index_age + 1] + temp_cols[index_numannotations:index_numannotations + 1] + temp_cols[index_Gender:index_Gender + 1] + temp_cols[index_F:index_M + 1] + temp_cols[index_primcancer:index_primcancer+1] + temp_cols[index_breast:index_breast+1] + temp_cols[index_lung:index_lung+1] + temp_cols[index_melanoma:index_melanoma+1] + temp_cols[index_other:index_other+1]

new_cols

['SubjectNumber',
 'Number of BMs',
 'Age',
 'number_annotations',
 'Gender',
 'F',
 'M',
 'PrimaryCancer',
 'breast',
 'Lung',
 'Melanoma',
 'other']

In [22]:
oslo_labels = oslo_labels[new_cols]
oslo_labels

Unnamed: 0,SubjectNumber,Number of BMs,Age,number_annotations,Gender,F,M,PrimaryCancer,breast,Lung,Melanoma,other
0,Subject01,2,58,1,M,0,1,melanoma,0,0,1,0
1,Subject02,6,50,1,F,1,0,lung,0,1,0,0
2,Subject03,4,64,1,M,0,1,melanoma,0,0,1,0
3,Subject04,1,56,1,M,0,1,melanoma,0,0,1,0
4,Subject05,2,54,1,F,1,0,melanoma,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
61,Subject62,1,77,1,M,0,1,lung,0,1,0,0
62,Subject63,4,32,1,M,0,1,melanoma,0,0,1,0
63,Subject64,2,63,1,F,1,0,lung,0,1,0,0
64,Subject65,4,65,1,F,1,0,lung,0,1,0,0


In [23]:
oslo_labels = oslo_labels.rename(columns={'Number of BMs': 'Number_tumors', 'Gender': 'Sex', 'PrimaryCancer': 'labels', 'Lung': 'lung', 'Melanoma': 'melanoma'})
oslo_labels

Unnamed: 0,SubjectNumber,Number_tumors,Age,number_annotations,Sex,F,M,labels,breast,lung,melanoma,other
0,Subject01,2,58,1,M,0,1,melanoma,0,0,1,0
1,Subject02,6,50,1,F,1,0,lung,0,1,0,0
2,Subject03,4,64,1,M,0,1,melanoma,0,0,1,0
3,Subject04,1,56,1,M,0,1,melanoma,0,0,1,0
4,Subject05,2,54,1,F,1,0,melanoma,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
61,Subject62,1,77,1,M,0,1,lung,0,1,0,0
62,Subject63,4,32,1,M,0,1,melanoma,0,0,1,0
63,Subject64,2,63,1,F,1,0,lung,0,1,0,0
64,Subject65,4,65,1,F,1,0,lung,0,1,0,0


In [24]:
oslo_labels.to_excel(r'oslo_labels.xlsx', index=False)

## Standford data set clean and creation  of dummy variables

In [3]:
tumors_standford

Unnamed: 0,Patient_ID,Number_tumors
0,5,3
1,9,13
2,10,3
3,11,3
4,13,2
...,...,...
151,321,5
152,323,23
153,326,1
154,328,16


In [4]:
tumors_standford.dtypes

Patient_ID       int64
Number_tumors    int64
dtype: object

In [5]:
len(tumors_standford[tumors_standford['Number_tumors'].isna() == True]) #we have no missing values fro the number of tumors

0

In [6]:
standford

Unnamed: 0,Patient_ID,"Scanner Type (GE, Emeryville, AMC)",Scanner Strength (T),Age,Sex,Primary cancer type,Number of tumors
0,5,GE,15,74,M,NSCLC,3
1,9,GE,3,29,F,breast ca,13
2,10,GE,3,69,F,NSCLC,3
3,11,GE,15,36,F,breast ca,3
4,13,GE,3,56,F,NSCLC,2
...,...,...,...,...,...,...,...
151,321,,,56,M,calvarial metastases,Number of tumors
152,323,,,72,M,non small cell lung carcinoma,Number of tumors
153,326,,,50,M,breast cancer,Number of tumors
154,328,,,53,M,lung cancer,Number of tumors


In [7]:
standford.dtypes

Patient_ID                             int64
Scanner Type (GE, Emeryville, AMC)    object
Scanner Strength (T)                  object
Age                                    int64
Sex                                   object
Primary cancer type                   object
Number of tumors                      object
dtype: object

In [30]:
len(standford[standford['Age'].isna() == True]) #no missing values for age

0

In [31]:
len(standford[standford['Sex'].isna() == True]) #no missing values for sex

0

In [32]:
labels_sex = ['F', 'M']
len(standford[standford['Sex'].isin(labels_sex) == False]) #there are no other values than the ones expected

0

In [33]:
standford['label_sex'] = standford['Sex']

In [34]:
standford = standford.drop(['Scanner Type (GE, Emeryville, AMC)', 'Scanner Strength (T)', 'Number of tumors'], axis = 1)
standford = pd.merge(standford, tumors_standford, how="left", on=["Patient_ID"])
standford

Unnamed: 0,Patient_ID,Age,Sex,Primary cancer type,label_sex,Number_tumors
0,5,74,M,NSCLC,M,3
1,9,29,F,breast ca,F,13
2,10,69,F,NSCLC,F,3
3,11,36,F,breast ca,F,3
4,13,56,F,NSCLC,F,2
...,...,...,...,...,...,...
151,321,56,M,calvarial metastases,M,5
152,323,72,M,non small cell lung carcinoma,M,23
153,326,50,M,breast cancer,M,1
154,328,53,M,lung cancer,M,16


## Clean Standford dataset Primary cancer type column

In [35]:
standford.loc[standford['Primary cancer type'] == 'breast ca', 'Primary cancer type'] = 'breast_cancer'
standford.loc[standford['Primary cancer type'] == 'Breast ca', 'Primary cancer type'] = 'breast_cancer'
standford.loc[standford['Primary cancer type'] == 'small cell lung cancer', 'Primary cancer type'] = 'SCLC'
standford.loc[standford['Primary cancer type'] == 'squamous cell lung cancer', 'Primary cancer type'] = 'NSCLC'
standford.loc[standford['Primary cancer type'] == 'breast cancer', 'Primary cancer type'] = 'breast_cancer'
standford.loc[standford['Primary cancer type'] == 'Melanoma', 'Primary cancer type'] = 'melanoma'
standford.loc[standford['Primary cancer type'] == 'kidney ca', 'Primary cancer type'] = 'kidney'
standford.loc[standford['Primary cancer type'] == 'lung adenocarcinoma', 'Primary cancer type'] = 'NSCLC'
standford.loc[standford['Primary cancer type'] == 'Lung adenocarcinoma', 'Primary cancer type'] = 'NSCLC'
standford.loc[standford['Primary cancer type'] == 'uterine cancer', 'Primary cancer type'] = 'uterus'
standford.loc[standford['Primary cancer type'] == 'metastatic melanoma', 'Primary cancer type'] = 'melanoma'
standford.loc[standford['Primary cancer type'] == 'brain metastatic lesions', 'Primary cancer type'] = 'brain'
standford.loc[standford['Primary cancer type'] == 'metastatic NSCLC', 'Primary cancer type'] = 'NSCLC'
standford.loc[standford['Primary cancer type'] == 'brain metastases', 'Primary cancer type'] = 'brain'
standford.loc[standford['Primary cancer type'] == 'brain metastastic lesions', 'Primary cancer type'] = 'brain'
standford.loc[standford['Primary cancer type'] == 'brain metastatic lesions', 'Primary cancer type'] = 'brain'
standford.loc[standford['Primary cancer type'] == 'metastatic lung cancer', 'Primary cancer type'] = 'lung_cancer'
standford.loc[standford['Primary cancer type'] == 'lung cancer', 'Primary cancer type'] = 'lung_cancer'
standford.loc[standford['Primary cancer type'] == 'metastatic breast cancer', 'Primary cancer type'] = 'breast_cancer'
standford.loc[standford['Primary cancer type'] == 'non small cell lung carcinoma', 'Primary cancer type'] = 'NSCLC'
standford.loc[standford['Primary cancer type'] == 'Colon ca', 'Primary cancer type'] = 'colon'
standford.loc[standford['Primary cancer type'] == 'scc lungs', 'Primary cancer type'] = 'NSCLC'
standford.loc[standford['Primary cancer type'] == 'intercranial lung mets', 'Primary cancer type'] = 'lung_cancer'
standford.loc[standford['Primary cancer type'] == 'frontal lobe lung metastases', 'Primary cancer type'] = 'lung_cancer'
standford.loc[standford['Primary cancer type'] == 'squamous cell carcinoma of lung', 'Primary cancer type'] = 'lung_cancer'
standford.loc[standford['Primary cancer type'] == 'breast_cancer', 'Primary cancer type'] = 'breast'
standford.loc[standford['Primary cancer type'] == 'f/u multiple brain metastases', 'Primary cancer type'] = 'brain'
standford.loc[standford['Primary cancer type'] == 'lung_cancer', 'Primary cancer type'] = 'lung'
standford.loc[standford['Primary cancer type'] == 'NSCLC', 'Primary cancer type'] = 'lung'
standford.loc[standford['Primary cancer type'] == 'SCLC', 'Primary cancer type'] = 'lung'


In [36]:
#patients 297 (index 138) does not have a label nor someething similar. We are going to have delete this one patient
standford.iloc[138, :]

Patient_ID                   298
Age                           54
Sex                            M
Primary cancer type    COME BACK
label_sex                      M
Number_tumors                  3
Name: 138, dtype: object

In [37]:
standford = standford.drop(labels=138, axis = 0)

In [38]:
standford

Unnamed: 0,Patient_ID,Age,Sex,Primary cancer type,label_sex,Number_tumors
0,5,74,M,lung,M,3
1,9,29,F,breast,F,13
2,10,69,F,lung,F,3
3,11,36,F,breast,F,3
4,13,56,F,lung,F,2
...,...,...,...,...,...,...
151,321,56,M,calvarial metastases,M,5
152,323,72,M,lung,M,23
153,326,50,M,breast,M,1
154,328,53,M,lung,M,16


## we would like to know the number of annotations made for primary cancer type, if more than 1 it will be on the test set

In [39]:
def create_annotations(row):
    annotations = row['Primary cancer type'].split(',')
    return len(annotations)

standford['number_annotations'] = standford.apply(create_annotations, axis=1)
standford

Unnamed: 0,Patient_ID,Age,Sex,Primary cancer type,label_sex,Number_tumors,number_annotations
0,5,74,M,lung,M,3,1
1,9,29,F,breast,F,13,1
2,10,69,F,lung,F,3,1
3,11,36,F,breast,F,3,1
4,13,56,F,lung,F,2,1
...,...,...,...,...,...,...,...
151,321,56,M,calvarial metastases,M,5,1
152,323,72,M,lung,M,23,1
153,326,50,M,breast,M,1,1
154,328,53,M,lung,M,16,1


## create a new column with the same labels from Primary cancer type

In [40]:
# Create a list of values to check for
cancer_list = ['breast', 'lung', 'melanoma']

# Define a function to create the labels column
def create_labels(row):
    if row['Primary cancer type'] in cancer_list:
        return row['Primary cancer type']
    else:
        return 'other'

# Apply the function to create the labels column
standford['labels'] = standford.apply(create_labels, axis=1)
standford

Unnamed: 0,Patient_ID,Age,Sex,Primary cancer type,label_sex,Number_tumors,number_annotations,labels
0,5,74,M,lung,M,3,1,lung
1,9,29,F,breast,F,13,1,breast
2,10,69,F,lung,F,3,1,lung
3,11,36,F,breast,F,3,1,breast
4,13,56,F,lung,F,2,1,lung
...,...,...,...,...,...,...,...,...
151,321,56,M,calvarial metastases,M,5,1,other
152,323,72,M,lung,M,23,1,lung
153,326,50,M,breast,M,1,1,breast
154,328,53,M,lung,M,16,1,lung


In [41]:
standford['Labels'] = standford['labels']
standford

Unnamed: 0,Patient_ID,Age,Sex,Primary cancer type,label_sex,Number_tumors,number_annotations,labels,Labels
0,5,74,M,lung,M,3,1,lung,lung
1,9,29,F,breast,F,13,1,breast,breast
2,10,69,F,lung,F,3,1,lung,lung
3,11,36,F,breast,F,3,1,breast,breast
4,13,56,F,lung,F,2,1,lung,lung
...,...,...,...,...,...,...,...,...,...
151,321,56,M,calvarial metastases,M,5,1,other,other
152,323,72,M,lung,M,23,1,lung,lung
153,326,50,M,breast,M,1,1,breast,breast
154,328,53,M,lung,M,16,1,lung,lung


In [42]:
standford_dummy = pd.get_dummies(standford, columns=['Labels'])
standford_dummy['Labels_breast'] = standford_dummy['Labels_breast'].astype(int)
standford_dummy['Labels_lung'] = standford_dummy['Labels_lung'].astype(int)
standford_dummy['Labels_melanoma'] = standford_dummy['Labels_melanoma'].astype(int)
standford_dummy['Labels_other'] = standford_dummy['Labels_other'].astype(int)
standford_dummy = standford_dummy.rename(columns={'Labels_other': 'other', 'Labels_melanoma': 'melanoma', 'Labels_lung': 'lung', 'Labels_breast': 'breast'})
standford_dummy

Unnamed: 0,Patient_ID,Age,Sex,Primary cancer type,label_sex,Number_tumors,number_annotations,labels,breast,lung,melanoma,other
0,5,74,M,lung,M,3,1,lung,0,1,0,0
1,9,29,F,breast,F,13,1,breast,1,0,0,0
2,10,69,F,lung,F,3,1,lung,0,1,0,0
3,11,36,F,breast,F,3,1,breast,1,0,0,0
4,13,56,F,lung,F,2,1,lung,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
151,321,56,M,calvarial metastases,M,5,1,other,0,0,0,1
152,323,72,M,lung,M,23,1,lung,0,1,0,0
153,326,50,M,breast,M,1,1,breast,1,0,0,0
154,328,53,M,lung,M,16,1,lung,0,1,0,0


In [43]:
standford_dummy = pd.get_dummies(standford_dummy, columns=['label_sex'])
standford_dummy['label_sex_F'] = standford_dummy['label_sex_F'].astype(int)
standford_dummy['label_sex_M'] = standford_dummy['label_sex_M'].astype(int)

In [44]:
standford_dummy.rename(columns={'label_sex_F': 'F', 'label_sex_M': 'M'})
standford_dummy

Unnamed: 0,Patient_ID,Age,Sex,Primary cancer type,Number_tumors,number_annotations,labels,breast,lung,melanoma,other,label_sex_F,label_sex_M
0,5,74,M,lung,3,1,lung,0,1,0,0,0,1
1,9,29,F,breast,13,1,breast,1,0,0,0,1,0
2,10,69,F,lung,3,1,lung,0,1,0,0,1,0
3,11,36,F,breast,3,1,breast,1,0,0,0,1,0
4,13,56,F,lung,2,1,lung,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,321,56,M,calvarial metastases,5,1,other,0,0,0,1,0,1
152,323,72,M,lung,23,1,lung,0,1,0,0,0,1
153,326,50,M,breast,1,1,breast,1,0,0,0,0,1
154,328,53,M,lung,16,1,lung,0,1,0,0,0,1


In [45]:
standford_dummy = standford_dummy.drop('Primary cancer type', axis = 1)

In [46]:
standford_dummy

Unnamed: 0,Patient_ID,Age,Sex,Number_tumors,number_annotations,labels,breast,lung,melanoma,other,label_sex_F,label_sex_M
0,5,74,M,3,1,lung,0,1,0,0,0,1
1,9,29,F,13,1,breast,1,0,0,0,1,0
2,10,69,F,3,1,lung,0,1,0,0,1,0
3,11,36,F,3,1,breast,1,0,0,0,1,0
4,13,56,F,2,1,lung,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
151,321,56,M,5,1,other,0,0,0,1,0,1
152,323,72,M,23,1,lung,0,1,0,0,0,1
153,326,50,M,1,1,breast,1,0,0,0,0,1
154,328,53,M,16,1,lung,0,1,0,0,0,1


In [47]:
temp_cols = standford_dummy.columns.tolist()
index_F = standford_dummy.columns.get_loc('label_sex_F') #
index_sex = standford_dummy.columns.get_loc('Sex') #
index_labels = standford_dummy.columns.get_loc('labels') #
index_F = standford_dummy.columns.get_loc('label_sex_F')
index_age = standford_dummy.columns.get_loc('Age')
index_nrtumors = standford_dummy.columns.get_loc('Number_tumors')
index_annotations = standford_dummy.columns.get_loc('number_annotations')

new_cols = temp_cols[0:1] +  temp_cols[index_nrtumors:index_nrtumors + 1] +temp_cols[1:index_age + 1] + temp_cols[index_annotations:index_annotations+1] + temp_cols[index_sex:index_sex+1] + temp_cols[index_F:index_F+2] + temp_cols[index_labels:index_labels+5]
#new_cols = temp_cols[0:index_sex + 1] + temp_cols[index_F:] + temp_cols[index_sex + 1: index_F] #Moved the columns 'F' and 'M' close to gender and 'Lung and 'Melanomma at the end'
new_cols

['Patient_ID',
 'Number_tumors',
 'Age',
 'number_annotations',
 'Sex',
 'label_sex_F',
 'label_sex_M',
 'labels',
 'breast',
 'lung',
 'melanoma',
 'other']

In [48]:
standford_dummy = standford_dummy[new_cols]
standford_dummy = standford_dummy.rename(columns ={'label_sex_F':'F', 'label_sex_M':'M'})

standford_dummy

Unnamed: 0,Patient_ID,Number_tumors,Age,number_annotations,Sex,F,M,labels,breast,lung,melanoma,other
0,5,3,74,1,M,0,1,lung,0,1,0,0
1,9,13,29,1,F,1,0,breast,1,0,0,0
2,10,3,69,1,F,1,0,lung,0,1,0,0
3,11,3,36,1,F,1,0,breast,1,0,0,0
4,13,2,56,1,F,1,0,lung,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
151,321,5,56,1,M,0,1,other,0,0,0,1
152,323,23,72,1,M,0,1,lung,0,1,0,0
153,326,1,50,1,M,0,1,breast,1,0,0,0
154,328,16,53,1,M,0,1,lung,0,1,0,0


In [49]:
standford_dummy.to_excel(r'standford_labels.xlsx', index=False)

## Downloaded this dataframe to a xlsx format, remanes as labels_oslo and labels_standford that can be found in: /data/projects/TMOR/Final_labels/labels_oslo.xlsx so later I can open it and join it with the radiomic features by the patient number