## Linear Regression Neural Network Model Implementation Colorectal Adenocarcinoma

#### Initializing Libraries and Importing Packages

In [108]:
import pandas as pd
import numpy as py
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error # Performance metrics

#### Importing Colorectal Patient Dataset

In [109]:
# set our dataset equal to ds variable
ds = pd.read_csv('Colorectal Dataset Modified Garik Kazanjian.csv')

# view first 5 lines of dataset
ds.head()

Unnamed: 0,Patient ID,Diagnosis Age,Neoplasm Disease Stage American Joint Committee on Cancer Code,American Joint Committee on Cancer Publication Version Type,Aneuploidy Score,Buffa Hypoxia Score,TCGA PanCanAtlas Cancer Type Acronym,Cancer Type Detailed,Last Communication Contact from Initial Pathologic Diagnosis Date,Birth from Initial Pathologic Diagnosis Date,...,Ragnum Hypoxia Score,Sex,Tissue Prospective Collection Indicator,Tissue Retrospective Collection Indicator,Tissue Source Site Code,TMB (nonsynonymous),Tumor Disease Anatomic Site,Tumor Type,Patient Weight,Medicinal Treatment
0,TCGA-A6-2671,85,STAGE IV,6TH,25.0,,COAD,Colon Adenocarcinoma,1126.0,-31329.0,...,,Male,Yes,No,A6,4.066667,Colon,Colon Adenocarcinoma,67.2,Fluorouacil
1,TCGA-A6-2674,71,STAGE IV,6TH,24.0,,COAD,Mucinous Adenocarcinoma of the Colon and Rectum,1331.0,-26292.0,...,,Male,Yes,No,A6,7.666667,Colon,"Colon Adenocarcinoma, Mucinous Type",85.6,Fluorouacil
2,TCGA-A6-2676,75,STAGE IIB,6TH,1.0,,COAD,Colon Adenocarcinoma,711.0,-27403.0,...,,Female,Yes,No,A6,,Colon,Colon Adenocarcinoma,45.9,Fluorouacil
3,TCGA-A6-2677,68,STAGE IIIC,6TH,8.0,,COAD,Colon Adenocarcinoma,541.0,-25143.0,...,,Female,Yes,No,A6,7.766667,Colon,Colon Adenocarcinoma,55.2,Fluorouacil
4,TCGA-A6-2678,43,STAGE IIIB,6TH,7.0,,COAD,Colon Adenocarcinoma,1286.0,-16030.0,...,,Female,Yes,No,A6,1.733333,Colon,Colon Adenocarcinoma,96.1,Fluorouacil


#### Dataset Cleaning

In [110]:
# dropping all columns which are not included in our feature set
columns_drop = ['Patient ID', 'Oncotree Code', 'Progression Free Status', 'Tumor Type', 'Sex', 'Informed consent verified', 'Primary Lymph Node Presentation Assessment', 'International Classification of Diseases for Oncology, Third Edition ICD-O-3 Site Code', 'American Joint Committee on Cancer Publication Version Type', 'Cancer Type Detailed', 'International Classification of Diseases for Oncology, Third Edition ICD-O-3 Histology Code']
ds = ds.drop(columns=columns_drop)

# assigning each 'Cancer Stage Code'; COAD (colon/colon and rectum) 0; READ (rectum) 1
stage_dict = {
    '0:DiseaseFree': '0',
    '1:Recurred/Progressed': '1'}

# map each respective treatment with its number
ds['Disease Free Status'] = ds['Disease Free Status'].map(stage_dict)

# assigning each 'Cancer Stage Code'; COAD (colon/colon and rectum) 0; READ (rectum) 1
survival_dict = {
    '1:DEAD WITH TUMOR': '1',
    '0:ALIVE OR DEAD TUMOR FREE': '0'}

# map each respective treatment with its number
ds['Disease-specific Survival status'] = ds['Disease-specific Survival status'].map(survival_dict)


# assigning each 'Cancer Stage Code'; COAD (colon/colon and rectum) 0; READ (rectum) 1
gen_dict = {
    'EUR': '1',
    'EUR_ADMIX': '2',
    'AFR': '3',
    'AFR_ADMIX': '4'}

# map each respective treatment with its number
ds['Genetic Ancestry Label'] = ds['Genetic Ancestry Label'].map(gen_dict)




# assigning each 'Cancer Stage Code'; COAD (colon/colon and rectum) 0; READ (rectum) 1
acronym_dict = {
    'COAD': '0',
    'READ': '1'}

# map each respective treatment with its number
ds['TCGA PanCanAtlas Cancer Type Acronym'] = ds['TCGA PanCanAtlas Cancer Type Acronym'].map(acronym_dict)

# assigning each 'Cancer Stage Code' with an appropriate number according to its stage
stage_dict = {
    'STAGE I': '1',
    'STAGE II': '2',
    'STAGE IIA': '2.25',
    'STAGE IIB': '2.50',
    'STAGE IIC': '2.75',
    'STAGE III': '3',
    'STAGE IIIA': '3.25',
    'STAGE IIIB': '3.50',
    'STAGE IIIC': '3.75',
    'STAGE IV': '4',
    'STAGE IVA': '4.25',
    'STAGE IVB': '4.75',
    'NA': '0'}

# map each respective treatment with its number
ds['Neoplasm Disease Stage American Joint Committee on Cancer Code'] = ds['Medicinal Treatment'].map(stage_dict)

# assigning each treatment with an appropriate number according to its position within the column
treatment_dict = {
    'Fluorouacil': '1',
    'Leucovorin': '2',
    'Oxaliplatin': '3',
    'Bevacizumab': '4',
    'Radiation 1': '5'}

# map each respective treatment with its number
ds['Medicinal Treatment'] = ds['Medicinal Treatment'].map(treatment_dict)



# assigning each treatment with an appropriate number according to its position within the column
icd_dict = {
    'C80.1': '80.1',
    'C49.4': '49.4',
    'C20': '20',
    'C19': '19',
    'C18.9': '18.9',
    'C18.7': '18.7',
    'C18.6': '18.6',
    'C18.5': '18.5',
    'C18.4': '18.4',
    'C18.3': '18.3',
    'C18.2': '18.2',
    'C18.0': '18.0'}

# map each respective treatment with its number
ds['ICD-10 Classification'] = ds['ICD-10 Classification'].map(icd_dict)


# International Classification of Diseases for Oncology, Third Edition ICD-O-3 Histology Code

# assigning each treatment with an appropriate number according to its position within the column
hist_dict = {
    'C80.1': '80.1',
    'C49.4': '49.4',
    'C20': '20',
    'C19': '19',
    'C18.9': '18.9',
    'C18.7': '18.7',
    'C18.6': '18.6',
    'C18.5': '18.5',
    'C18.4': '18.4',
    'C18.3': '18.3',
    'C18.2': '18.2',
    'C18.0': '18.0'}

# map each respective treatment with its number
ds['ICD-10 Classification'] = ds['ICD-10 Classification'].map(hist_dict)




# assigning each treatment with an appropriate number according to its position within the column
neo_dict = {
    'Yes': '1',
    'No': '0'}

# map each respective treatment with its number
ds['New Neoplasm Event Post Initial Therapy Indicator'] = ds['New Neoplasm Event Post Initial Therapy Indicator'].map(neo_dict)

#Tissue Prospective Collection Indicator
# assigning each treatment with an appropriate number according to its position within the column
tiss_dict = {
    'Yes': '1',
    'No': '0'}

# map each respective treatment with its number
ds['Tissue Prospective Collection Indicator'] = ds['Tissue Prospective Collection Indicator'].map(tiss_dict)

# assigning each treatment with an appropriate number according to its position within the column
retro_dict = {
    'Yes': '1',
    'No': '0'}

# map each respective treatment with its number
ds['Tissue Retrospective Collection Indicator'] = ds['Tissue Retrospective Collection Indicator'].map(retro_dict)

# assigning each treatment with an appropriate number according to its position within the column
live_dict = {
    '1:DECEASED': '1',
    '0:LIVING': '0'}

# map each respective treatment with its number
ds['Overall Survival Status'] = ds['Overall Survival Status'].map(live_dict)




# assigning each treatment with an appropriate number according to its position within the column
mest_dict = {
    'M0': '0',
    'M1': '1',
    'M1A': '1.25',
    'M1B': '1.75',
    'MX': '5'}

# map each respective treatment with its number
ds['American Joint Committee on Cancer Metastasis Stage Code'] = ds['American Joint Committee on Cancer Metastasis Stage Code'].map(mest_dict)


# Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code
# assigning each treatment with an appropriate number according to its position within the column
lymh_dict = {
    'NX': '8',
    'N2B': '7',
    'N2A': '6',
    'N2': '5',
    'N1C': '4',
    'N1B': '3',
    'N1A': '2',
    'N1': '1',}

# map each respective treatment with its number
ds['Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code'] = ds['Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code'].map(lymh_dict)

# assigning each treatment with an appropriate number according to its position within the column
tum_dict = {
    'With Tumor': '0',
    'Tumor Free': '1',
    'NA': '2'}

# map each respective treatment with its number
ds['Person Neoplasm Cancer Status'] = ds['Person Neoplasm Cancer Status'].map(tum_dict)


# Progression Free Status

# assigning each treatment with an appropriate number according to its position within the column
#prog_dict = {
 #   '1:PROGRESSION': '1',
  #  '0:CENSORED': '0'}

# map each respective treatment with its number
#ds['Progression Free Status'] = ds['Progression Free Status'].map(prog_dict)

 # assigning each treatment with an appropriate number according to its position within the column
#sex_dict = {
 #   'Male': '1',
  #  'Female': '0'}

# map each respective treatment with its number
#ds['Sex'] = ds['Sex'].map(sex_dict)

# assigning each treatment with an appropriate number according to its position within the column
code_dict = {
    'G5': '8',
    'F5': '7',
    'El': '6',
    'DY': '5',
    'DT': '4',
    'DC': '3',
    'D5': '2',
    'CM': '1',
    'CL': '7',
    'CA': '6',
    'AZ': '5',
    'AY': '4',
    'AU': '3',
    'AM': '2',
    'AH': '1',
    'AG': '7',
    'AF': '6',
    'AD': '5',
    'AA': '4',
    'A6': '3',
    '5M': '2',
    '4T': '6',
    '4N': '5',
    '3L': '4'}

# map each respective treatment with its number
ds['Tissue Source Site Code'] = ds['Tissue Source Site Code'].map(code_dict)



# Tumor Disease Anatomic Site
ana_dict = {
    'Colon': '1',
    'Rectum': '0'}

# map each respective treatment with its number
ds['Tumor Disease Anatomic Site'] = ds['Tumor Disease Anatomic Site'].map(ana_dict)


# this for loop will iterate throughout the columns, check for empty values, and replace then with a -1
for column in ds:
    ds[column] = ds[column].replace(py.NaN,1)
    
# Set display option to show all columns
#pd.set_option('display.max_columns', None)
#
# Display the DataFrame
#print(ds)
# Attempt to convert all columns to numeric, coercing errors
for col in ds.columns:
    ds[col] = pd.to_numeric(ds[col], errors='ignore')




ds = ds[:-175]
ds.head()


Unnamed: 0,Diagnosis Age,Neoplasm Disease Stage American Joint Committee on Cancer Code,Aneuploidy Score,Buffa Hypoxia Score,TCGA PanCanAtlas Cancer Type Acronym,Last Communication Contact from Initial Pathologic Diagnosis Date,Birth from Initial Pathologic Diagnosis Date,Disease Free (Months),Disease Free Status,Months of disease-specific survival,...,Person Neoplasm Cancer Status,Progress Free Survival (Months),Ragnum Hypoxia Score,Tissue Prospective Collection Indicator,Tissue Retrospective Collection Indicator,Tissue Source Site Code,TMB (nonsynonymous),Tumor Disease Anatomic Site,Patient Weight,Medicinal Treatment
0,85,1,25.0,1.0,0,1126.0,-31329.0,1.0,1,43.758425,...,0,17.588848,1.0,1,0,3,4.066667,1,67.2,1
1,71,1,24.0,1.0,0,1331.0,-26292.0,1.0,1,43.758425,...,1,24.164119,1.0,1,0,3,7.666667,1,85.6,1
2,75,1,1.0,1.0,0,711.0,-27403.0,42.903639,0,42.903639,...,1,42.903639,1.0,1,0,3,1.0,1,45.9,1
3,68,1,8.0,1.0,0,541.0,-25143.0,1.0,1,24.328501,...,1,24.328501,1.0,1,0,3,7.766667,1,55.2,1
4,43,1,7.0,1.0,0,1286.0,-16030.0,42.278989,0,42.278989,...,1,42.278989,1.0,1,0,3,1.733333,1,96.1,1


#### Initializing Training and Testing Set

In [111]:
# setting x equal to all columns with the exception of hospital disposition
X = ds.drop(columns='Medicinal Treatment')
# setting y equal to what we want to preidct (mortality or alive in hospdisposition column
y = ds['Medicinal Treatment']

# dividing up our dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = StandardScaler()

# Fit the scaler on the X data and transform it
X = scaler.fit_transform(X)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### Initializing Keras Regression Model

In [112]:
model = Sequential()

#### Building Neural Net Layer

In [113]:
# Input layer
model.add(Dense(64, input_dim=30, activation='relu'))
# Hidden layer 1
model.add(Dense(32, activation='relu'))
# Output Layer
model.add(Dense(1))
# Setting the adam optmizer which will adjust the keras model weights (minimizign loss)
model.compile(optimizer='adam', loss='mean_squared_error')
# Fitting model to our predictor and target features
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 6.3215 - val_loss: 3.4958
Epoch 2/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2.8671 - val_loss: 1.9925
Epoch 3/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.5724 - val_loss: 1.6048
Epoch 4/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.1507 - val_loss: 1.5279
Epoch 5/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.0709 - val_loss: 1.3209
Epoch 6/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.8446 - val_loss: 1.1825
Epoch 7/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.8560 - val_loss: 1.1036
Epoch 8/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.8095 - val_loss: 1.0388
Epoch 9/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 

<keras.src.callbacks.history.History at 0x2568f0b1cd0>

#### Model Predictions

In [114]:
predictions=model.predict(X_test)
print("Model Predictions: ", predictions)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Model Predictions:  [[1.9119883]
 [2.5846665]
 [2.0582337]
 [1.7680407]
 [1.7364266]
 [2.171223 ]
 [2.3227673]
 [2.6337256]
 [1.538228 ]
 [1.686902 ]
 [1.5112247]
 [2.188586 ]
 [1.675847 ]
 [1.6235762]
 [1.8107766]
 [2.0173497]
 [1.8149436]
 [2.3588803]
 [2.170125 ]
 [1.5069705]
 [2.3698745]
 [1.7647066]
 [1.9414153]
 [1.3311085]
 [1.8855844]
 [1.4845078]
 [1.7766235]
 [1.5669954]
 [2.3430104]
 [2.2064698]
 [2.3430104]
 [2.0418534]
 [2.1295147]
 [2.1295147]
 [2.410085 ]
 [1.9468459]
 [2.1542125]
 [1.473166 ]
 [2.171223 ]
 [2.3701162]
 [1.351243 ]
 [1.9989853]
 [1.5090741]
 [2.3641233]
 [2.5747988]
 [1.8149436]
 [2.8474429]
 [1.5756586]
 [2.0418537]
 [1.8445857]
 [1.7680407]
 [1.7893047]
 [2.1262152]
 [1.9562935]
 [2.9651568]
 [2.1262152]
 [1.6821723]
 [1.2659944]
 [2.3747814]
 [1.5709957]
 [2.4384508]
 [1.9662945]
 [1.930963 ]
 [2.1476014]
 [2.0173497]
 [1.6497097]
 [2.517489 ]
 [1.675847 ]
 [1.3254237]
 [2.6344538

#### Model Evaluation

In [115]:
# Recording Mean Squared Error
print('Mean Squared Error: ',  mean_squared_error(y_test, predictions))

# Records R2-Squared Score of our model
print("R2-Squared Score: ", r2_score(y_test, predictions))

# Records Median Absolute Error
print("Median Absolute Error: ", median_absolute_error(y_test, predictions))

# Records Root Mean Squared Error
print("Root Mean Squared Error (RMSE): ", py.sqrt(mean_squared_error(y_test, predictions)))

Mean Squared Error:  1.4337315240758655
R2-Squared Score:  -0.6318667527413879
Median Absolute Error:  1.1262152194976807
Root Mean Squared Error (RMSE):  1.1973852863952628
