In [1]:
# Obtaining the significant regions based on literature

import pandas as pd

temp_list = []

with open("train_call.txt", 'r') as temp:
    for line in temp:
        temp_list.append(line.split())

columns_temp = temp_list[0]
columns_temp = [x.replace("\"", "") for x in columns_temp]

df_train = pd.DataFrame(temp_list[1:], columns=columns_temp)

sig_reg_list = []

with open("significant_regions.csv", 'r') as sig_regs:
    next(sig_regs)
    for line in sig_regs:
        line_temp = line.split(",")
        if line_temp[1][1].isdigit():
            chromosome = int(line_temp[1][0:2])
        else:
            chromosome = int(line_temp[1][0])
        sig_reg_list.append([chromosome, int(line_temp[2].split()[0]), int(line_temp[2].split()[2])])
        
chromosomes = df_train["Chromosome"].values.tolist()
start_regs = df_train["Start"].values.tolist()
end_regs = df_train["End"].values.tolist()

chromosomes = [int(i) for i in chromosomes]
start_regs = [int(i) for i in start_regs]
end_regs = [int(i) for i in end_regs]

sig_labels = []

for region in sig_reg_list:
    for i in range(len(start_regs)):
        if region[0] == chromosomes[i] and region[1] > start_regs[i] and region[1] < end_regs[i]:
#             print("Region = {0} - {1}".format(start_regs[i], end_regs[i]))
#             print("Significant region (from literature) = {0} - {1}\n".format(region[1], region[2]))
            sig_labels.append(i)
    
print("The regions found in the literature were: ", sig_labels)

The regions found in the literature were:  [2223, 1725, 2021, 2075, 2446, 2794, 1583, 361, 2328, 2733, 111, 937, 2210, 1136, 1965, 688, 1908, 1365, 361, 361, 1911, 1386, 1407, 1407, 664, 1575, 625, 479, 1296, 1866, 1735, 993]


In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import chi2
import matplotlib.pyplot as plt

#Loading data
df = pd.read_csv('Processed_data_2.csv')

# X for features Y for breast cancer subtype where 1 = HER2+, 2 = HR+, 3 = Triple Neg
X = df.drop(['Unnamed: 0', 'Sample','Subgroup'], axis=1)
Y = df['Subgroup']

patients = X.values.tolist()
labels = Y.values.tolist()

## FEATURE SELECTION

# we need to shift the features by 1 since the chi2 function does not take non-negative values
patients_shift = []

for patient in patients:
    x = []
    for feature in patient:
        x.append(feature + 1)
    patients_shift.append(x)

chi, p_val = chi2(patients_shift, Y)

X_feature_selected = df

p_value = 0.05
for i, val in enumerate(p_val):
    # this drops the p_values higher than 0.05 that are not in the significant genes list
    if val >= p_value and i not in sig_labels:
        X_feature_selected = X_feature_selected.drop(["V{}".format(i + 1)], axis=1)
        
X = X_feature_selected.drop(['Unnamed: 0', 'Sample','Subgroup'], axis=1)
patients = X.values.tolist()

In [3]:
print(X)
print("Number of features = {}".format(len(patients[0])))

    V112  V193  V362  V480  V626  V665  V669  V670  V671  V672  ...  V2224  \
0     -1     1    -1     0     0     0     0     0     0     0  ...     -1   
1      0    -1     0     0     0     0     0     0     0     0  ...      0   
2      0     0     1     1     1     1     1     1     1     1  ...      0   
3     -1    -1     0     1     0     0     0     0     0     0  ...      0   
4      0    -1     0     0     0     0     0     0     0     0  ...      0   
..   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...    ...   
95     1    -1     0     1     0     0     0     0     0     0  ...      0   
96     0     2     0     0     0     0     0     0     0     0  ...      0   
97     0     0     0     0     1     1     1     1     1     1  ...      0   
98     0    -1     0     1     1     1     1     1     1     1  ...      0   
99    -1    -1     0    -1     0     0     0     0     0     0  ...      0   

    V2225  V2329  V2447  V2724  V2733  V2734  V2751  V2752  V27

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

def classification_report_with_accuracy_score(y_true, y_pred):
    "Scorer for the cross validation function"
    originalclass.extend(y_true)
    predictedclass.extend(y_pred)
    return accuracy_score(y_true, y_pred) 

def evaluate_model(model, features, labels):
    "Gets the cross validation score using a 5-fold (cv=5) cross validation"
    scores = cross_val_score(model, features, labels, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
    return scores

In [5]:
# Random Forest

originalclass = []
predictedclass = []

random_forest = RandomForestClassifier(n_estimators=100)
temp = evaluate_model(random_forest, X, Y)
print(classification_report(originalclass, predictedclass)) 

              precision    recall  f1-score   support

           1       0.82      0.88      0.85        32
           2       0.74      0.72      0.73        36
           3       0.68      0.66      0.67        32

    accuracy                           0.75       100
   macro avg       0.75      0.75      0.75       100
weighted avg       0.75      0.75      0.75       100



In [6]:
# Naive Bayes

originalclass = []
predictedclass = []

gnb = GaussianNB()
temp = evaluate_model(gnb, X, Y)
print(classification_report(originalclass, predictedclass))

              precision    recall  f1-score   support

           1       1.00      0.78      0.88        32
           2       0.76      0.81      0.78        36
           3       0.68      0.78      0.72        32

    accuracy                           0.79       100
   macro avg       0.81      0.79      0.80       100
weighted avg       0.81      0.79      0.79       100



In [7]:
# Neural network with 1 hidden layer (the hidden layer is as big as the number of features)

originalclass = []
predictedclass = []

mlp = MLPClassifier(solver='lbfgs', activation="relu", alpha=1e-5,
                    hidden_layer_sizes=(len(patients[0]),), max_iter=1000)

temp = evaluate_model(mlp, X, Y)
print(classification_report(originalclass, predictedclass))

              precision    recall  f1-score   support

           1       0.97      0.91      0.94        32
           2       0.79      0.86      0.83        36
           3       0.84      0.81      0.83        32

    accuracy                           0.86       100
   macro avg       0.87      0.86      0.86       100
weighted avg       0.86      0.86      0.86       100



In [8]:
# Neural network with 2 hidden layer (the hidden layer is as big as the number of features)

originalclass = []
predictedclass = []

mlp = MLPClassifier(solver='lbfgs', activation="relu", alpha=1e-5,
                    hidden_layer_sizes=(len(patients[0]),len(patients[0])), max_iter=1000)

temp = evaluate_model(mlp, X, Y)
print(classification_report(originalclass, predictedclass))

              precision    recall  f1-score   support

           1       0.91      0.94      0.92        32
           2       0.81      0.83      0.82        36
           3       0.87      0.81      0.84        32

    accuracy                           0.86       100
   macro avg       0.86      0.86      0.86       100
weighted avg       0.86      0.86      0.86       100



In [9]:
# Neural network with 3 hidden layer (the hidden layer is as big as the number of features)

originalclass = []
predictedclass = []

mlp = MLPClassifier(solver='lbfgs', activation="relu", alpha=1e-5,
                    hidden_layer_sizes=(len(patients[0]),len(patients[0]),len(patients[0])), max_iter=1000)

temp = evaluate_model(mlp, X, Y)
print(classification_report(originalclass, predictedclass))

              precision    recall  f1-score   support

           1       0.94      0.94      0.94        32
           2       0.79      0.83      0.81        36
           3       0.80      0.75      0.77        32

    accuracy                           0.84       100
   macro avg       0.84      0.84      0.84       100
weighted avg       0.84      0.84      0.84       100

