# Análisis de datos genéticos de cáncer de seno
## Importación de librerías

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn import metrics

## Carga de datos

In [2]:
# Read METABRIC_RNA_MUTATION.csv (path relative to the notebook's working directory) into `df`.
# NOTE(review): later cells inspect four columns flagged for holding mixed data types; presumably
# pandas reported them with a warning while parsing this file — confirm.
df = pd.read_csv('METABRIC_RNA_MUTATION.csv')

  df = pd.read_csv('METABRIC_RNA_MUTATION.csv')


## Encabezado de la tabla

In [3]:
# Preview the first rows of the table (head() with no argument returns five rows).
df.head()

Unnamed: 0,patient_id,age_at_diagnosis,type_of_breast_surgery,cancer_type,cancer_type_detailed,cellularity,chemotherapy,pam50_+_claudin-low_subtype,cohort,er_status_measured_by_ihc,...,mtap_mut,ppp2cb_mut,smarcd1_mut,nras_mut,ndfip1_mut,hras_mut,prps2_mut,smarcb1_mut,stmn2_mut,siah1_mut
0,0,75.65,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,,0,claudin-low,1,Positve,...,0,0,0,0,0,0,0,0,0,0
1,2,43.19,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,0,LumA,1,Positve,...,0,0,0,0,0,0,0,0,0,0
2,5,48.87,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,High,1,LumB,1,Positve,...,0,0,0,0,0,0,0,0,0,0
3,6,47.68,MASTECTOMY,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Moderate,1,LumB,1,Positve,...,0,0,0,0,0,0,0,0,0,0
4,8,76.97,MASTECTOMY,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,High,1,LumB,1,Positve,...,0,0,0,0,0,0,0,0,0,0


## Forma y columnas de los datos

In [None]:
# Table dimensions first, then the full column index, each on its own line.
print(df.shape, df.columns, sep='\n')

## Tipos de datos de cada columna

In [None]:
# dtype currently stored for every column of `df`.
df.dtypes

## Algunas columnas generan una alerta porque mezclan distintos tipos de datos

In [None]:
# Names of the columns sitting at positions 678, 688, 690 and 692.
# NOTE(review): presumably these are the positions reported by the mixed-type alert — confirm
# against the warning text, since the indices are hard-coded.
df.columns.values[[678, 688, 690, 692]]

In [None]:
# Stored dtype of each of the four mutation columns under inspection.
mixed_type_columns = ['rasgef1b_mut', 'hras_mut', 'smarcb1_mut', 'siah1_mut']
df[mixed_type_columns].dtypes

In [None]:
# Summary statistics for the same four mutation columns.
mixed_type_columns = ['rasgef1b_mut', 'hras_mut', 'smarcb1_mut', 'siah1_mut']
df[mixed_type_columns].describe()

## Las variables con la alerta indican mutaciones en los genes correspondientes: 0 = sin mutación; x = mutación. Hay valores de 0 almacenados como carácter ('0'). Estas columnas habrá que binarizarlas.

In [None]:
# Distinct raw values of each flagged mutation column, one array per line.
for column_name in ('rasgef1b_mut', 'hras_mut', 'smarcb1_mut', 'siah1_mut'):
    print(df[column_name].unique())

## Descripción de las variables con la alerta

## Empezamos haciendo la conversión de '0' a 0 para todas las columnas donde había ambigüedad

In [None]:
# Turn the string '0' into the integer 0 in the four flagged mutation columns;
# every other value is left as it is.
flagged_columns = ['rasgef1b_mut', 'hras_mut', 'smarcb1_mut', 'siah1_mut']
df[flagged_columns] = df[flagged_columns].replace('0', 0)

In [None]:
# Re-list the distinct values of the four columns to check the result of the '0' -> 0 conversion.
for column_name in ('rasgef1b_mut', 'hras_mut', 'smarcb1_mut', 'siah1_mut'):
    print(df[column_name].unique())

## Algoritmo de binarización aplicado a mutaciones

In [None]:
# Add one 0/1 indicator column per distinct mutation observed in 'rasgef1b_mut'.
# Each new column is named after the mutation value itself and is appended to `df`.
gene_mutations = df['rasgef1b_mut']
for mut in gene_mutations.unique():
    # 0 is the "no mutation" marker. A missing value never compares equal to anything,
    # so it would only produce an all-zero column labelled NaN; skip it as well.
    if pd.isna(mut) or mut == 0:
        continue
    # Explicit cast instead of chained replace(False/True): the result is always int64,
    # independent of pandas' (deprecated) implicit downcasting inside replace().
    df[mut] = (gene_mutations == mut).astype('int64')

## Histograma de edad de diagnóstico

In [None]:
# Distribution of the age at diagnosis (default binning), with labelled axes so the
# figure can be read on its own.
plt.hist(df['age_at_diagnosis'])
plt.xlabel('Age at diagnosis')
plt.ylabel('Frequency')
plt.show()  # also keeps the (counts, bins, patches) tuple out of the cell output

## Descripción de edad de diagnóstico

In [None]:
# Summary statistics of the age-at-diagnosis column.
df['age_at_diagnosis'].describe()

## Conteo de valores de las variables no genéticas

In [None]:
# For the columns at positions 2..30: print position and name, the frequency of each
# value, and the number of missing entries.
for position in range(2, 31):
    series = df.iloc[:, position]
    print(position, df.columns[position])
    print(series.value_counts())
    print('nans:', series.isna().sum(), '\n')

In [None]:
# Overall survival (x) against age at diagnosis (y), one point cloud per surgery type.
# Each cloud carries a label and the axes get a legend; without them the two colours
# cannot be told apart.
fig, ax = plt.subplots(1)
for surgery in ('MASTECTOMY', 'BREAST CONSERVING'):
    subset = df[df.type_of_breast_surgery == surgery]
    ax.scatter(subset.overall_survival_months, subset.age_at_diagnosis, label=surgery)
ax.set(xlabel='Overall survival months', ylabel='Age at diagnosis')
ax.legend(title='Type of breast surgery')
plt.show()

## Aplicando las técnicas de IA

### Regresión logística aplicada a la supervivencia en función del perfil genético

In [None]:
# 75 % of the rows (fixed seed) form the training set; the rows left over form the test set.
train_dataset = df.sample(frac=0.75, random_state=1)
test_dataset = df.drop(train_dataset.index)

# Features: every column from 'brca1' through 'ugt2b7' (label slice, both ends included).
x_train = train_dataset.loc[:, 'brca1':'ugt2b7']
x_test = test_dataset.loc[:, 'brca1':'ugt2b7']

max_iter = 1000
log_reg = LogisticRegression(multi_class='multinomial', max_iter=max_iter)

# NOTE(review): the target column '5_year_sup' is not created in any cell visible here;
# presumably it comes from the CSV or from a cell that was removed — confirm, otherwise
# this cell fails on a fresh kernel.
log_reg.fit(x_train, train_dataset['5_year_sup'])
predictions = log_reg.predict(x_test)
counter = predictions.tolist()
score = log_reg.score(x_test, test_dataset['5_year_sup'])
cm = metrics.confusion_matrix(test_dataset['5_year_sup'], predictions)

# Confusion matrix as a heat map, with the count written in every cell.
fig, axs = plt.subplots(1)
axs.imshow(cm, cmap="Greens")
axs.set_title('Acc: {0}'.format(round(score,3)), fontsize = 10)
axs.set_xticks([0,1])
axs.set_yticks([0,1])
axs.set(xlabel='Predicted label', ylabel='Actual label')
for (actual, predicted), count in np.ndenumerate(cm):
    # rows of cm are actual labels (y), columns are predicted labels (x)
    text = axs.text(predicted, actual, count,
                    ha="center", va="center", color="seagreen")
fig.suptitle('Confusion matrices for death from disease classification')


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Fit the logistic regression on the training features.
log_reg.fit(x_train, train_dataset['5_year_sup'])

# Keep only the second predict_proba column, i.e. the probability of log_reg.classes_[1].
class_probabilities = log_reg.predict_proba(x_test)
probabilities = class_probabilities[:, 1]

# ROC curve and the area under it.
fpr, tpr, thresholds = roc_curve(test_dataset['5_year_sup'], probabilities)
roc_auc = auc(fpr, tpr)
print('Area bajo la curva ROC:', roc_auc)

# Plot the curve together with the chance diagonal.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='Curva ROC (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_axes = plt.gca()
roc_axes.set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    xlabel='Tasa de Falsos Positivos',
    ylabel='Tasa de Verdaderos Positivos',
    title='Curva ROC',
)
plt.legend(loc="lower right")
plt.show()

### Regresión logística aplicada para clasificar tipos de cáncer en función del perfil genético

In [None]:
# Split only the rows without any missing value: 75 % (fixed seed) for training,
# the remaining complete rows for testing.
train_dataset = df.dropna().sample(frac=0.75, random_state=1)
test_dataset = df.drop(train_dataset.index).dropna()

# "wb" subsets: same rows minus those whose detailed type is just 'Breast'.
x_train_wb = train_dataset[train_dataset['cancer_type_detailed'] != 'Breast']
x_test_wb = test_dataset[test_dataset['cancer_type_detailed'] != 'Breast']

# Features: every column from 'brca1' through 'ugt2b7'.
# The targets are built from 'cancer_type_detailed' in the next cell; the former
# y_train / y_test taken from '5_year_sup' were overwritten there before any use,
# so they are no longer computed here.
x_train = x_train_wb.loc[:, 'brca1':'ugt2b7']
x_test = x_test_wb.loc[:, 'brca1':'ugt2b7']

#### Binarización de los tipos de cáncer

In [None]:
train_types = x_train_wb['cancer_type_detailed']
test_types = x_test_wb['cancer_type_detailed']
print(train_types.unique())
print(test_types.unique())

# One indicator column per cancer type. Both frames are aligned on the same sorted
# column set, so a type that occurs in only one of the two splits still has a column
# (all False) in the other instead of being absent.
cancer_types = sorted(set(train_types.dropna()) | set(test_types.dropna()))
y_train = pd.get_dummies(train_types).reindex(columns=cancer_types, fill_value=False)
y_test = pd.get_dummies(test_types).reindex(columns=cancer_types, fill_value=False)

#### Tamaño de los sets de entrenamiento y prueba

In [None]:
# Number of rows in the training set, then in the test set, one per line.
print(len(train_dataset), len(test_dataset), sep='\n')

In [None]:
max_iter = 1000
log_reg = LogisticRegression(multi_class='multinomial', max_iter=max_iter)

x_train = x_train_wb.loc[:, 'brca1':'ugt2b7']
x_test = x_test_wb.loc[:, 'brca1':'ugt2b7']

# ROC data collected per cancer type, keyed by the loop position; read by the ROC plot cell.
fpr_cancer_type = {}
tpr_cancer_type = {}
auc_cancer_type = {}
names = {}

# 2x2 grid: one confusion matrix per cancer type found in the test subset.
fig, axs = plt.subplots(2, 2)

for i, cancer_type in enumerate(x_test_wb['cancer_type_detailed'].dropna().unique()):
    # One-vs-rest: the target is the indicator column of this cancer type.
    log_reg.fit(x_train, y_train[cancer_type])
    predictions = log_reg.predict(x_test)
    score = log_reg.score(x_test, y_test[cancer_type])
    cm = metrics.confusion_matrix(y_test[cancer_type], predictions)

    panel = axs[divmod(i, 2)]  # (row, column) of the i-th panel
    panel.imshow(cm, cmap="Greens")
    panel.set_title('{0}\nAcc: {1}'.format(cancer_type[0:-9], round(score, 3)), fontsize=10)
    panel.set_xticks([0, 1])
    panel.set_yticks([0, 1])
    for (actual, predicted), count in np.ndenumerate(cm):
        panel.text(predicted, actual, count,
                   ha="center", va="center", color="seagreen")

    # Second predict_proba column = probability of the positive indicator value.
    probabilities = log_reg.predict_proba(x_test)[:, 1]
    fpr_cancer_type[i], tpr_cancer_type[i], _ = roc_curve(y_test[cancer_type], probabilities)
    auc_cancer_type[i] = auc(fpr_cancer_type[i], tpr_cancer_type[i])
    names[i] = cancer_type

# Axis labels are the same for every panel, so they are set once after the loop
# instead of on every iteration.
for ax in axs.flat:
    ax.set(xlabel='Predicted label', ylabel='Actual label')
    ax.label_outer()

fig.suptitle('Confusion matrices for detailed cancer classification')
fig.tight_layout()
fig.subplots_adjust(wspace=0.5)

In [None]:
# One ROC curve per cancer type collected by the previous cell. Iterating over the
# stored keys (instead of the fixed indices 0..3) keeps the plot valid for any number
# of cancer types.
plt.figure(figsize=(8, 6))
for i in sorted(fpr_cancer_type):
    plt.plot(fpr_cancer_type[i], tpr_cancer_type[i],
             label='{} (AUC = {:.2f})'.format(names[i], auc_cancer_type[i]))

# Chance diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Test / train accuracy per max_depth, in the order the depths are tried (2..5).
score = []
score_train = []
fig, axs = plt.subplots(2, 2)

x_train = train_dataset.loc[:, 'brca1':'ugt2b7']
x_test = test_dataset.loc[:, 'brca1':'ugt2b7']

# Indicator encoding of the detailed cancer type; not used by the forests below
# (they are fit on the raw labels), but y_test is read by a later ROC cell.
y_train = pd.get_dummies(train_dataset['cancer_type_detailed'])
y_test = pd.get_dummies(test_dataset['cancer_type_detailed'])

for i in range(2, 6):
    clf = RandomForestClassifier(max_depth=i, random_state=0)
    clf.fit(x_train, train_dataset['cancer_type_detailed'])

    score_train.append(clf.score(x_train, train_dataset['cancer_type_detailed']))
    score.append(clf.score(x_test, test_dataset['cancer_type_detailed']))
    predictions = clf.predict(x_test)

    cm = metrics.confusion_matrix(test_dataset['cancer_type_detailed'], predictions)

    panel = axs[divmod(i - 2, 2)]  # (row, column) of the panel for this depth
    panel.imshow(cm, cmap="Greens")
    panel.set_title('Max_depth = {0}\nAcc: {1}'.format(i, round(score[-1], 3)), fontsize=10)
    # One tick per class actually present in the matrix, rather than a fixed five.
    panel.set_xticks(range(len(cm)))
    panel.set_yticks(range(len(cm)))
    for (actual, predicted), count in np.ndenumerate(cm):
        panel.text(predicted, actual, count,
                   ha="center", va="center", color="black")

# Same axis labels on every panel; set once after the loop.
for ax in axs.flat:
    ax.set(xlabel='Predicted label', ylabel='Actual label')
    ax.label_outer()

fig.suptitle('Confusion matrices for cancer classification')
fig.tight_layout()

In [None]:
# Accuracy against tree depth for the forests fitted above. The depths start at 2,
# one entry of `score` per depth.
depths = range(2, 2 + len(score))

f, ax = plt.subplots(1)
ax.plot(depths, score)
ax.plot(depths, score_train)
ax.set_ylim(ymin=0.7, ymax=0.9)
ax.set(xlabel='Max_depth', ylabel='Performance')
ax.legend(["Test", "Train"], loc="upper right")
f.suptitle("Score curve")
# plt.show() takes no positional argument; passing the figure only works by accident
# with the inline backend.
plt.show()
print("Mejor desempeño con el conjunto de pruebas: ", max(score))

In [None]:
# Train / test accuracy per max_depth, in the order the depths are tried (2..9).
# The former unseeded max_depth=6 forest fitted on y_train is gone: its score was
# never displayed and `clf` is reassigned by the loop below.
score_train = []
score_test = []

fig, axs = plt.subplots(2, 4)

for i in range(2, 10):
    clf = RandomForestClassifier(max_depth=i, random_state=0)
    clf.fit(x_train, train_dataset['overall_survival'])
    predictions = clf.predict(x_test)
    score_train.append(clf.score(x_train, train_dataset['overall_survival']))
    score_test.append(clf.score(x_test, test_dataset['overall_survival']))
    cm = metrics.confusion_matrix(test_dataset['overall_survival'], predictions)

    panel = axs[divmod(i - 2, 4)]  # (row, column) of the panel for this depth
    panel.imshow(cm, cmap="Greens")
    panel.set_title('Max_depth = {0}\nAcc: {1}'.format(i, round(score_test[-1], 3)), fontsize=10)
    panel.set_xticks([0, 1])
    panel.set_yticks([0, 1])
    for (actual, predicted), count in np.ndenumerate(cm):
        panel.text(predicted, actual, count,
                   ha="center", va="center", color="seagreen")

# Same axis labels on every panel; set once after the loop.
for ax in axs.flat:
    ax.set(xlabel='Predicted label', ylabel='Actual label')
    ax.label_outer()

# After the loop `clf` holds the last forest (max_depth=9).
fig.suptitle('Confusion matrices for overall survival classification')
fig.tight_layout()

In [None]:
# Probability of clf.classes_[1] for every test row (second predict_proba column).
# `clf` is whatever classifier the preceding cell left in that name.
class_probabilities = clf.predict_proba(x_test)
probabilities = class_probabilities[:, 1]

# ROC curve and the area under it.
fpr, tpr, thresholds = roc_curve(test_dataset['overall_survival'], probabilities)
roc_auc = auc(fpr, tpr)
print('Area bajo la curva ROC:', roc_auc)

# Plot the curve together with the chance diagonal.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='Curva ROC (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_axes = plt.gca()
roc_axes.set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    xlabel='Tasa de Falsos Positivos',
    ylabel='Tasa de Verdaderos Positivos',
    title='Curva ROC',
)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Train and test accuracy against max_depth (2..9) for the overall-survival forests.
f, ax = plt.subplots(1)
ax.plot(range(2,10), score_train)
ax.plot(range(2,10), score_test)
ax.set_ylim(ymin=0.5, ymax=1.1)
ax.set_title("Score curve")
ax.set(xlabel='Max_depth', ylabel='Performance')
ax.legend(["Train", "Test"], loc="upper right")

# plt.show() takes no positional argument; passing the figure only works by accident
# with the inline backend.
plt.show()
# Index of the best test score + 2 = the depth that produced it (depths start at 2).
print("Mejor desempeño con el conjunto de pruebas: {0} con un árbol de profundidad {1}".format(round(max(score_test),3), score_test.index(max(score_test))+2))

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Rows of `df` that have a detailed cancer type; features are the columns from
# 'brca1' through 'ugt2b7', the target is the detailed cancer type.
# NOTE(review): MLP_set is drawn from the whole of `df`, while later cells score this
# classifier on x_test / test_dataset — check whether those test rows are part of
# MLP_set, because that would inflate the reported accuracy.
MLP_set = df.dropna(subset=['cancer_type_detailed'])
clf = MLPClassifier(
    random_state=1,
    max_iter=300,
    early_stopping=True,
    validation_fraction=0.2,
).fit(
    MLP_set.loc[:, 'brca1':'ugt2b7'],
    MLP_set['cancer_type_detailed'],
)

In [None]:
# Predictions of the classifier held in `clf` on the test features, its accuracy and
# its confusion matrix. The bare predict_proba call that used to open this cell was
# dropped: its result was neither stored nor displayed.
predictions = clf.predict(x_test)
accuracy = clf.score(x_test, test_dataset['cancer_type_detailed'])
cm = metrics.confusion_matrix(test_dataset['cancer_type_detailed'], predictions)

f, ax = plt.subplots(1)
ax.imshow(cm, cmap="Greens")
ax.set_title('Acc: {0}'.format(round(accuracy, 3)), fontsize=10)
for (actual, predicted), count in np.ndenumerate(cm):
    # rows of cm are actual labels (y), columns are predicted labels (x)
    ax.text(predicted, actual, count, ha="center", va="center", color="seagreen")
ax.set(xlabel='Predicted label', ylabel='Actual label')

In [None]:
# One-vs-rest ROC curve for four of the cancer types predicted by `clf`.
# predict_proba is evaluated once, and each type's probability column is looked up
# through clf.classes_ instead of a hard-coded position, so curve and label always
# belong to the same class.
class_probabilities = clf.predict_proba(x_test)
class_labels = list(clf.classes_)
roc_cancer_types = [
    'Breast Invasive Ductal Carcinoma',
    'Breast Invasive Lobular Carcinoma',
    'Breast Invasive Mixed Mucinous Carcinoma',
    'Breast Mixed Ductal and Lobular Carcinoma',
]

plt.figure()
for cancer_type in roc_cancer_types:
    type_scores = class_probabilities[:, class_labels.index(cancer_type)]
    # y_test holds one indicator column per cancer type of the test set.
    type_fpr, type_tpr, _ = roc_curve(y_test[cancer_type], type_scores)
    type_auc = auc(type_fpr, type_tpr)
    plt.plot(type_fpr, type_tpr, lw=2, label='%s (AUC = %0.2f)' % (cancer_type, type_auc))

# Chance diagonal for reference.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Tasa de Falsos Positivos')
plt.ylabel('Tasa de Verdaderos Positivos')
plt.title('Curva ROC')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Training loss and validation score per epoch of the classifier held in `clf`.
f, ax = plt.subplots(2)
ax[0].plot(clf.loss_curve_)
ax[0].set_title('Loss curve')
ax[0].set(xlabel='Epoch')

ax[1].plot(clf.validation_scores_)
ax[1].set_title('Scores curve')
ax[1].set(xlabel='Epoch', ylabel='Validation score')
f.suptitle("Performance curves")
# Lay out THIS figure (`f`); the previous code called tight_layout on `fig`, a figure
# left over from an earlier cell. The spacing tweak comes last so tight_layout does
# not undo it.
f.tight_layout()
f.subplots_adjust(hspace=0.5)
plt.show()
print("Mejor desempeño: ", round(clf.best_validation_score_,3))

In [None]:
# Class labels known to `clf`; the columns of clf.predict_proba follow this order.
clf.classes_