<a href="https://colab.research.google.com/github/CleitonValandro/predictive-model-in-tumor-samples-using-artificial-intelligence/blob/task%2Fnew-improvements/PredictiveModelInTumorSamplesUsingArtificialIntelligence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Instalação e importação de dependências**



In [None]:
#xenaPython é uma biblioteca para acessar e processar a base de dados do TCGA
# https://github.com/ucscXena/xenaPython
!pip install xenaPython
!pip install --upgrade xenaPython
!pip install pandas

In [None]:
import xenaPython as xena
import pandas as pd
import os 
import sys
import pytz
import numpy as np
import re
import matplotlib.pyplot as plt
from datetime import datetime

**Google Drive**

Conexão do Google Drive ao Colab e criação do diretório

Diretório utilizado: (content/drive/MyDrive/Google Colab/Datasets/{date and time}/...)

In [None]:
# (Optional) Point this at a specific directory, or at one that was already generated by the cell below
# If the variable is left empty, a new directory is meant to be generated every time the cell below runs
# Example: "/content/drive/MyDrive/Google Colab/Datasets/2023.03.23-21:40:47"
drive_path = ""

In [None]:
# Connect Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Directory separator
bar = '/'
# Only normalise a path the user actually supplied. Appending the separator to an
# empty `drive_path` would turn it into "/", which is truthy: the `if not drive_path`
# check further down would then never create the timestamped directory and every
# CSV would be written to the filesystem root. The endswith() guard also keeps a
# re-run of this cell from stacking extra separators.
if drive_path and not drive_path.endswith(bar):
    drive_path += bar

# Timestamp (Sao Paulo time) used as the name of the generated dataset directory
csv_name = datetime.now(pytz.timezone('America/Sao_Paulo')).strftime("%Y.%m.%d-%H:%M:%S")
google_drive_path = 'Google Colab'+bar+'Datasets'+bar+csv_name

# Gerando as pastas do diretório final
def check_dir(dir, bar):
    """Create the folders of ``dir`` under ``<cwd>/drive/MyDrive`` and return the final path.

    Parameters:
        dir: relative directory, with its components separated by ``bar``.
        bar: separator used both to split ``dir`` and to build the result.

    Returns:
        The full directory path, terminated by ``bar``.
    """
    drive_dir = os.getcwd()+bar+'drive'+bar+'MyDrive'
    for folder in dir.split(bar):
        drive_dir = drive_dir+bar+folder
    # makedirs creates every missing level in one call and tolerates levels that
    # already exist, so there is no gap between an isdir() check and mkdir().
    os.makedirs(drive_dir, exist_ok=True)
    return drive_dir+bar

# No directory chosen by the user: create the timestamped one and use it
if not drive_path:
	drive_path = check_dir(google_drive_path, bar)

# Show the directory every later cell reads from and writes to
print(drive_path)

**Importação dos dados do TCGA via Xena Browser**

Nesta etapa, escolhemos os datasets com os quais iremos trabalhar (expressão gênica, fenótipos e sítios de metilação)

Agrupa todas as informações em uma só planilha, que posteriormente é salva no diretório do Google Drive

In [None]:
# Chosen gene
chosen_gene_code = "ENSG00000135318.11"
chosen_gene_name = "NT5E"

# Data sources
# Gene expression
gene_expression_hub = "https://toil.xenahubs.net"
gene_expression_dataset = "tcga_RSEM_gene_fpkm"

# Phenotypes
phenotype_hub = "https://pancanatlas.xenahubs.net"
phenotype_dataset = "Survival_SupplementalTable_S1_20171025_xena_sp"

# DNA methylation (Methylation27K)
_27_methylation_hub = "https://tcga.xenahubs.net"
_27_methylation_dataset = "TCGA.PANCAN.sampleMap/HumanMethylation27"
_27_methylation_sitios = ['cg17488985', 'cg17966619']

# DNA methylation (Methylation450K)
_450_methylation_hub = "https://pancanatlas.xenahubs.net"
_450_methylation_dataset = "jhu-usc.edu_PANCAN_HumanMethylation450.betaValue_whitelisted.tsv.synapse_download_5096262.xena"
_450_methylation_sitios = ['cg27039625', 'cg17644557', 'cg13315970', 'cg21730993', 'cg10663055', 'cg17488985', 'cg24635468', 'cg23157089', 'cg17966619', 'cg27297263', 'cg00925339', 'cg23172664', 'cg24702826', 'cg09989847']

genes = xena.dataset_field(gene_expression_hub, gene_expression_dataset)

# Check that the chosen gene exists in the expression dataset
gene_found = chosen_gene_code in genes
if not gene_found:
    # Raising stops this cell (and a "Run all") with a clear message; exit() instead
    # asks the IPython kernel itself to terminate.
    raise ValueError("Gene "+chosen_gene_code+" não foi encontrado!")
print("Gene "+chosen_gene_code+" foi encontrado!")

# Phenotype fields available for the samples
phenotypes = xena.dataset_field(phenotype_hub, phenotype_dataset)

# Collect the samples of the expression dataset
number_samples = xena.dataset_samples_n_dense_matrix(gene_expression_hub, gene_expression_dataset)
samples = xena.dataset_samples(gene_expression_hub, gene_expression_dataset, number_samples[0])

# Columns of the sheet. The two 27K probes are also present in the 450K list, so
# their names intentionally appear twice in the header.
columns = ['Sample', chosen_gene_code, *phenotypes, *_450_methylation_sitios, *_27_methylation_sitios]

# Expression value of the chosen gene for every sample
punctuation = xena.dataset_probe_values(gene_expression_hub, gene_expression_dataset, samples, [chosen_gene_code])
# Phenotype values for every sample
phenotype_values = xena.dataset_probe_values(phenotype_hub, phenotype_dataset, samples, phenotypes)
# Methylation values (Methylation450K)
_450_methylation_values = xena.dataset_probe_values(_450_methylation_hub, _450_methylation_dataset, samples, _450_methylation_sitios)
# Methylation values (Methylation27K)
_27_methylation_values = xena.dataset_probe_values(_27_methylation_hub, _27_methylation_dataset, samples, _27_methylation_sitios)

# One list of per-sample values for each column after 'Sample', in header order
value_columns = [
    punctuation[1][0],
    *phenotype_values[1],
    *_450_methylation_values[1],
    *_27_methylation_values[1],
]

# Assemble every row first and build the sheet once; assigning df.loc[index] per
# sample re-allocates the frame on each of the thousands of iterations.
rows = [
    [sample] + [values[index] for values in value_columns]
    for index, sample in enumerate(samples)
]
df = pd.DataFrame(rows, columns=columns)

# Save the sheet
df.to_csv(drive_path+"imported_samples", sep = ',')

**Exibe a planilha**

Planilha que foi anteriormente salva no Google Drive

In [None]:
# Reload the sheet written by the import cell from drive_path and display it
RawData = pd.read_csv(drive_path+"imported_samples", sep=',')
display(RawData)

**Realiza a organização dos dados**

Pares e amostras simples

Salva ambas as planilhas no diretório do Google Drive

In [None]:
# Build two sheets: one with the grouped (paired) samples and one with the samples that have no match
RawData = pd.read_csv(drive_path+"imported_samples", sep=',')
dfList = RawData
original_columns = RawData.columns.values.tolist()

# Bookkeeping columns: 'reading' flags rows that were already exported, 'pairing'
# receives the "<group>.<sample type>" label of paired rows.
dfList['reading'] = 0
# object dtype, because the labels written below are text while the default is 0
dfList['pairing'] = pd.Series(0, index=dfList.index, dtype=object)
pairing = 1

# Groups the samples flagged in a boolean match vector
def check_corresponding(sample_exists):
  """Return ``(row, sample)`` tuples for the rows flagged in ``sample_exists``.

  ``sample_exists`` holds one flag per row of the notebook-level ``dfList``; the
  flagged rows are returned ordered by sample id (01, 06, 11, ...).
  """
  samples = dfList['Sample']
  flags = np.asarray(sample_exists, dtype=object) == True
  # Only the flagged rows are looked up, instead of materialising dfList.loc[row]
  # for every row of the sheet.
  found = [(int(position), samples.loc[position]) for position in np.flatnonzero(flags)]
  return sorted(found, key = lambda x: (x[1]))

paired_labels = []
simple_labels = []
sample_ids = dfList['Sample']
for _, row in dfList.iterrows():
  # Every row whose sample id contains the first 12 characters of this row's id
  sample_exists = sample_ids.str.contains(row['Sample'][:12], case = False)
  check_corresponding_index = check_corresponding(sample_exists)
  if len(check_corresponding_index) > 1:
    # A group of matching samples was found
    found = 0
    for row_label, sample in check_corresponding_index:
      if dfList.at[row_label, 'reading'] == 0:
        dfList.at[row_label, 'pairing'] = str(pairing)+'.'+sample[13]+sample[14]
        paired_labels.append(row_label)
        dfList.at[row_label, 'reading'] = 1
        found = 1
    if found == 1:
      pairing = pairing + 1
  else:
    # A single sample with no match
    row_label = check_corresponding_index[0][0]
    if dfList.at[row_label, 'reading'] == 0:
      simple_labels.append(row_label)
      dfList.at[row_label, 'reading'] = 1

# Build both sheets with one selection each, in the order the rows were collected,
# instead of appending to them row by row inside the loop.
dfPairedSample = dfList.loc[paired_labels, ['pairing'] + original_columns].reset_index(drop=True)
dfSimpleSamples = dfList.loc[simple_labels, original_columns].reset_index(drop=True)

display(dfSimpleSamples)
display(dfPairedSample)

# Save the sheets
dfSimpleSamples.to_csv(drive_path+"unpaired_samples", sep = ',')
dfPairedSample.to_csv(drive_path+"paired_samples", sep = ',')

**Informações/códigos úteis**

In [None]:
# Methylation site columns read from the saved sheets. The two '.1' names are presumably
# how pandas renames the duplicated probe columns (present in both the 450K and 27K lists)
# when the CSV is read back -- TODO confirm against the header of a saved sheet.
methylation = ['cg27039625', 'cg17644557', 'cg13315970', 'cg21730993', 'cg10663055', 'cg17488985', 'cg24635468', 'cg23157089', 'cg17966619', 'cg27297263', 'cg00925339', 'cg23172664', 'cg24702826', 'cg09989847', 'cg17488985.1', 'cg17966619.1']
# (code, abbreviation) pairs; the code is the text form of the numeric
# 'cancer type abbreviation' value and is what later cells compare against.
type_tumor_name = [("1.0", "BLCA"), ("2.0", "BRCA"), ("3.0", "CESC"), ("4.0", "CHOL"), ("5.0", "COAD"), ("7.0", "ESCA"), ("8.0", "GBM"), ("9.0", "HNSC"), ("10.0", "KICH"), ("11.0", "KIRC"), ("12.0", "KIRP"), ("14.0", "LGG"), ("15.0", "LIHC"), ("16.0", "LUAD"), ("17.0", "LUSC"), ("19.0", "OV"), ("20.0", "PAAD"), ("21.0", "PCPG"), ("22.0", "PRAD"), ("23.0", "READ"), ("24.0", "SARC"), ("25.0", "SKCM"), ("26.0", "STAD"), ("27.0", "TGCT"), ("28.0", "THCA"), ("29.0", "THYM"), ("30.0", "UCEC")]


**Estatística descritiva**

In [None]:
# Size on disk of the paired-samples sheet, reported in whole kilobytes
dir_path = "{}paired_samples".format(drive_path)
f_path = os.path.join(dir_path)
f_size = os.path.getsize(f_path)
f_size_kb = f_size / 1024
print('Tamanho do arquivo resultante:', '{} kb'.format(int(f_size_kb)), sep='\n')

In [None]:
# Number of attributes (columns) and instances (rows) in the paired-samples sheet
data = pd.read_csv("{}paired_samples".format(drive_path), sep=',')
print('Quantidade de linhas e colunas:')
# Rows are counted as the non-null entries of the first column
print('{} linhas'.format(data[data.columns[0]].count()))
print('{} colunas'.format(len(data.columns.values.tolist())))

In [None]:
# Number of samples per sample type and per tumor type
# Each entry: [column to count, CSV to read, title printed above the result]
temp_data = [
  ["Sample", drive_path+"imported_samples", "Imported samples - Quantity in sample types:"],
  ["Sample", drive_path+"paired_samples", "Paired samples - Quantity in sample types:"],
  ["cancer type abbreviation", drive_path+"imported_samples", "Imported samples - Number of tumor types:"],
  ["cancer type abbreviation", drive_path+"paired_samples", "Paired samples - Number of tumor types:"],
]
data_column = ['Type', 'Quantity', 'Percentage']
number_sample_type = [[],[],[],[]]

def _count_by_type(values):
  """Return ``[value, count]`` pairs, one per distinct value, sorted by value.

  Values are grouped by their text form, so every missing value (NaN) falls into
  one group; the first value seen is kept as the group's label.
  """
  tally = {}
  for value in values:
    tally.setdefault(str(value), [value, 0])[1] += 1
  return sorted(tally.values(), key = lambda x: (x[0]))

for idx, data_item in enumerate(temp_data):
  # `data` stays a notebook-level name on purpose: after this loop it holds the
  # paired-samples sheet, which later cells keep working on.
  data = pd.read_csv(str(data_item[1]), sep=',')
  if str(data_item[0]) == "Sample":
    # The sample type is the two characters at positions 13 and 14 of the sample id
    type_values = [sample[13]+sample[14] for sample in data["Sample"]]
  else:
    type_values = data[data_item[0]].tolist()

  # Percentages are relative to the non-null entries of the first column
  total = data[data.columns[0]].count()
  number_sample_type[idx] = [
    [type_sample, quantity, (quantity/total)*100]
    for type_sample, quantity in _count_by_type(type_values)
  ]

  print(data_item[2])
  print(str(total)+' samples')

  df = pd.DataFrame(number_sample_type[idx], columns=data_column)
  display(df)
  print()

  grupos = [item[0] for item in number_sample_type[idx]]
  valores = [item[1] for item in number_sample_type[idx]]

  plt.bar(grupos, valores)
  plt.show()
  print()

In [None]:
# Methylation sites: how many samples have a value above zero for each site
vector_merge = [0, 1]
methylation_data = []
columns_methylation_data = ['Methylation', 'Quantity', 'Percentage']
methylation_information = ['Original samples', 'Paired samples']

# number_sample_type holds one entry per temp_data item; anything after that was
# appended by a previous run of this cell. Dropping it keeps the methylation
# entries at the same positions no matter how often the cell is re-run.
del number_sample_type[len(temp_data):]

for idx, vector_merge_item in enumerate(vector_merge):
  df = pd.read_csv(str(temp_data[vector_merge_item][1]), sep=',')
  # Percentages are relative to the non-null entries of the first column
  total = df[df.columns[0]].count()

  site_rows = []
  for methylation_item in methylation:
    # Missing values compare as False, so they are not counted
    with_value = int((df[str(methylation_item)] > 0).sum())
    site_rows.append([methylation_item, with_value, (with_value/total)*100])
  methylation_data.append(site_rows)

  print(methylation_information[idx])
  print()
  dfm = pd.DataFrame(site_rows, columns=columns_methylation_data)
  display(dfm)
  print()
  number_sample_type.append(site_rows)

In [None]:
# Chart generation: imported vs. paired samples for each statistic
vector_merge = [[0, 1], [2, 3], [4, 5]]
vector_merge_names = ["Types of samples", "Tumor types", "Methylation levels"]

def check_exists(type_sample, index):
  """Return the quantity stored for ``type_sample`` in ``number_sample_type[index]``, or 0 if absent."""
  wanted = str(type_sample)
  matches = (entry[1] for entry in number_sample_type[index] if str(entry[0]) == wanted)
  return next(matches, 0)

for idx, (imported_slot, paired_slot) in enumerate(vector_merge):
  imported_entries = number_sample_type[imported_slot]
  vector_merge_type = [entry[0] for entry in imported_entries]
  values_A = [entry[1] for entry in imported_entries]
  values_B = [check_exists(label, paired_slot) for label in vector_merge_type]

  # Build the x axis and the offset that separates the two bars of each group
  fig = plt.figure(figsize=(25, 10), dpi=65, facecolor='w', edgecolor='k')
  x1 = np.arange(len(values_A))
  x2 = [x + 0.40 for x in x1]

  # Draw the bars
  plt.bar(x1, values_A, width=0.40, label='Imported samples', color='blue')
  plt.bar(x2, values_B, width=0.40, label='Paired samples', color='red')

  tick_positions = [x + 0.45 for x in range(len(values_A))]
  plt.xticks(tick_positions, vector_merge_type)

  # Legend and title
  plt.legend()
  plt.title(vector_merge_names[idx])

  plt.show()

  # Blank line between charts
  print()

**Seleção dos dados**

Agrupamento das amostras

In [None]:
# `dfList` is an alias of `data`, not a copy: the 'reading' column added on the
# next line therefore also becomes a column of `data`.
# NOTE(review): `data` is whatever frame the descriptive-statistics cells left
# behind -- presumably the paired_samples sheet; confirm the run order.
dfList = data
dfList['reading'] = 0
# Filled by the loop below with one list of (row, sample) tuples per group
paired_list = []

# Função que realiza o agrupamento das amostras
def check_corresponding(sample_exists):
  """Return ``(row, sample)`` tuples for the rows flagged in ``sample_exists``.

  ``sample_exists`` holds one flag per row of the notebook-level ``dfList``; the
  flagged rows are returned ordered by sample id (01, 06, 11, ...).
  The same function is also defined in the earlier pairing cell; it is repeated
  here so this cell does not depend on that one having been run.
  """
  samples = dfList['Sample']
  flags = np.asarray(sample_exists, dtype=object) == True
  # Only the flagged rows are looked up, instead of materialising dfList.loc[row]
  # for every row of the sheet. The previous insert-at-sample-type step is gone
  # because the list is sorted right afterwards anyway.
  found = [(int(position), samples.loc[position]) for position in np.flatnonzero(flags)]
  return sorted(found, key = lambda x: (x[1]))
def remove_duplicates(lista):
    """Return the distinct items of ``lista`` as a new, sorted list.

    Items are compared with ``==`` (they do not need to be hashable) and the
    input list is left untouched.
    """
    unique_items = []
    for candidate in lista:
        if candidate in unique_items:
            continue
        unique_items.append(candidate)
    return sorted(unique_items)
# Collect, for every row, the group of rows sharing the first 12 characters of its sample id
for _, sample_row in dfList.iterrows():
  id_prefix = sample_row['Sample'][:12]
  same_prefix = dfList['Sample'].str.contains(id_prefix, case=False)
  group = check_corresponding(same_prefix)
  if len(group) > 1:
    # Only groups with more than one sample are kept
    paired_list.append(group)

# Every member of a group produced the same group, so drop the repeats
paired_list = remove_duplicates(paired_list)
print(paired_list)

Removendo as amostras que não serão utilizadas

In [None]:
temp_paired_list = []

for group in paired_list:
  # Sample-type code: whatever follows the first 13 characters of each sample id
  final_sample = [member[1][13:] for member in group]
  if len(group) == 3:
    # 01 + 06 + 11: keep the group without its '06' sample
    if '01' in final_sample and '06' in final_sample and '11' in final_sample:
      # A new list is built, so the groups printed by the previous cell are not altered
      temp_paired_list.append([member for member, code in zip(group, final_sample) if code != '06'])
  elif len(group) == 2:
    # Pairs are kept when they are 05 + 11 or 01 + 11
    if '11' in final_sample and ('05' in final_sample or '01' in final_sample):
      temp_paired_list.append(group)

paired_list = temp_paired_list
print(paired_list)

# Select the rows of every kept sample in one step, in group order, and renumber
# them from 0; this replaces appending to the frame one row at a time.
selected_rows = [member[0] for group in paired_list for member in group]
df_temp = data.loc[selected_rows].reset_index(drop=True)
df_temp_index = len(df_temp)

data = df_temp

# Save the sheet
data.to_csv(drive_path+"paired_and_filtered", sep = ',')

Organiza os dados para gerar o Boxplot de cada tipo tumoral

In [None]:
df_cancer_type_abbreviation = data
# Tumor types counted in the paired samples: [type, quantity, percentage] entries
type_abbreviation = number_sample_type[3]

# name_type[i] is the tumor type code (as text) and types_values[i] its
# [non-tumor values, tumor values] lists of ENSG00000135318.11 expression.
name_type = []
types_values = []
for x_item in type_abbreviation:
  # Entries whose tumor type is missing are skipped
  if str(x_item[0]) == str('nan'):
    continue

  df_temp = df_cancer_type_abbreviation[df_cancer_type_abbreviation['cancer type abbreviation'] == x_item[0]]

  # The values are gathered in local lists and appended together with the name,
  # so name_type and types_values stay aligned even when a skipped entry comes
  # before other tumor types (indexing types_values by the position in
  # type_abbreviation would then point at the wrong list or past the end).
  non_tumor_values = []
  tumor_values = []
  for sample, expression in zip(df_temp['Sample'], df_temp['ENSG00000135318.11']):
    type_indice = sample[13]+sample[14]
    if type_indice == '01' or type_indice == '05':
      tumor_values.append(expression)
    if type_indice == '11':
      non_tumor_values.append(expression)

  name_type.append(str(x_item[0]))
  types_values.append([non_tumor_values, tumor_values])
  display(df_temp)

**Boxplot para cada tipo tumoral**

In [None]:
import plotly.graph_objects as go
# Number of non-tumor values per tumor type, in the order of types_values
tumor_name_list = []

colors = ['royalblue', 'indianred']
name_description = ['Non-tumor', 'Tumor']

# Tumor type code -> abbreviation
abbreviation_by_code = dict(type_tumor_name)

for x_index, x_item in enumerate(types_values):
  non_tumor_total = len(x_item[0])
  tumor_name_list.append(non_tumor_total)
  fig = go.Figure()

  # One box per group: non-tumor first, tumor second
  for group_name, group_values, fill in zip(name_description, x_item, colors):
    box = go.Box(
        y=group_values,
        name=group_name,
        boxpoints='all',
        jitter=0.5,
        whiskerwidth=0.2,
        fillcolor=fill,
        marker_size=7,
        line_width=1,
    )
    fig.add_trace(box)

  tumor_name = abbreviation_by_code.get(str(name_type[x_index]), "")

  y_axis = dict(
      autorange=True,
      showgrid=True,
      zeroline=True,
      dtick=5,
      gridcolor='rgb(255, 255, 255)',
      gridwidth=1,
      zerolinecolor='rgb(255, 255, 255)',
      zerolinewidth=2,
  )
  fig.update_layout(
      title=('Tumor type: '+name_type[x_index]+" - "+tumor_name),
      xaxis=dict(title=('N = '+str(non_tumor_total)), zeroline=False),
      yaxis=y_axis,
      margin=dict(l=40, r=30, b=80, t=100),
      paper_bgcolor='rgb(243, 243, 243)',
      plot_bgcolor='rgb(243, 243, 243)',
      showlegend=True,
  )

  fig.show()

  # Blank line between charts
  print()

**Estatística descritiva e análise estatística das amostras pareadas**

In [None]:
# Result table: one row per tumor type with the p-value, N and the descriptive
# statistics of the non-tumor (NT_*) and tumor (T_*) values; filled by the loop below.
df_temp = pd.DataFrame(columns=['Sample', 'P VALUE', 'N', 'NT_mean', 'NT_median', 'NT_sd', 'NT_min', 'NT_max', 'T_mean', 'T_median', 'T_sd', 'T_min', 'T_max'])

import statistics
from math import sqrt
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from scipy.stats import sem
from scipy.stats import t

def independent_ttest(non_tumor_data, tumor_date):
  """Two-sided p-value of a t-test between the two groups.

  The statistic is the difference of the means divided by the combined standard
  error ``sqrt(se1**2 + se2**2)``, evaluated with ``n1 + n2 - 2`` degrees of freedom.

  Parameters:
      non_tumor_data: values of the first group.
      tumor_date: values of the second group.

  Returns:
      The two-sided p-value.
  """
  mean1, mean2 = mean(non_tumor_data), mean(tumor_date)
  se1, se2 = sem(non_tumor_data), sem(tumor_date)
  sed = sqrt(se1**2.0 + se2**2.0)
  t_stat = (mean1 - mean2) / sed
  degrees_of_freedom = len(non_tumor_data) + len(tumor_date) - 2
  # The survival function gives the upper tail directly. Computing it as
  # 1 - cdf rounds to exactly 0.0 once the cdf is within machine precision of 1.
  p = t.sf(abs(t_stat), degrees_of_freedom) * 2.0
  return p

def p_value(non_tumor_data, tumor_date):
  """Return the t-test p-value formatted with four decimals, or '' if either group is empty."""
  if non_tumor_data == [] or tumor_date == []:
    return ''
  return '%.4f' % (independent_ttest(non_tumor_data, tumor_date))

def _describe_group(values):
  """Return [mean, median, sd, min, max] of ``values``, or five empty strings for an empty list."""
  if values != []:
    return [statistics.mean(values), statistics.median(values), np.std(values), min(values), max(values)]
  return ['', '', '', '', '']

# Tumor type code -> abbreviation
tumor_names_by_code = dict(type_tumor_name)

for x_index, x_item in enumerate(name_type):
  nome_tumor = tumor_names_by_code.get(str(x_item), '')

  non_tumor_data = types_values[x_index][0]
  tumor_date = types_values[x_index][1]

  # p-value
  P_value = p_value(non_tumor_data, tumor_date)

  # N is the number of non-tumor values. It is taken from the data itself rather
  # than from tumor_name_list, which only exists after the boxplot cell has run
  # and is emptied again by the simplified-boxplot cell.
  sample_count = len(non_tumor_data)

  df_temp.loc[x_index] = [
    (x_item+' - '+nome_tumor),
    P_value,
    sample_count,
    *_describe_group(non_tumor_data),
    *_describe_group(tumor_date),
  ]

display(df_temp)

**Seleção dos tipos tumorais que possuem relevância para a aplicação de machine learning**

Removendo os tipos tumorais sem relevância (sem pares de amostras ou com P VALUE maior que 0.01)

In [None]:
# Keep only the tumor types that have samples (N > 0) and a p-value of at most 0.01.
# The values are converted once: an empty or undefined p-value becomes NaN, and NaN
# fails the <= comparison, so such rows are dropped instead of slipping through a
# "> 0.01" test or crashing float('').
p_values = pd.to_numeric(df_temp['P VALUE'], errors='coerce')
sample_counts = pd.to_numeric(df_temp['N'], errors='coerce')
# Filtering with a mask does not depend on the row labels, so the cell can be re-run
df_temp = df_temp[(sample_counts > 0) & (p_values <= 0.01)]

# Order the tumor types by number of pairs
df_temp_p_value = df_temp.sort_values(by='N', ascending=False)
df_relevant_tumor_types = df_temp_p_value
display(df_relevant_tumor_types)

**Gerando boxplot simplificado dos tipos tumorais relevantes**

In [None]:
import seaborn as sns
sns.set_theme(style="whitegrid")

# Labels ("<code> - <abbreviation>") of the tumor types kept as relevant
relevant_names = set(df_relevant_tumor_types['Sample'])
# Tumor type code -> abbreviation
tumor_names_by_code = dict(type_tumor_name)

for x_index, x_item in enumerate(types_values):
  type_code = str(name_type[x_index])
  if type_code not in tumor_names_by_code:
    continue
  nome_tumor = name_type[x_index]+" - "+tumor_names_by_code[type_code]

  # Compare against the 'Sample' column only; testing membership in the whole
  # value matrix could also match unrelated cells.
  if nome_tumor not in relevant_names:
    continue

  # Built only for the types that are plotted. Wrapping the lists in Series pads
  # the shorter group with NaN instead of failing when the lengths differ.
  df = pd.DataFrame({
    'Non-tumor': pd.Series(x_item[0], dtype=float),
    'Tumor': pd.Series(x_item[1], dtype=float),
  })
  long_format = pd.melt(df)

  plt.figure( figsize=(5, 6))
  sns.boxplot(x="variable", y="value", data=long_format, palette={"Non-tumor": "#2b76ca", "Tumor": "#b10026"})
  sns.swarmplot(x="variable", y="value", data=long_format, color="#140f07")
  plt.title(nome_tumor, loc="left")
  plt.show()

  # Blank line between charts
  print()

**Machine Learning**

Removendo as colunas não utilizadas e adicionando o diagnóstico nas amostras de cada tipo tumoral

In [None]:
# Columns kept for the machine learning step
df_temp_colunas_selecionadas = ['Sample', 'ENSG00000135318.11', 'cancer type abbreviation', *methylation]

# Work on an explicit copy: adding a column to a bare column selection of `data`
# is an assignment on a slice, which pandas warns about.
df_temp = data[df_temp_colunas_selecionadas].copy()

# Diagnosis per sample-type code (characters 13 and 14 of the sample id);
# any other code gets an empty label.
sample_type_by_code = {'01': 'Tumor Sample', '05': 'Tumor Sample', '11': 'Normal Sample'}
df_temp['Sample type'] = [
  sample_type_by_code.get(sample[13]+sample[14], '')
  for sample in df_temp['Sample']
]

display(df_temp)
data = df_temp

**Aplicação do Machine Learning**

Hold-out simples ou com grupo de validação

Geração da Curva Roc

In [None]:
## Initial settings (choice of technique and of the tumor types to analyse)
# A specific tumor or 'all' to analyse every tumor type
tumor_type = 'all'

# Validation technique to use (holdout_simple or holdout_validation)
model_validation_techniques = "holdout_simple"

# Parameters: seed and test fraction passed to train_test_split
random_state_parameter = 14
test_size_parameter = 0.50

In [None]:
# Separação das amostras por tipo tumoral conforme a seleção e os testes realizados
# Aplicação do machine learning em cada tipo tumoral

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report

from sklearn.svm import SVC
from sklearn.metrics import RocCurveDisplay

from pandas import DataFrame

import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

# Features analysed: the gene alone first, then the gene combined with each methylation site
methylation_genes = ['ENSG00000135318.11', *methylation]

# One data frame per metric (accuracy, precision, recall and f1), all sharing the
# same header: the tumor type followed by one column per analysed feature
df_temp = ['tumor type', *methylation_genes]
df_accuracy, df_precision, df_recall, df_f1 = (pd.DataFrame(columns=df_temp) for _ in range(4))


def tipo_tumoral_random_forest(df_tipo_tumoral, nomenclatura_tipo_tumoral, tumor_index):
  """Run the analysis of one tumor type for the gene and for the gene plus each methylation site.

  Parameters:
      df_tipo_tumoral: samples of the tumor type.
      nomenclatura_tipo_tumoral: label in the form "<code> - <abbreviation>".
      tumor_index: row of the metric data frames that receives the results.
  """
  # Skip tumor types that were not selected in the settings
  if tumor_type != 'all':
    abbreviation = nomenclatura_tipo_tumoral.split('-')[1].strip(" ")
    if abbreviation not in tumor_type:
      return

  print('Tumor type: '+nomenclatura_tipo_tumoral)
  for index, gene in enumerate(methylation_genes):
    # Temporary table for this analysis (gene alone, or gene + methylation site)
    df_temp = create_df_temp(df_tipo_tumoral, gene)
    # Only the tables that include a methylation site lose the samples without a value
    if index > 0:
      df_temp = remove_samples_no_value(df_temp)

    if len(df_temp) != 0:
      # Machine learning for this combination and tumor type
      applying_machine_learning(df_temp, index, gene, nomenclatura_tipo_tumoral, tumor_index)
    else:
      # No samples left: record 0 for every metric
      insert_value_metrics(tumor_index, gene, [0,0,0,0])

# Criando as seleções de genes
def create_df_temp(df_tipo_tumoral, current_gene):
  """Select the columns of one analysis.

  Returns the ENSG00000135318.11 column, then ``current_gene`` when it is a
  different column, then 'Sample type' and 'Sample'.
  """
  expression_column = 'ENSG00000135318.11'
  extra_columns = [] if current_gene == expression_column else [current_gene]
  return df_tipo_tumoral[[expression_column, *extra_columns, 'Sample type', 'Sample']]

# Removendo amostras que não possuem um valor (nan)
def remove_samples_no_value(df_gene_selection):
  """Return the rows whose second column has a value (its text form is not 'nan').

  The second column is addressed by position with ``iloc``; a plain ``row[1]`` on
  a row labelled by column names relies on a positional fallback that pandas has
  deprecated. The input frame is not modified.
  """
  has_value = df_gene_selection.iloc[:, 1].map(str) != str('nan')
  return df_gene_selection[has_value]

# Aplicação do machine learning
def applying_machine_learning(df_gene_selection, index, gene, nomenclatura_tipo_tumoral, tumor_index):
  """Train and evaluate one feature combination and store its metrics.

  Parameters:
      df_gene_selection: samples with the feature columns and 'Sample type'.
      index: position of ``gene`` in the analysed features; 0 means the gene alone.
      gene: the feature analysed (the gene itself or a methylation site).
      nomenclatura_tipo_tumoral: tumor type label stored in the metric frames.
      tumor_index: row of the metric frames that receives the results.

  Raises:
      ValueError: if ``model_validation_techniques`` is not a known technique.
  """
  X_columns = ['ENSG00000135318.11'] if index == 0 else ['ENSG00000135318.11', gene]
  X = df_gene_selection[X_columns]
  y = df_gene_selection['Sample type']

  if model_validation_techniques == 'holdout_simple':
    # Simple hold-out, without splitting the training set again for validation
    X_train, X_test, y_train, y_test, y_pred = holdout_simple(X, y, random_state_parameter)
  elif model_validation_techniques == 'holdout_validation':
    # Hold-out with the training set split again into a validation group
    X_train, X_test, y_train, y_test, y_pred = holdout_with_validation(X, y)
  else:
    # Without this branch a typo in the setting only surfaces as an unbound variable below
    raise ValueError("Unknown model_validation_techniques: "+repr(model_validation_techniques))

  # Resulting metrics
  value_metrics = [
      metrics.accuracy_score(y_test, y_pred),
      metrics.precision_score(y_test, y_pred, average='macro'),
      metrics.recall_score(y_test, y_pred, average='macro'),
      metrics.f1_score(y_test, y_pred, average='macro')
  ]

  # Store the tumor type in the metric frames
  insert_tumor_type(index, tumor_index, nomenclatura_tipo_tumoral)
  # Store the metric values in the metric frames
  insert_value_metrics(tumor_index, gene, value_metrics)

  # ROC curve
  # Normal samples
  roc_curve(df_gene_selection, tumor_index, index, gene, X_train, y_train, X_test, y_test, "Normal Sample", "Normal samples")
  # Tumor samples
  roc_curve(df_gene_selection, tumor_index, index, gene, X_train, y_train, X_test, y_test, "Tumor Sample", "Tumor samples")

# Hold-out simples
def holdout_simple(X, y, randon_state):
  """Split the data once, fit a random forest on the training part and predict the test part.

  Parameters:
      X: feature columns.
      y: labels.
      randon_state: seed used for the split and for the classifier.

  Returns:
      ``(X_train, X_test, y_train, y_test, y_pred)``.
  """
  # The test fraction comes from test_size_parameter
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_parameter, random_state = randon_state)
  # Seeding the forest as well makes the metrics repeatable between runs; with
  # only the split seeded, the same split could still give different scores.
  clf=RandomForestClassifier(n_estimators=100, random_state=randon_state)
  clf.fit(X_train,y_train)
  y_pred=clf.predict(X_test)
  return X_train, X_test, y_train, y_test, y_pred

# Hold-out com a divisão do treinamento em um novo grupo de validação
def holdout_with_validation(X, y):
  """Hold-out in which the training part is split again into a validation group.

  The training part is re-split with seeds 0 to 100; the seed with the highest
  validation accuracy selects the sub-split used to train the final model, which
  is then evaluated on the untouched test part.

  Returns:
      ``(X_train_f, X_test, y_train_f, y_test, y_pred)``.
  """
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_parameter, random_state = random_state_parameter)

  # Validation accuracy of every candidate seed (the list position is the seed)
  better_model_accuracy = []
  for candidate_seed in range(101):
    _, _, _, y_test_v, y_pred_v = holdout_simple(X_train, y_train, candidate_seed)
    better_model_accuracy.append(metrics.accuracy_score(y_test_v, y_pred_v))

  best_parameter = better_model_accuracy.index(max(better_model_accuracy))
  # Reproduce the winning split directly: same arguments as the split inside
  # holdout_simple, without training a forest whose predictions are discarded.
  X_train_f, _, y_train_f, _ = train_test_split(X_train, y_train, test_size=test_size_parameter, random_state = best_parameter)

  # NOTE(review): the final model only sees the selected sub-split of the
  # training data -- confirm that this is the intended design.
  clf=RandomForestClassifier(n_estimators=100, random_state=random_state_parameter)
  clf.fit(X_train_f,y_train_f)
  y_pred=clf.predict(X_test)
  return X_train_f, X_test, y_train_f, y_test, y_pred

# Gerando a curva roc para cada análise
def roc_curve(df_gene_selection, tumor_index, index, gene, X_train, y_train, X_test, y_test, pos_label, ylabel):
  """Plot the ROC curve of an SVC fitted on the training data, for one positive label.

  Nothing is drawn unless the training set has more than 15 rows and
  ``tumor_index`` is 2.
  NOTE(review): the original comment says the curve is meant for THCA only;
  that holds only while THCA is the row with index 2 -- confirm.
  NOTE(review): the curve comes from an SVC, not from the random forest whose
  metrics are stored; ``df_gene_selection`` is not used.
  """
  enough_training_rows = len(X_train.index) > 15
  if not (enough_training_rows and tumor_index == 2):
    return

  if index == 0:
    description = gene
  else:
    description = "ENSG00000135318.11 + "+gene
  print()

  clf = SVC(random_state=random_state_parameter)
  clf = clf.fit(X_train, y_train)

  RocCurveDisplay.from_estimator(clf, X_test, y_test, pos_label=pos_label)
  plt.title(description)
  plt.xlabel('')
  plt.ylabel(ylabel)
  plt.show()

# Inserindo os tipos tumorais dentro das métricas
def insert_tumor_type(index, tumor_index, nomenclatura_tipo_tumoral):
  """Write the tumor type label into row ``tumor_index`` of every metric frame.

  Only the first analysis of a tumor type (``index == 0``) writes the label.
  """
  if index != 0:
    return
  for metric_frame in (df_accuracy, df_precision, df_recall, df_f1):
    metric_frame.loc[tumor_index, 'tumor type'] = nomenclatura_tipo_tumoral

# Inserido os resultados em cada métrica
def insert_value_metrics(tumor_index, gene, value_metrics):
  """Store accuracy, precision, recall and f1 (in that order) for ``gene`` in row ``tumor_index``."""
  column = str(gene)
  metric_frames = (df_accuracy, df_precision, df_recall, df_f1)
  for position, metric_frame in enumerate(metric_frames):
    metric_frame.loc[tumor_index, column] = value_metrics[position]

# Seleção por tipo tumoral
index = 0
for x_index, x_item in enumerate(df_relevant_tumor_types.iterrows()):
  df_tipo_tumoral = False
  df_tipo_tumoral = pd.DataFrame(columns=data.columns.values.tolist())
  tipo_tumoral_relevante = x_item[1]['Sample'].split('-')

  # Seleção das amostras através do tipo tumoral (x_item)
  tipo_tumoral_relevante_index = 0
  for y_index, y_item in enumerate(data.iterrows()):
    if str(tipo_tumoral_relevante[0].strip(" ")) == str(y_item[1]['cancer type abbreviation']):
      df_tipo_tumoral.loc[tipo_tumoral_relevante_index] = data.loc[y_index]
      tipo_tumoral_relevante_index = tipo_tumoral_relevante_index + 1
  tipo_tumoral_random_forest(df_tipo_tumoral, x_item[1]['Sample'], index)
  index=index+1

**Heat maps**

In [None]:
def delete_column(df_metrics, name_column):
  """Return ``df_metrics`` without ``name_column``; the frame is returned as is when the column is absent."""
  if name_column in df_metrics.columns:
    # The axis is named explicitly: passing it as a second positional argument
    # (drop(name, 1)) is rejected by pandas 2.x.
    df_metrics = df_metrics.drop(columns=name_column)
  return df_metrics

def delete_row(df_metrics, name_row):
  """Return ``df_metrics`` without the rows whose 'tumor type' equals ``name_row``."""
  is_named_row = df_metrics["tumor type"] == name_row
  return df_metrics.drop(df_metrics[is_named_row].index)

def display_heat_map(df_metrics, type_metrics):
  """Print the metric name and draw a heat map of one metric frame.

  Rows are tumor types and columns are the analysed features. The two '.1'
  columns and the '10.0 - KICH' row are left out.
  NOTE(review): the reason for excluding KICH is not visible here -- confirm.

  Parameters:
      df_metrics: metric frame with a 'tumor type' column.
      type_metrics: name of the metric, printed above the chart.
  """
  print('')
  print(type_metrics)

  for unused_column in ('cg17488985.1', 'cg17966619.1'):
    df_metrics = delete_column(df_metrics, unused_column)
  df_metrics = delete_row(df_metrics, '10.0 - KICH')

  df_tumor_types = df_metrics['tumor type'].values.tolist()
  df_metrics = delete_column(df_metrics, 'tumor type')

  # Take the column names from the frame itself. A hard-coded list has to be kept
  # in step with `methylation` by hand and breaks the constructor whenever the
  # number of columns differs.
  df_cols = df_metrics.columns.tolist()

  # Rebuilding from plain lists lets pandas infer numeric column types
  df_formatted = pd.DataFrame(df_metrics.values.tolist(), index=df_tumor_types, columns=df_cols)
  plt.figure(1, figsize=(20, 13))
  cmap = sns.cm.rocket_r
  sns.heatmap(df_formatted, annot=True, cmap=cmap)
  plt.show()

# One heat map per metric, in the order accuracy, precision, recall, f1
for metric_frame, metric_name in (
    (df_accuracy, 'Accuracy'),
    (df_precision, 'Precision'),
    (df_recall, 'Recall'),
    (df_f1, 'F1'),
):
  display_heat_map(metric_frame, metric_name)