# Obtención de datos

En este notebook se obtienen los perfiles de metilación (Illumina Infinium 27k) de la base de datos TCGA-BRCA del portal GDC. Así como el manifiesto del chip Illumina Infinium 450 K

## Importar librerías y definir paths

In [4]:
import sys
from pathlib import Path

# in jupyter (lab / notebook), based on notebook path
src_path = str(Path.cwd().parents[0] / "src")
# Para poder importar "get_data"
if src_path not in sys.path:
    sys.path.append(src_path)

from get_data import download_gdc_data, get_gdc_files_data, concatenate_gdc_files, download_files_from_gd
import json
import pandas as pd
import shutil
import gdown

In [9]:
# Definición de paths
project_path = str(Path.cwd().parents[0])
relative_folder = "data/raw"
base_path = f"{project_path}/{relative_folder}"
# Paths para datos de illumina 27k
unpack_folder = f"{base_path}/tcga"
concatenated_path = f"{base_path}/brca_27k.csv"
# Path para manifiesto
manifest_path = f"{base_path}/manifest.csv"


## Obtención de datos GDC

### Obtención de metadata GDC

In [None]:
in_filters = {"files.cases.primary_site":'breast', "files.data_category":"dna methylation", "files.data_format":"txt",
              "files.data_type":"Methylation Beta Value", "files.platform":'illumina human methylation 27'}

additional_fields = ["file_id","cases.samples.sample_type","cases.disease_type","file_size", 'cases.case_id',
                     "cases.diagnoses.days_to_birth",
                     "cases.demographic.year_of_birth",
                     "cases.demographic.race",
                     "cases.demographic.ethnicity",
                     "cases.diagnoses.age_at_diagnosis",
                     "cases.diagnoses.updated_datetime",
                     "cases.primary_site", "cases.tissue_source_site", "cases.diagnoses.tissue_or_organ_of_origin",
                     "cases.diagnoses.site_of_resection_or_biopsy", "cases.annotations.notes"]

response = get_gdc_files_data(in_filters, additional_fields)
json_response = json.loads(response.content.decode("utf-8"))["data"]["hits"]

In [10]:
df_list = []
# Obtener datos y guardarlos en lista de diccionarios
for file_entry in json_response:
      file_id = file_entry["file_id"]
      file_size = int(file_entry['file_size'])
      sample_type = file_entry['cases'][0]['samples'][0]['sample_type']
      disease_type = file_entry['cases'][0]['disease_type']
      age = file_entry['cases'][0]['diagnoses'][0].get('age_at_diagnosis', None)
      race = file_entry['cases'][0]['demographic'].get('race', None)
      etnia = file_entry['cases'][0]['demographic'].get('ethnicity', None)
      df_list.append({'file_id':file_id, 'size':file_size, 'disease':disease_type,
                       'sample_type':sample_type, 'age':age, 'race':race, 'etnia':etnia})
df = pd.DataFrame(df_list)

In [12]:
df.head()

Unnamed: 0,file_id,size,disease,sample_type,age,race,etnia
0,6877b045-91f1-4030-82ff-b90507e11e17,770500,Ductal and Lobular Neoplasms,Primary Tumor,18594.0,white,not hispanic or latino
1,63e8a64e-b608-4a19-adb5-e62924b88b9a,765566,Ductal and Lobular Neoplasms,Primary Tumor,23878.0,white,not hispanic or latino
2,b94af4b6-0111-4931-9ee4-6fcb90284eb7,771720,Ductal and Lobular Neoplasms,Solid Tissue Normal,18875.0,white,not hispanic or latino
3,a58fd36f-7b98-4494-96f0-4cdd740aeec7,762727,Ductal and Lobular Neoplasms,Primary Tumor,18875.0,white,not hispanic or latino
4,7d074940-ffaf-4463-adc2-be13c5fc4ed0,770970,Ductal and Lobular Neoplasms,Primary Tumor,21793.0,white,not hispanic or latino


### Descarga de datos 

In [23]:
# Descarga
download_gdc_data(base_path, "tcga", list(df['file_id']))

In [28]:
# Descomprimir datos descargados
shutil.unpack_archive(f"{base_path}/tcga.tar.gz", unpack_folder)

### Concatenación y guardado de datos

In [15]:
# Concatenación de archivos de texto
df_gdc = concatenate_gdc_files(unpack_folder)

In [16]:
df_gdc.head()

Unnamed: 0,cg22501393,cg18895155,cg27126442,cg15264255,cg18464559,cg20379125,cg20226593,cg12790134,cg07697569,cg13613532,...,cg23207527,cg23348028,cg20880234,cg15207619,cg10265786,cg08096038,cg05535113,cg26848248,cg09906309,file_id
0,0.043723,0.014719,0.234412,0.08656,0.016786,0.010598,,0.191185,0.016985,0.024959,...,0.945499,0.024654,0.013582,0.048369,0.007553,0.936774,0.130438,0.025019,0.0292,0141a32b-7851-4f58-9d94-19ebf1115c7b
1,0.037681,0.01729,0.279233,0.139277,0.016733,0.017398,,0.013403,0.022282,0.025937,...,0.896225,0.017131,0.025367,0.06789,0.011028,0.943059,0.226756,0.024223,0.141889,0182f6dd-be4d-41a4-ada1-713c3a8d549d
2,0.028953,0.016993,0.291327,0.123915,0.036519,0.017709,,0.201241,0.014975,0.047024,...,0.671114,0.037306,0.019064,0.076381,0.008196,0.917268,0.190455,0.031017,0.049397,0186a5ba-c3a8-485e-b837-103bfc5e6851
3,0.028778,0.017441,0.11183,0.036918,0.012441,0.010421,0.842198,0.015595,0.022527,0.025213,...,0.162775,0.019388,0.023751,0.063533,0.012443,0.880001,,0.034587,0.115159,0290c02d-12bf-430e-b165-5b13cb771393
4,0.030593,0.017929,0.11772,0.054345,0.017583,0.298795,,0.016743,0.023766,0.204609,...,0.139146,0.015526,0.032074,0.070412,0.013096,0.915391,0.135071,0.026909,0.239473,03decfe2-21f8-4e52-9b61-6a3ebb539e45


In [None]:
df_gdc = df_gdc.merge(df ,on='file_id')
df_gdc.head()
# df_samples.to_csv('df_gdc_27k.csv',index='False', sep='\t')

### Método alternativo de descarga GDC

Si en algún momento el portal GDC se cae, se puede utilizar el siguiente metodo para descargar una versión ya procesada de los datos

In [23]:
files = {
         concatenated_path:'1S8drqgJSejzeJaE3OnbRoue50A7SZ0sw'
         }
df_gdc = download_files_from_gd(files)

Downloading...
From (uriginal): https://drive.google.com/uc?id=1S8drqgJSejzeJaE3OnbRoue50A7SZ0sw
From (redirected): https://drive.google.com/uc?id=1S8drqgJSejzeJaE3OnbRoue50A7SZ0sw&confirm=t&uuid=89938bda-d555-42b4-be4e-164ccbd0bed7
To: c:\Users\drago\Documents\GitHub\Proyecto_Metilacion_ADN\data\raw\brca_27k.csv
100%|██████████| 158M/158M [00:03<00:00, 43.3MB/s] 


In [14]:
#module_path = str(Path.cwd())
#print(base_path)

# in jupyter (lab / notebook), based on notebook path
module_path = str(Path.cwd())

# in standard python
#module_path = str(Path.cwd(__file__).parents[0] / "py")

print(module_path)

c:\Users\drago\Documents\GitHub\Proyecto_Metilacion_ADN


## Obtención de manifiesto Illumina

In [7]:
# Obtener manifiesto de maquina Illumina 450k
url = 'https://webdata.illumina.com/downloads/productfiles/humanmethylation450/humanmethylation450_15017482_v1-2.csv'
url_27k = "https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/humanmethylation27/productsupportfiles/illumina_humanmethylation27_content.xlsx"
manifest = pd.read_csv(url, skiprows = 7)
manifest.head()

  manifest = pd.read_csv(url, skiprows = 7)


Unnamed: 0,IlmnID,Name,AddressA_ID,AlleleA_ProbeSeq,AddressB_ID,AlleleB_ProbeSeq,Infinium_Design_Type,Next_Base,Color_Channel,Forward_Sequence,...,UCSC_RefGene_Group,UCSC_CpG_Islands_Name,Relation_to_UCSC_CpG_Island,Phantom,DMR,Enhancer,HMM_Island,Regulatory_Feature_Name,Regulatory_Feature_Group,DHS
0,cg00035864,cg00035864,31729416,AAAACACTAACAATCTTATCCACATAAACCCTTAAATTTATCTCAA...,,,II,,,AATCCAAAGATGATGGAGGAGTGCCCGCTCATGATGTGAAGTACCT...,...,TSS1500,,,,,,,,,
1,cg00050873,cg00050873,32735311,ACAAAAAAACAACACACAACTATAATAATTTTTAAAATAAATAAAC...,31717405.0,ACGAAAAAACAACGCACAACTATAATAATTTTTAAAATAAATAAAC...,I,A,Red,TATCTCTGTCTGGCGAGGAGGCAACGCACAACTGTGGTGGTTTTTG...,...,Body;TSS1500,chrY:9363680-9363943,N_Shore,,,,Y:9973136-9976273,,,
2,cg00061679,cg00061679,28780415,AAAACATTAAAAAACTAATTCACTACTATTTAATTACTTTATTTTC...,,,II,,,TCAACAAATGAGAGACATTGAAGAACTAATTCACTACTATTTGGTT...,...,Body;Body;Body,,,,,,,,,
3,cg00063477,cg00063477,16712347,TATTCTTCCACACAAAATACTAAACRTATATTTACAAAAATACTTC...,,,II,,,CTCCTGTACTTGTTCATTAAATAATGATTCCTTGGATATACCAAGT...,...,Body,chrY:22737825-22738052,S_Shelf,,,,,,,
4,cg00121626,cg00121626,19779393,AAAACTAATAAAAATAACTTACAAACCAAATACTATACCCTACAAC...,,,II,,,AGGTGAATGAAGAGACTAATGGGAGTGGCTTGCAAGCCAGGTACTG...,...,Body,chrY:21664481-21665063,N_Shore,,,,,,,


In [10]:
# Guardar manifiesto
manifest.to_csv(manifest_path, index=False)