# Limpieza y obtención de genes

En este notebook se limpian los datos de perfiles de metilación, se mapean los sitios CpG a genes y se guardan los datos ya limpios.

## Librerías y paths

In [29]:
import sys
from pathlib import Path
import os

# Resolve the project root once: the parent of the notebook's working directory.
project_root = Path.cwd().parents[0]

# Make <project_root>/src importable so the local-module imports that follow
# in this cell can be resolved.
src_path = str(project_root / "src")
if src_path not in sys.path:
    sys.path.append(src_path)

# Data locations, kept as plain strings because later cells build file paths
# from them with f-strings.
data_path = str(project_root / "data")
data_raw_path = str(project_root / "data" / "raw")

from gene_preprocessing import get_gene_df
from get_data import create_folders
import pandas as pd

In [6]:
# Load the raw inputs from data/raw.
# brca_27k.csv is parsed as tab-separated (sep="\t") despite its .csv extension.
df_gdc = pd.read_csv(f"{data_raw_path}/brca_27k.csv", sep="\t")
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the saved output of this cell echoes this line, which looks like
# the tail of a pandas DtypeWarning (mixed-type columns) - confirm that the
# warning is gone after this change.
df_manifest = pd.read_csv(f"{data_raw_path}/manifest.csv", low_memory=False)

  df_manifest = pd.read_csv(f"{data_raw_path}/manifest.csv")


## Limpieza y mapeo

In [31]:
# Clean the raw table. df_gdc is reassigned in place, so every step must be
# safe to run more than once.
df_gdc = (
    df_gdc
    # Drop rows whose age is missing.
    .dropna(subset=["age"])
    # Drop every column that still contains at least one missing value.
    .dropna(axis=1)
    # Exclude metastatic samples.
    .loc[lambda frame: frame["sample_type"] != "Metastatic"]
    # Remove the 'Unnamed: 0' column (presumably a row index saved in the raw
    # file - TODO confirm). errors="ignore" keeps the cell re-runnable: without
    # it, a second execution raises KeyError because the column is already gone.
    .drop(columns=["Unnamed: 0"], errors="ignore")
)

In [17]:
# Keep only the manifest columns needed for the mapping and drop every row with
# a missing value in any of them. Per the original note, the missing
# Methyl27_Loci values are what removes probes absent from the Illumina 27k
# array - TODO confirm against the manifest.
manifest_columns = ["IlmnID", "UCSC_RefGene_Name", "Methyl27_Loci"]
manifest_no_na = df_manifest[manifest_columns].dropna()

# Build the gene -> list-of-CpG table: one row per distinct UCSC_RefGene_Name
# value, with the matching IlmnID values collected into a Python list.
mapping = (
    manifest_no_na[["IlmnID", "UCSC_RefGene_Name"]]
    .groupby("UCSC_RefGene_Name")
    .agg(lambda probe_ids: probe_ids.tolist())
    .reset_index()
)

In [18]:
# Build the gene-level frame with the project helper get_gene_df (defined in
# gene_preprocessing; its exact semantics are not visible in this notebook),
# then drop every resulting column that contains a missing value.
df_gene = get_gene_df(df_gdc, mapping).dropna(axis="columns")

In [24]:
# Append the variables of interest to the gene-level frame.
covariate_columns = ["sample_type", "age", "race", "etnia"]
df_gene = pd.concat(
    [
        # Drop covariates that are already present so the cell is idempotent:
        # without this, every re-run appends a duplicate set of these columns.
        df_gene.drop(columns=covariate_columns, errors="ignore"),
        df_gdc[covariate_columns],
    ],
    # Column-wise concat aligns rows on the index.
    # NOTE(review): assumes get_gene_df preserves df_gdc's index - confirm.
    axis=1,
)

In [32]:
# Output folder for the cleaned tables. create_folders is a project helper from
# get_data (presumably creates the folder if it is missing - TODO confirm).
data_preprocessed_path = f"{data_path}/preprocessed"
create_folders(data_preprocessed_path)

# Save both cleaned tables as CSV, without writing the index.
gene_csv_path = f"{data_preprocessed_path}/gene_preprocessed.csv"
cpg_csv_path = f"{data_preprocessed_path}/cpg_preprocessed.csv"
df_gene.to_csv(gene_csv_path, index=False)
df_gdc.to_csv(cpg_csv_path, index=False)