# Limpieza y obtención de genes

En este notebook se limpian los datos de perfiles de metilación, se mapean los sitios CpG a genes y se guardan los datos ya limpios.

## Librerías y paths

In [1]:
import sys
from pathlib import Path
import os

# All project folders are resolved relative to the parent of the current
# working directory.
project_root = Path.cwd().parents[0]

# Put the project's `src` folder on the import path so the local modules
# imported below can be found; skip it if it is already registered.
src_path = str(project_root / "src")
if src_path not in sys.path:
    sys.path.append(src_path)

# Data locations are kept as plain strings because later cells build file
# names from them with f-strings.
data_path = str(project_root / "data")
data_raw_path = str(project_root / "data" / "raw")

from gene_preprocessing import get_gene_df
from get_data import create_folders
import pandas as pd

In [2]:
# Load the methylation table; the file is tab-separated despite its .csv
# extension.
df_gdc = pd.read_csv(f"{data_raw_path}/brca_27k.csv", sep="\t")
# Load the array manifest. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk.
# NOTE(review): the line echoed in this cell's output looks like the tail of
# a pandas DtypeWarning (mixed types) raised by this read - confirm.
df_manifest = pd.read_csv(f"{data_raw_path}/manifest.csv", low_memory=False)

  df_manifest = pd.read_csv(f"{data_raw_path}/manifest.csv")


## Limpieza y mapeo

In [3]:
# Drop the samples whose age is missing.
df_gdc = df_gdc.dropna(subset=["age"])
# Drop every column that still contains at least one missing value.
df_gdc = df_gdc.dropna(axis=1)
# Exclude metastatic samples.
df_gdc = df_gdc[df_gdc["sample_type"] != "Metastatic"]
# Remove the 'Unnamed: 0' column (presumably a row index that was written
# into the CSV - confirm).
df_gdc = df_gdc.drop(columns=["Unnamed: 0"])
# The row filters above leave gaps in the index. Renumber the rows 0..n-1 so
# that later index-aligned operations (e.g. pd.concat(..., axis=1) with
# frames derived from this one) line up row by row whether or not the
# helper that builds those frames preserves the original index.
df_gdc = df_gdc.reset_index(drop=True)

In [4]:
# Keep only the manifest columns needed for the mapping and discard every
# probe with a missing value in any of them (per the original author's note,
# this removes the rows that do not appear in Illumina 27k).
manifest_columns = ["IlmnID", "UCSC_RefGene_Name", "Methyl27_Loci"]
manifest_no_na = df_manifest[manifest_columns].dropna()

# Build the gene -> list-of-CpG-ids table: one row per distinct
# UCSC_RefGene_Name value, with the IlmnID values of that group as a list.
# NOTE(review): grouping uses the raw UCSC_RefGene_Name string, so values
# such as "A2ML1" and "A2ML1;A2ML1" end up as separate groups (both appear
# as columns in the previews further down) - confirm this is intended.
mapping = (
    manifest_no_na
    .groupby("UCSC_RefGene_Name")[["IlmnID"]]
    .agg(lambda probe_ids: probe_ids.tolist())
    .reset_index()
)

In [5]:
# Build the gene-level table from the CpG-level data using the project
# helper's default settings, then drop every column that contains a
# missing value.
df_gene = get_gene_df(df_gdc, mapping).dropna(axis=1)

In [6]:
# Same gene-level table, built with the helper's 'median' option
# (presumably the per-gene aggregation method - confirm against
# gene_preprocessing.get_gene_df); columns with missing values are dropped.
df_gene_median = get_gene_df(df_gdc, mapping, 'median').dropna(axis=1)

In [7]:
# Preview the first rows of the gene-level table.
df_gene.head()

Unnamed: 0,A2BP1;A2BP1,A2ML1,A2ML1;A2ML1,A4GALT,A4GNT;A4GNT,AAAS,AACS,AADAT;AADAT,AANAT;AANAT,AANAT;AANAT;AANAT,...,ZSCAN4,ZSWIM1,ZUFSP;ZUFSP,ZW10,ZW10;ZW10,ZWILCH;ZWILCH;RPL4;ZWILCH,ZWINT;ZWINT;ZWINT,ZYX;ZYX,ZZEF1;CYB5D2;CYB5D2;CYB5D2,ZZEF1;CYB5D2;CYB5D2;CYB5D2;CYB5D2
0,0.044423,0.768291,0.908337,0.252334,0.953219,0.086182,0.018864,0.036382,0.868332,0.538797,...,0.67222,0.025536,0.023444,0.069751,0.014844,0.028244,0.013301,0.083322,0.029899,0.024695
1,0.026814,0.132678,0.141399,0.266167,0.499529,0.018681,0.03735,0.020863,0.808839,0.52066,...,0.729969,0.039985,0.014283,0.062165,0.010062,0.047341,0.02003,0.023045,0.03425,0.011238
2,0.083193,0.635636,0.863555,0.288977,0.93438,0.029737,0.037866,0.046084,0.884297,0.727605,...,0.963057,0.027186,0.013825,0.045726,0.013635,0.029947,0.018374,0.018324,0.031341,0.012968
3,0.015045,0.194414,0.547587,0.320244,0.448468,0.053222,0.020214,0.030711,0.936687,0.928537,...,0.596809,0.02762,0.014695,0.0412,0.00689,0.03863,0.011707,0.06371,0.037068,0.014655
4,0.086755,0.289458,0.529573,0.346638,0.65576,0.0171,0.038643,0.031061,0.78524,0.478493,...,0.683186,0.030621,0.0151,0.038555,0.01124,0.037962,0.019641,0.010095,0.049518,0.014894


In [8]:
# Preview the first rows of the gene-level table built with the 'median' option.
df_gene_median.head()

Unnamed: 0,A2BP1;A2BP1,A2ML1,A2ML1;A2ML1,A4GALT,A4GNT;A4GNT,AAAS,AACS,AADAT;AADAT,AANAT;AANAT,AANAT;AANAT;AANAT,...,ZSCAN4,ZSWIM1,ZUFSP;ZUFSP,ZW10,ZW10;ZW10,ZWILCH;ZWILCH;RPL4;ZWILCH,ZWINT;ZWINT;ZWINT,ZYX;ZYX,ZZEF1;CYB5D2;CYB5D2;CYB5D2,ZZEF1;CYB5D2;CYB5D2;CYB5D2;CYB5D2
0,0.044423,0.768291,0.908337,0.252334,0.953219,0.086182,0.018864,0.036382,0.868332,0.538797,...,0.67222,0.025536,0.023444,0.069751,0.014844,0.028244,0.013301,0.083322,0.029899,0.024695
1,0.026814,0.132678,0.141399,0.266167,0.499529,0.018681,0.03735,0.020863,0.808839,0.52066,...,0.729969,0.039985,0.014283,0.062165,0.010062,0.047341,0.02003,0.023045,0.03425,0.011238
2,0.083193,0.635636,0.863555,0.288977,0.93438,0.029737,0.037866,0.046084,0.884297,0.727605,...,0.963057,0.027186,0.013825,0.045726,0.013635,0.029947,0.018374,0.018324,0.031341,0.012968
3,0.015045,0.194414,0.547587,0.320244,0.448468,0.053222,0.020214,0.030711,0.936687,0.928537,...,0.596809,0.02762,0.014695,0.0412,0.00689,0.03863,0.011707,0.06371,0.037068,0.014655
4,0.086755,0.289458,0.529573,0.346638,0.65576,0.0171,0.038643,0.031061,0.78524,0.478493,...,0.683186,0.030621,0.0151,0.038555,0.01124,0.037962,0.019641,0.010095,0.049518,0.014894


In [9]:
# Append the variables of interest to both gene-level tables.
# NOTE(review): pd.concat with axis=1 aligns rows by index label, so this
# relies on df_gene / df_gene_median carrying the same index as df_gdc -
# confirm against get_gene_df.
covariate_columns = ['sample_type', 'age', 'race', 'etnia']
covariates = df_gdc[covariate_columns]
df_gene = pd.concat([df_gene, covariates], axis=1)
df_gene_median = pd.concat([df_gene_median, covariates], axis=1)

In [10]:
# Output folder for the cleaned tables; create_folders is a project helper
# (presumably it creates the directory when missing - confirm in get_data).
data_preprocessed_path = f"{data_path}/preprocessed"
create_folders(data_preprocessed_path)

# Write each table as CSV without the index column, in a fixed order:
# gene-level table, gene-level 'median' table, cleaned CpG-level table.
outputs = {
    "gene_preprocessed.csv": df_gene,
    "gene_preprocessed_median.csv": df_gene_median,
    "cpg_preprocessed.csv": df_gdc,
}
for file_name, frame in outputs.items():
    frame.to_csv(f"{data_preprocessed_path}/{file_name}", index=False)