In [31]:
import pandas as pd
import numpy as np
import sys
import os

from inmoose import limma

#Find the utils Directory
sys.path.append(os.path.abspath("../../"))
from src.utils.ConvertTextToCsv import TextToCsv

sys.path.append(os.path.abspath("../../"))
from src.utils.Preprocessing import elimnation_zeros, total_type_len_type_cancer

%matplotlib inline

<font size="4">Get the DataFrame from the clinical Data</font>

In [32]:
df_clincal_data = pd.read_csv("../../data/raw/brca_tcga_pub2015_clinical_data.tsv", sep='\t')
list_df = total_type_len_type_cancer(df_clincal_data)
df_clincal_data["Tumor-Cancer"] = list_df

Luminal A: 330 - Total(%): 0.40
Luminal B: 81 - Total(%):0.10
HER2-enriched: 23 - Total(%):0.03
TNBC: 85 - Total(%)0.10 
UNK: 299 - Total(%) 0.37


In [33]:
df_clincal_data["Tumor-Cancer"].unique()

array(['<UNK>', 'Luminal A', 'TNBC', 'Luminal B', 'HER2-enriched'],
      dtype=object)

<font size="4">Get the DataFrame from the mRNA-Seq Data</font>

In [34]:
df_mRNA = pd.read_csv("../../data/raw/data_mrna_seq_v2_rsem.txt", header=None)
df_mRNA_transformed = TextToCsv("../../data/raw/data_mrna_seq_v2_rsem.txt")

Shape of the CSV: (20440, 819)


<font size="4">Transform the mRNA-Seq for merging the two DataFrames</font>

In [35]:
df_mRNA_transformed = df_mRNA_transformed.drop(columns=["Hugo_Symbol", "Entrez_Gene_Id"], axis=0)
df_mrna = df_mRNA_transformed.T.reset_index()
df_mRNA_final = df_mrna.rename(columns={"index":"Sample ID"})

<font size="4">Merging the clinical DataSet and mRNA-Seq</font>

In [36]:
df_merged = pd.merge(df_mRNA_final, df_clincal_data, right_on="Sample ID", left_on="Sample ID")

<font size="4">Getting the information of the columns of the mRNA-Seq and classes</font>

In [37]:
comparation_df = df_merged.loc[
    df_merged["Tumor-Cancer"].isin(["Luminal A", "Luminal B", "TNBC", "HER2-enriched"]),
    ["Tumor-Cancer"] + list(df_merged.columns[1:20441])
]
comparation_df["Tumor-Cancer"].unique()

array(['Luminal A', 'TNBC', 'Luminal B', 'HER2-enriched'], dtype=object)

In [38]:
zeros_reduced_df = elimnation_zeros(comparation_df)

Max of zeros per row in the dataset: 519
Avg of zeros per row in the dataset: 74.00381604696673
Median of zeros per row in the dataset: 0.0
Min of zeros per row in the dataset: 0
After the 0 elimination: 16270


In [39]:
print(f"Samples: {zeros_reduced_df.shape[0]}, Genes: {zeros_reduced_df.shape[1]}")

Samples: 519, Genes: 16270


<font size="4">Transformation of Log2 of the data</font>

In [40]:
metadata = pd.DataFrame(zeros_reduced_df["Tumor-Cancer"], index=zeros_reduced_df.index)
metadata.columns = ["Tumor_Cancer"]
counts_data = zeros_reduced_df.drop(columns=["Tumor-Cancer"])

counts_data = np.log2(counts_data + 1)
expr = counts_data.T
print(f"Genes {counts_data.shape[0]}, Samples {counts_data.shape[1]}")
print("metadata rows:", metadata.shape[0])
print("counts_data rows:", counts_data.shape[0])
print("counts_data cols:", counts_data.shape[1])
(expr.columns == metadata.index).all()

Genes 519, Samples 16269
metadata rows: 519
counts_data rows: 519
counts_data cols: 16269


np.True_

In [41]:
metadata_aligned = metadata.loc[expr.columns].copy()
dummies = pd.get_dummies(metadata_aligned["Tumor_Cancer"]).astype(float)
design = dummies
print(design.shape)
print(design.columns)

dummies

(519, 4)
Index(['HER2-enriched', 'Luminal A', 'Luminal B', 'TNBC'], dtype='object')


Unnamed: 0,HER2-enriched,Luminal A,Luminal B,TNBC
2,0.0,1.0,0.0,0.0
6,0.0,1.0,0.0,0.0
7,0.0,0.0,0.0,1.0
8,0.0,0.0,1.0,0.0
10,0.0,0.0,0.0,1.0
...,...,...,...,...
812,0.0,1.0,0.0,0.0
813,0.0,1.0,0.0,0.0
814,0.0,0.0,0.0,1.0
815,0.0,1.0,0.0,0.0


<font size="4">Limma functions</font>

In [42]:
#This is for fitting the models
limma_fit_models = limma.lmFit(obj=expr, design=design)

#Emperical moderate Bayes (eBayes)
limma_fit_models = limma.eBayes(limma_fit_models)

#Obtain the table of Results
results = limma.topTable(limma_fit_models, number=np.inf)

#Transform to pandas dataframe
results_df = pd.DataFrame(results)
results_df.head()

Unnamed: 0,column0,column1,column2,column3,AveExpr,F,pvalue,adj_pvalue
10188,2.741544,3.980984,3.308898,2.849358,3.635831,859.160812,0.0,0.0
10189,11.317054,12.082887,11.65501,10.554771,11.731901,30101.080167,0.0,0.0
56,9.727526,11.141363,10.757791,9.867774,10.81026,16125.787712,0.0,0.0
10190,7.473603,7.675518,7.483152,7.561101,7.617808,20132.518332,0.0,0.0
58,7.258109,8.073024,7.716143,7.730618,7.925134,6153.749519,0.0,0.0
