In [17]:
import pandas as pd
import sys
import os
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from pydeseq2.default_inference import DefaultInference
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from src.utils.ConvertTextToCsv import TextToCsv

sys.path.append(os.path.abspath("../../"))
from src.utils.Preprocessing import classify_cancer_type

In [18]:
df_clincal_data = pd.read_csv("../../data/raw/brca_tcga_pub2015_clinical_data.tsv", sep='\t')
print(len(df_clincal_data.index))

print(f"Number of rows: {df_clincal_data.shape[0]} and number of columns: {df_clincal_data.shape[1]}")

818
Number of rows: 818 and number of columns: 110


In [19]:
df_clincal_data["Cancer Type"].unique()

array(['Invasive Breast Carcinoma'], dtype=object)

In [20]:
df_clincal_data["ER Status By IHC"].head(20)

0     Positive
1     Positive
2     Positive
3     Positive
4     Negative
5     Positive
6     Positive
7     Negative
8     Positive
9     Positive
10    Negative
11    Positive
12    Positive
13    Negative
14    Negative
15    Positive
16    Negative
17    Negative
18    Positive
19    Negative
Name: ER Status By IHC, dtype: object

In [21]:
df_clincal_data["PR status by ihc"].head(10)

0    Negative
1    Positive
2    Positive
3    Positive
4    Positive
5    Positive
6    Positive
7    Negative
8    Negative
9    Positive
Name: PR status by ihc, dtype: object

In [22]:
df_clincal_data["HER2 ihc score"].dropna().head(20)

2     1.0
4     2.0
6     2.0
7     0.0
8     3.0
10    0.0
14    2.0
15    2.0
16    2.0
17    1.0
18    2.0
20    3.0
21    1.0
23    1.0
24    0.0
27    2.0
29    2.0
30    2.0
31    2.0
32    1.0
Name: HER2 ihc score, dtype: float64

In [23]:
print(df_clincal_data["HER2 fish status"].isna().sum() / len(df_clincal_data))
print(df_clincal_data["HER2 ihc score"].isna().sum() / len(df_clincal_data))


0.6149144254278729
0.4315403422982885


In [24]:
df_clincal_data["HER2 ihc score"].unique()

array([nan,  1.,  2.,  0.,  3.])

In [25]:
list_df = classify_cancer_type(df_clinical_data=df_clincal_data)

luminal_A = [x for x in list_df if x == "Luminal A"]
luminal_B = [x for x in list_df if x == "Luminal B"]
HER2_enriched = [x for x in list_df if x == "HER2-enriched"]
TNBC = [x for x in list_df if x == "TNBC"]
UNK = [x for x in list_df if x == "<UNK>"]

print(f"Luminal A: {len(luminal_A)} - Total(%): {len(luminal_A) / len(df_clincal_data):.2f}")
print(f"Luminal B: {len(luminal_B)} - Total(%):{len(luminal_B) / len(df_clincal_data):.2f}")
print(f"HER2-enriched: {len(HER2_enriched)} - Total(%):{len(HER2_enriched) / len(df_clincal_data):.2f}")
print(f"TNBC: {len(TNBC)} - Total(%){len(TNBC) / len(df_clincal_data):.2f} ")
print(f"UNK: {len(UNK)} - Total(%) {len(UNK) / len(df_clincal_data):.2f}")

df_clincal_data["Tumor-Cancer"] = list_df



Luminal A: 330 - Total(%): 0.40
Luminal B: 81 - Total(%):0.10
HER2-enriched: 23 - Total(%):0.03
TNBC: 85 - Total(%)0.10 
UNK: 299 - Total(%) 0.37


In [26]:
df_clincal_data["Tumor-Cancer"].unique()

array(['<UNK>', 'Luminal A', 'TNBC', 'Luminal B', 'HER2-enriched'],
      dtype=object)

In [27]:
#Luminal A 
luminal_a_dataset = df_clincal_data[df_clincal_data["Tumor-Cancer"] == "Luminal A"]

#Luminal B
luminal_b_dataset = df_clincal_data[df_clincal_data["Tumor-Cancer"] == "Luminal B"]

In [28]:
df_mRNA = pd.read_csv("../../data/raw/data_mrna_seq_v2_rsem.txt", header=None)

In [30]:
df_mRNA_transformed = TextToCsv("../../data/raw/data_mrna_seq_v2_rsem.txt")

Shape of the CSV: (20440, 819)


Clincal Data Set

In [32]:
df_clincal_data

Unnamed: 0,Study ID,Patient ID,Sample ID,Diagnosis Age,American Joint Committee on Cancer Metastasis Stage Code,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code,Neoplasm Disease Stage American Joint Committee on Cancer Code,American Joint Committee on Cancer Publication Version Type,American Joint Committee on Cancer Tumor Stage Code,Brachytherapy first reference point administered total dose,...,Staging System,Staging System.1,Surgery for positive margins,Surgery for positive margins other,Surgical procedure first,Tissue Source Site,TMB (nonsynonymous),Person Neoplasm Status,Tumor Disease Anatomic Site,Tumor-Cancer
0,brca_tcga_pub2015,TCGA-A1-A0SB,TCGA-A1-A0SB-01,70.0,M0,N0,Stage I,6th,T1c,,...,Sentinel node biopsy alone,,,,Lumpectomy,A1,0.600000,TUMOR FREE,Breast,<UNK>
1,brca_tcga_pub2015,TCGA-A1-A0SD,TCGA-A1-A0SD-01,59.0,M0,N0,Stage IIA,6th,T2,,...,Sentinel lymph node biopsy plus axillary disse...,,Lumpectomy,,Lumpectomy,A1,1.000000,,Breast,<UNK>
2,brca_tcga_pub2015,TCGA-A1-A0SE,TCGA-A1-A0SE-01,56.0,M0,N0 (i-),Stage I,6th,T1c,,...,Sentinel lymph node biopsy plus axillary disse...,,,,Modified Radical Mastectomy,A1,0.700000,TUMOR FREE,Breast,Luminal A
3,brca_tcga_pub2015,TCGA-A1-A0SF,TCGA-A1-A0SF-01,54.0,M0,N0,Stage IIA,6th,T2,,...,Sentinel lymph node biopsy plus axillary disse...,,,,Modified Radical Mastectomy,A1,1.266667,TUMOR FREE,Breast,<UNK>
4,brca_tcga_pub2015,TCGA-A1-A0SH,TCGA-A1-A0SH-01,39.0,M0,N0 (i-),Stage IIA,6th,T2,,...,Sentinel node biopsy alone,,,,Lumpectomy,A1,2.600000,TUMOR FREE,Breast,<UNK>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,brca_tcga_pub2015,TCGA-MS-A51U,TCGA-MS-A51U-01,44.0,M0,N1,Stage IIB,7th,T2,,...,,,,,Modified Radical Mastectomy,MS,0.433333,WITH TUMOR,Breast,Luminal A
814,brca_tcga_pub2015,TCGA-OL-A66H,TCGA-OL-A66H-01,,MX,N1mi,Stage IB,7th,T1c,,...,,,,,Lumpectomy,OL,0.700000,TUMOR FREE,Breast,Luminal A
815,brca_tcga_pub2015,TCGA-OL-A66I,TCGA-OL-A66I-01,36.0,MX,N1mi,Stage IIA,6th,T1c,,...,Sentinel node biopsy alone,,,,Lumpectomy,OL,0.966667,TUMOR FREE,Breast,TNBC
816,brca_tcga_pub2015,TCGA-OL-A66J,TCGA-OL-A66J-01,80.0,MX,N0,Stage I,6th,T1c,,...,Sentinel node biopsy alone,,,,Lumpectomy,OL,1.733333,TUMOR FREE,Breast,Luminal A


mRNA seq

In [33]:
df_mRNA_transformed

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-A1-A0SB-01,TCGA-A1-A0SD-01,TCGA-A1-A0SE-01,TCGA-A1-A0SF-01,TCGA-A1-A0SH-01,TCGA-A1-A0SI-01,TCGA-A1-A0SJ-01,TCGA-A1-A0SK-01,...,TCGA-LL-A5YM-01,TCGA-LL-A5YN-01,TCGA-LL-A5YO-01,TCGA-LL-A5YP-01,TCGA-LQ-A4E4-01,TCGA-MS-A51U-01,TCGA-OL-A66H-01,TCGA-OL-A66I-01,TCGA-OL-A66J-01,TCGA-OL-A66K-01
0,UBE2Q2P2,100134869,14.3935,11.3241,4.4426,10.7401,3.0048,2.9782,4.9419,28.8560,...,0.0000,2.9624,3.6899,6.3020,14.1288,7.9343,2.2519,1.2603,5.0428,4.3892
1,HMGB1P1,10357,116.3870,60.2630,153.1452,141.1933,79.8003,63.5491,134.8733,1119.1932,...,101.2865,100.3083,278.5626,206.4376,117.0300,150.6834,115.3378,158.3599,124.6327,106.3475
2,LOC155060,155060,279.7612,83.6986,74.7018,314.4482,95.7054,149.7940,63.6488,166.7192,...,130.9387,367.2936,165.7717,103.9783,307.2124,540.4278,338.1985,210.7460,323.1185,554.5539
3,RNU12-2P,26823,0.4505,0.3308,0.0000,0.0000,0.0000,0.2943,0.3658,0.3152,...,0.0000,0.5225,0.0000,0.0000,0.9974,1.5649,2.0851,0.4173,0.4507,1.2434
4,SSX9,280660,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20435,ZYX,7791,6186.7327,3559.6725,3007.8157,5343.4779,5464.9614,4515.3031,3032.0988,591.2386,...,5421.4750,5774.8171,6722.2911,11514.4665,2721.5227,4930.6208,4163.4696,6621.1789,5216.7643,4961.5584
20436,ZZEF1,23140,1931.2986,1278.9678,926.3677,934.3482,1312.7898,1010.5945,958.3905,1798.9285,...,368.3607,964.4723,1176.9241,1154.1591,1247.8010,2212.8326,1251.8766,649.3479,1408.7427,1027.8728
20437,ZZZ3,26009,1436.1978,1195.6000,1075.4422,508.0867,1001.1151,959.3879,718.7929,1335.6445,...,150.8088,508.3595,606.3610,494.1230,1093.1973,665.6234,773.1443,780.3860,713.3844,855.8699
20438,TPTEP1,387590,552.3144,86.0144,866.1456,52.2652,455.7746,30.6062,43.5299,49.4800,...,30.5694,17.2414,207.0770,73.6890,25.9335,249.8696,5.8382,47.1570,567.3727,37.3018
