In [2]:
import pandas as pd
import sys
import os
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from pydeseq2.default_inference import DefaultInference
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

sys.path.append(os.path.abspath("../../"))
from src.utils.ConvertTextToCsv import TextToCsv
from src.utils.Preprocessing import classify_cancer_type, elimnation_zeros

In [3]:
df_clincal_data = pd.read_csv("../../data/raw/brca_tcga_pub2015_clinical_data.tsv", sep='\t')
print(len(df_clincal_data.index))

print(f"Number of rows: {df_clincal_data.shape[0]} and number of columns: {df_clincal_data.shape[1]}")

818
Number of rows: 818 and number of columns: 110


In [4]:
df_clincal_data["Cancer Type"].unique()

array(['Invasive Breast Carcinoma'], dtype=object)

In [5]:
df_clincal_data["ER Status By IHC"].head(20)

0     Positive
1     Positive
2     Positive
3     Positive
4     Negative
5     Positive
6     Positive
7     Negative
8     Positive
9     Positive
10    Negative
11    Positive
12    Positive
13    Negative
14    Negative
15    Positive
16    Negative
17    Negative
18    Positive
19    Negative
Name: ER Status By IHC, dtype: object

In [6]:
df_clincal_data["PR status by ihc"].head(10)

0    Negative
1    Positive
2    Positive
3    Positive
4    Positive
5    Positive
6    Positive
7    Negative
8    Negative
9    Positive
Name: PR status by ihc, dtype: object

In [7]:
df_clincal_data["HER2 ihc score"].dropna().head(20)

2     1.0
4     2.0
6     2.0
7     0.0
8     3.0
10    0.0
14    2.0
15    2.0
16    2.0
17    1.0
18    2.0
20    3.0
21    1.0
23    1.0
24    0.0
27    2.0
29    2.0
30    2.0
31    2.0
32    1.0
Name: HER2 ihc score, dtype: float64

In [8]:
print(df_clincal_data["HER2 fish status"].isna().sum() / len(df_clincal_data))
print(df_clincal_data["HER2 ihc score"].isna().sum() / len(df_clincal_data))


0.6149144254278729
0.4315403422982885


In [9]:
df_clincal_data["HER2 ihc score"].unique()

array([nan,  1.,  2.,  0.,  3.])

In [10]:
list_df = classify_cancer_type(df_clinical_data=df_clincal_data)

luminal_A = [x for x in list_df if x == "Luminal A"]
luminal_B = [x for x in list_df if x == "Luminal B"]
HER2_enriched = [x for x in list_df if x == "HER2-enriched"]
TNBC = [x for x in list_df if x == "TNBC"]
UNK = [x for x in list_df if x == "<UNK>"]

print(f"Luminal A: {len(luminal_A)} - Total(%): {len(luminal_A) / len(df_clincal_data):.2f}")
print(f"Luminal B: {len(luminal_B)} - Total(%):{len(luminal_B) / len(df_clincal_data):.2f}")
print(f"HER2-enriched: {len(HER2_enriched)} - Total(%):{len(HER2_enriched) / len(df_clincal_data):.2f}")
print(f"TNBC: {len(TNBC)} - Total(%){len(TNBC) / len(df_clincal_data):.2f} ")
print(f"UNK: {len(UNK)} - Total(%) {len(UNK) / len(df_clincal_data):.2f}")

df_clincal_data["Tumor-Cancer"] = list_df



Luminal A: 330 - Total(%): 0.40
Luminal B: 81 - Total(%):0.10
HER2-enriched: 23 - Total(%):0.03
TNBC: 85 - Total(%)0.10 
UNK: 299 - Total(%) 0.37


In [11]:
df_clincal_data["Tumor-Cancer"].unique()

array(['<UNK>', 'Luminal A', 'TNBC', 'Luminal B', 'HER2-enriched'],
      dtype=object)

In [12]:
#Luminal A 
luminal_a_dataset = df_clincal_data[df_clincal_data["Tumor-Cancer"] == "Luminal A"]

#Luminal B
luminal_b_dataset = df_clincal_data[df_clincal_data["Tumor-Cancer"] == "Luminal B"]

In [13]:
df_mRNA = pd.read_csv("../../data/raw/data_mrna_seq_v2_rsem.txt", header=None)

In [14]:
df_mRNA_transformed = TextToCsv("../../data/raw/data_mrna_seq_v2_rsem.txt")

Shape of the CSV: (20440, 819)


Clincal Data Set

In [15]:
df_clincal_data

Unnamed: 0,Study ID,Patient ID,Sample ID,Diagnosis Age,American Joint Committee on Cancer Metastasis Stage Code,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code,Neoplasm Disease Stage American Joint Committee on Cancer Code,American Joint Committee on Cancer Publication Version Type,American Joint Committee on Cancer Tumor Stage Code,Brachytherapy first reference point administered total dose,...,Staging System,Staging System.1,Surgery for positive margins,Surgery for positive margins other,Surgical procedure first,Tissue Source Site,TMB (nonsynonymous),Person Neoplasm Status,Tumor Disease Anatomic Site,Tumor-Cancer
0,brca_tcga_pub2015,TCGA-A1-A0SB,TCGA-A1-A0SB-01,70.0,M0,N0,Stage I,6th,T1c,,...,Sentinel node biopsy alone,,,,Lumpectomy,A1,0.600000,TUMOR FREE,Breast,<UNK>
1,brca_tcga_pub2015,TCGA-A1-A0SD,TCGA-A1-A0SD-01,59.0,M0,N0,Stage IIA,6th,T2,,...,Sentinel lymph node biopsy plus axillary disse...,,Lumpectomy,,Lumpectomy,A1,1.000000,,Breast,<UNK>
2,brca_tcga_pub2015,TCGA-A1-A0SE,TCGA-A1-A0SE-01,56.0,M0,N0 (i-),Stage I,6th,T1c,,...,Sentinel lymph node biopsy plus axillary disse...,,,,Modified Radical Mastectomy,A1,0.700000,TUMOR FREE,Breast,Luminal A
3,brca_tcga_pub2015,TCGA-A1-A0SF,TCGA-A1-A0SF-01,54.0,M0,N0,Stage IIA,6th,T2,,...,Sentinel lymph node biopsy plus axillary disse...,,,,Modified Radical Mastectomy,A1,1.266667,TUMOR FREE,Breast,<UNK>
4,brca_tcga_pub2015,TCGA-A1-A0SH,TCGA-A1-A0SH-01,39.0,M0,N0 (i-),Stage IIA,6th,T2,,...,Sentinel node biopsy alone,,,,Lumpectomy,A1,2.600000,TUMOR FREE,Breast,<UNK>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,brca_tcga_pub2015,TCGA-MS-A51U,TCGA-MS-A51U-01,44.0,M0,N1,Stage IIB,7th,T2,,...,,,,,Modified Radical Mastectomy,MS,0.433333,WITH TUMOR,Breast,Luminal A
814,brca_tcga_pub2015,TCGA-OL-A66H,TCGA-OL-A66H-01,,MX,N1mi,Stage IB,7th,T1c,,...,,,,,Lumpectomy,OL,0.700000,TUMOR FREE,Breast,Luminal A
815,brca_tcga_pub2015,TCGA-OL-A66I,TCGA-OL-A66I-01,36.0,MX,N1mi,Stage IIA,6th,T1c,,...,Sentinel node biopsy alone,,,,Lumpectomy,OL,0.966667,TUMOR FREE,Breast,TNBC
816,brca_tcga_pub2015,TCGA-OL-A66J,TCGA-OL-A66J-01,80.0,MX,N0,Stage I,6th,T1c,,...,Sentinel node biopsy alone,,,,Lumpectomy,OL,1.733333,TUMOR FREE,Breast,Luminal A


mRNA seq

In [16]:
df_mRNA_transformed = df_mRNA_transformed.drop(columns=["Hugo_Symbol", "Entrez_Gene_Id"], axis=0)


In [17]:
df_mrna = df_mRNA_transformed.T.reset_index()

In [18]:
df_mRNA_final = df_mrna.rename(columns={"index":"Sample ID"})
df_mRNA_final

Unnamed: 0,Sample ID,0,1,2,3,4,5,6,7,8,...,20430,20431,20432,20433,20434,20435,20436,20437,20438,20439
0,TCGA-A1-A0SB-01,14.3935,116.3870,279.7612,0.4505,0.0,0.9010,0.9010,1.8020,0.0000,...,95.9568,519.4279,1415.9252,19.3716,1364.5681,6186.7327,1931.2986,1436.1978,552.3144,0.0000
1,TCGA-A1-A0SD-01,11.3241,60.2630,83.6986,0.3308,0.0,0.6616,0.3308,4.6315,0.3308,...,96.2700,578.2814,1225.7051,33.0825,868.0837,3559.6725,1278.9678,1195.6000,86.0144,0.0000
2,TCGA-A1-A0SE-01,4.4426,153.1452,74.7018,0.0000,0.0,0.0000,0.9872,5.5944,0.3291,...,95.4340,726.6146,1018.8400,57.5895,960.5923,3007.8157,926.3677,1075.4422,866.1456,0.0000
3,TCGA-A1-A0SF-01,10.7401,141.1933,314.4482,0.0000,0.0,0.0000,2.9988,9.4249,0.0000,...,74.1138,533.3625,1053.4444,94.6772,881.2262,5343.4779,934.3482,508.0867,52.2652,0.8568
4,TCGA-A1-A0SH-01,3.0048,79.8003,95.7054,0.0000,0.0,0.0000,0.3612,3.9727,0.0000,...,87.7601,581.0946,801.3977,19.8634,1353.2389,5464.9614,1312.7898,1001.1151,455.7746,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812,TCGA-MS-A51U-01,7.9343,150.6834,540.4278,1.5649,0.0,0.5216,0.0000,6.2598,0.0000,...,79.2906,443.9228,1083.4637,40.6886,858.1116,4930.6208,2212.8326,665.6234,249.8696,0.0000
813,TCGA-OL-A66H-01,2.2519,115.3378,338.1985,2.0851,0.0,0.0000,2.0851,4.1701,0.0000,...,45.4545,747.7064,903.6697,10.4254,634.2786,4163.4696,1251.8766,773.1443,5.8382,0.0000
814,TCGA-OL-A66I-01,1.2603,158.3599,210.7460,0.4173,0.0,2.0866,2.5039,2.5039,0.4173,...,61.3459,334.2723,933.1247,206.1555,479.0819,6621.1789,649.3479,780.3860,47.1570,0.0000
815,TCGA-OL-A66J-01,5.0428,124.6327,323.1185,0.4507,0.0,0.0000,8.1118,5.8585,3.1546,...,45.5160,797.6566,1038.3055,13.9703,683.1906,5216.7643,1408.7427,713.3844,567.3727,0.0000


In [19]:
df_merged = pd.merge(df_mRNA_final, df_clincal_data, right_on="Sample ID", left_on="Sample ID")


In [26]:
comparation_df = df_merged.loc[
    df_merged["Tumor-Cancer"].isin(["Luminal A", "Luminal B"]),
    ["Tumor-Cancer"] + list(df_merged.columns[1:20441].astype(int))
]
comparation_df

Unnamed: 0,Tumor-Cancer,0,1,2,3,4,5,6,7,8,...,20430,20431,20432,20433,20434,20435,20436,20437,20438,20439
2,Luminal A,4.4426,153.1452,74.7018,0.0000,0.0,0.0000,0.9872,5.5944,0.3291,...,95.4340,726.6146,1018.8400,57.5895,960.5923,3007.8157,926.3677,1075.4422,866.1456,0.0
6,Luminal A,4.9419,134.8733,63.6488,0.3658,0.0,0.0000,0.3658,4.7554,0.0000,...,83.7677,2731.4129,834.3850,51.2117,1092.6383,3032.0988,958.3905,718.7929,43.5299,0.0
8,Luminal B,7.6484,119.4760,80.8081,0.3157,0.0,0.0000,0.3157,5.6818,0.6313,...,60.6061,546.0859,1014.2045,39.4571,1410.0379,4570.0758,1182.4495,1454.8611,1169.5076,0.0
12,Luminal A,6.9810,108.0428,219.0967,1.5848,0.0,0.3962,0.3962,10.3011,1.1886,...,66.9572,368.8590,1082.4089,44.3740,786.4501,2560.6181,1076.0697,895.0079,93.1062,0.0
15,Luminal A,11.5874,119.8406,262.2476,1.8111,0.0,0.0000,0.7244,10.5044,0.7244,...,106.4928,3220.5017,1341.3022,80.0507,813.1848,2985.4206,652.3590,442.2711,228.1989,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,Luminal A,14.1288,117.0300,307.2124,0.9974,0.0,0.0000,0.4987,3.9898,0.9974,...,77.3018,540.1153,1191.9442,345.6139,838.3507,2721.5227,1247.8010,1093.1973,25.9335,0.0
812,Luminal A,7.9343,150.6834,540.4278,1.5649,0.0,0.5216,0.0000,6.2598,0.0000,...,79.2906,443.9228,1083.4637,40.6886,858.1116,4930.6208,2212.8326,665.6234,249.8696,0.0
813,Luminal A,2.2519,115.3378,338.1985,2.0851,0.0,0.0000,2.0851,4.1701,0.0000,...,45.4545,747.7064,903.6697,10.4254,634.2786,4163.4696,1251.8766,773.1443,5.8382,0.0
815,Luminal A,5.0428,124.6327,323.1185,0.4507,0.0,0.0000,8.1118,5.8585,3.1546,...,45.5160,797.6566,1038.3055,13.9703,683.1906,5216.7643,1408.7427,713.3844,567.3727,0.0


In [21]:

[df_merged["Cancer Type"] == "Luminal A"]

[0      False
 1      False
 2      False
 3      False
 4      False
        ...  
 812    False
 813    False
 814    False
 815    False
 816    False
 Name: Cancer Type, Length: 817, dtype: bool]

In [72]:
print(f"Genes: {df_merged.shape[0]}, Samples: {df_merged.shape[1]}")

Genes: 817, Samples: 20551


In [91]:
df_merged.iloc[:, 0:1000]

Unnamed: 0,Sample ID,0,1,2,3,4,5,6,7,8,...,989,990,991,992,993,994,995,996,997,998
0,TCGA-A1-A0SB-01,14.3935,116.3870,279.7612,0.4505,0.0,0.9010,0.9010,1.8020,0.0000,...,417.1641,503.2098,2766.9783,229.3051,865.8633,631.1522,0.9010,3448.1361,1569.9966,2816.5334
1,TCGA-A1-A0SD-01,11.3241,60.2630,83.6986,0.3308,0.0,0.6616,0.3308,4.6315,0.3308,...,818.4600,430.7336,891.2414,349.6816,1112.2322,325.2006,0.3308,7102.4729,1361.0123,4622.9427
2,TCGA-A1-A0SE-01,4.4426,153.1452,74.7018,0.0000,0.0,0.0000,0.9872,5.5944,0.3291,...,1243.9325,563.0605,873.7145,180.3373,954.9979,349.1567,0.0000,11790.3743,1303.4965,4170.4648
3,TCGA-A1-A0SF-01,10.7401,141.1933,314.4482,0.0000,0.0,0.0000,2.9988,9.4249,0.0000,...,1177.6815,504.2310,622.4704,279.3191,787.4058,146.9424,0.0000,8007.2914,1721.3256,3676.5596
4,TCGA-A1-A0SH-01,3.0048,79.8003,95.7054,0.0000,0.0,0.0000,0.3612,3.9727,0.0000,...,726.2779,434.4666,1771.0924,365.1253,1195.0540,304.8128,0.3612,7596.8453,1450.0278,3095.8002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812,TCGA-MS-A51U-01,7.9343,150.6834,540.4278,1.5649,0.0,0.5216,0.0000,6.2598,0.0000,...,926.9692,256.6510,1356.8075,143.9750,862.2848,169.0141,0.0000,5928.0125,1917.0579,3804.3818
813,TCGA-OL-A66H-01,2.2519,115.3378,338.1985,2.0851,0.0,0.0000,2.0851,4.1701,0.0000,...,778.5655,316.0967,650.9591,50.8757,972.4771,74.6455,0.0000,5029.1910,2597.9983,2889.0742
814,TCGA-OL-A66I-01,1.2603,158.3599,210.7460,0.4173,0.0,2.0866,2.5039,2.5039,0.4173,...,454.4601,341.7840,2139.1758,59.2593,1370.8920,280.8555,0.0000,6234.7418,3260.9285,4223.6828
815,TCGA-OL-A66J-01,5.0428,124.6327,323.1185,0.4507,0.0,0.0000,8.1118,5.8585,3.1546,...,1296.9806,360.5228,526.3632,253.2672,910.7706,140.6039,0.0000,5505.1825,2958.0892,4957.6386
