In [2]:
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import sys
import os

In [3]:

sys.path.append(os.path.abspath("../../"))
from src.utils.ConvertTextToCsv import TextToCsv

df_txt = pd.read_csv("../../data/raw/data_mrna_seq_v2_rsem.txt", header=None)
print(df_txt)
df_csv = TextToCsv("../../data/raw/data_mrna_seq_v2_rsem.txt")
print(df_csv.columns)


df_clean = df_csv.drop_duplicates(subset=['Hugo_Symbol'])
df_clean = df_clean.set_index('Hugo_Symbol')
df_clean = df_clean.drop(columns=['Entrez_Gene_Id'], errors='ignore')
df_counts = df_clean.T
df_counts.columns = df_counts.columns.astype(str).str.strip().str.upper()

genes_clave = ['ERBB2', 'ESR1', 'PGR', 'MKI67']
encontrados = [g for g in genes_clave if g in df_counts.columns]
print(f"Genes that are found: {encontrados}") 

# PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_counts)
    
pca = PCA(n_components=4)
components = pca.fit_transform(X_scaled)
    

def classification_per_row(row):
    her2 = row.get('ERBB2', 0)
    esr1 = row.get('ESR1', 0)
    ki67 = row.get('MKI67', 0)
        
    th_her2 = df_counts['ERBB2'].median() if 'ERBB2' in df_counts else 0
    th_esr1 = df_counts['ESR1'].median() if 'ESR1' in df_counts else 0
    th_ki67 = df_counts['MKI67'].median() if 'MKI67' in df_counts else 0

    if her2 > th_her2:
        return 'HER2-Enriched'
    elif esr1 > th_esr1:
        if ki67 > th_ki67:
            return 'Luminal B'
        else:
            return 'Luminal A'
    else:
        return 'Triple Negativo'


df_plot = pd.DataFrame(components, columns=['0', '1', '2', '3'])
df_plot['Subtype'] = df_counts.apply(classification_per_row, axis=1).values
df_plot['Patient_ID'] = df_counts.index


labels = {str(i): f"PC {i+1} ({var:.1f}%)" for i, var in enumerate(pca.explained_variance_ratio_ * 100)}


fig = px.scatter_matrix(
    df_plot,
    dimensions=['0', '1', '2'],
    color='Subtype',
    labels=labels,
    hover_name='Patient_ID',
    title="PCA de Subtipos de Cáncer de Mama",
    opacity=0.7,
    color_discrete_map={
        'Luminal A': '#1f77b4', 'Luminal B': '#aec7e8',
        'HER2-Enriched': '#ff7f0e', 'Triple Negativo': '#d62728'
    }
)
fig.update_traces(diagonal_visible=False, marker=dict(size=5))
fig.show()


                                                       0
0      Hugo_Symbol\tEntrez_Gene_Id\tTCGA-A1-A0SB-01\t...
1      UBE2Q2P2\t100134869\t14.3935\t11.3241\t4.4426\...
2      HMGB1P1\t10357\t116.3870\t60.2630\t153.1452\t1...
3      LOC155060\t155060\t279.7612\t83.6986\t74.7018\...
4      RNU12-2P\t26823\t0.4505\t0.3308\t0.0000\t0.000...
...                                                  ...
20436  ZYX\t7791\t6186.7327\t3559.6725\t3007.8157\t53...
20437  ZZEF1\t23140\t1931.2986\t1278.9678\t926.3677\t...
20438  ZZZ3\t26009\t1436.1978\t1195.6000\t1075.4422\t...
20439  TPTEP1\t387590\t552.3144\t86.0144\t866.1456\t5...
20440  AKR1C6P\t389932\t0.0000\t0.0000\t0.0000\t0.856...

[20441 rows x 1 columns]
Shape of the CSV: (20440, 819)
Index(['Hugo_Symbol', 'Entrez_Gene_Id', 'TCGA-A1-A0SB-01', 'TCGA-A1-A0SD-01',
       'TCGA-A1-A0SE-01', 'TCGA-A1-A0SF-01', 'TCGA-A1-A0SH-01',
       'TCGA-A1-A0SI-01', 'TCGA-A1-A0SJ-01', 'TCGA-A1-A0SK-01',
       ...
       'TCGA-LL-A5YM-01', 'TCGA-LL-A5YN-01