In [None]:
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Relative locations of the two tab-separated input tables read by the loading
# cells: judging by the file names, filePath is the mRNA-seq (RSEM) export and
# filePath2 the RPPA export.
filePath, filePath2 = (
    '../data/TCGA2021/data_mrna_seq_v2_rsem.txt',
    '../data/TCGA2021/data_rppa.txt',
)

In [None]:
# Load the tab-separated table at filePath into df.
df = pd.read_csv(filePath, sep='\t')
# A bare `df.head()` that is not the last line of a cell is evaluated and then
# discarded, so nothing is shown; display() renders the preview explicitly.
display(df.head())
print(df.shape)

In [None]:
# Load the tab-separated table at filePath2 into df2 and report its shape.
# (A mid-cell `df2.head()` would be evaluated and silently discarded, so it is
# not called here; the preview is rendered by the following cell.)
df2 = pd.read_csv(filePath2, sep='\t')
print(df2.shape)

In [None]:
# Preview the first rows of the second table.
df2.head()

In [None]:
# Key the table by Entrez gene id, discard the Hugo symbol column and label the
# column axis as sample ids.
# Note: this consumes the 'Entrez_Gene_Id' and 'Hugo_Symbol' columns, so the
# cell can only be run once per load of df.
# NOTE(review): assumes Entrez_Gene_Id values are unique and non-null - confirm.
df = (
    df.set_index('Entrez_Gene_Id')
      .drop(columns='Hugo_Symbol')
      .rename_axis(columns='Sample ID')
)
df.head()

In [None]:
# Column labels present in both tables.
# NOTE(review): df2.shape[1] counts every column of df2, including any
# non-sample identifier column it may carry - confirm the reported RPPA count.
intersection = set(df.columns) & set(df2.columns)
print(intersection)
print(f'There are {df.shape[1]} samples in the RNA dataset and {df2.shape[1]} samples in the RPPA dataset. The overlap is size {len(intersection)}')

In [None]:
# Flip the table so that samples become rows and genes become columns.
# Re-running this cell flips it back, so run it exactly once.
df = df.T
nRows, nCols = df.shape
print('Rows:', nRows, 'Columns:', nCols)
# deep=True also accounts for the contents of object-dtype columns;
# dividing by 1024**2 converts bytes to MiB.
bytesUsed = df.memory_usage(deep=True).sum()
mem = bytesUsed / (1024**2)
print(f'Memory MB: {mem:.2f}')
display(df.head())

In [None]:
# Show the dtype pandas inferred for each column of df.
df.dtypes

In [None]:
# Summary statistics for a random subset of columns (the full table is too wide
# to describe at once).
sampleSize = 25
# Seeded generator so that Restart & Run All inspects the same columns each time.
columnRng = np.random.default_rng(0)
# Read the width from df itself rather than a global set in another cell, and
# never ask for more columns than exist (replace=False would raise otherwise).
randomColumnSample = columnRng.choice(
    df.shape[1], size=min(sampleSize, df.shape[1]), replace=False
)

subDf = df.iloc[:, randomColumnSample]

subDf.describe()

In [None]:
# Non-null counts and dtypes for the sampled columns only (not the whole table).
subDf.info()

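No values appear to be missing — at least in the 25 randomly sampled columns inspected above; the remaining columns have not been checked.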

In [None]:
# Flag columns whose variance down the rows is numerically zero.
# Columns whose variance is NaN compare False here and are therefore not flagged.
varianceFloor = 1e-10
isConstant = df.var(axis=0) < varianceFloor
constantCols = df.columns[isConstant]
numConstantCols = len(constantCols)
print(f'There are {numConstantCols} columns (out of {nCols} total, so {round(numConstantCols/nCols*100, 2)}%) that have constant value; they will be dropped.')

In [None]:
# Remove the constant columns identified in constantCols.
# A boolean mask is used instead of drop(columns=...) so the cell is idempotent:
# running it a second time, when the columns are already gone, is a no-op
# rather than a KeyError.
df = df.loc[:, ~df.columns.isin(constantCols)]
nCols = df.shape[1]
print(f'There are {nCols} columns left')

In [None]:
# Rank columns by their variance on the unscaled values and keep the labels of
# the nTopGenes largest.
nTopGenes = 100
variances = df.var(axis=0)
topVariances = variances.nlargest(n=nTopGenes)
mostVariantGenes = topVariances.index
print(f'The most "informative" genes were {mostVariantGenes[:5]}')

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column of df, then wrap the resulting array back into a
# DataFrame carrying the original row and column labels.
std_scaler = StandardScaler()
scaledValues = std_scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaledValues, index=df.index, columns=df.columns)

# Restrict to the previously selected high-variance columns and summarise them.
subDf_scaled = df_scaled.loc[:, mostVariantGenes]
subDf_scaled.describe()

In [None]:
# Heatmap of the scaled values for the selected high-variance columns.
cmap = 'vlag'
# Derive the canvas size from the data but keep it bounded: using nTopGenes
# directly as the width in inches (100 x 50 in when nTopGenes is 100) yields an
# image far too large to render and store comfortably in a notebook.
heatmapWidth = min(max(8, 0.2 * subDf_scaled.shape[1]), 30)
heatmapHeight = min(max(6, 0.03 * subDf_scaled.shape[0]), 30)
fig, ax = plt.subplots(figsize=(heatmapWidth, heatmapHeight))
sns.heatmap(subDf_scaled, cmap=cmap, ax=ax)

plt.show()

In [None]:

# Hierarchically clustered heatmap of the same scaled sub-table; seaborn
# reorders rows and columns by its own dendrograms (library defaults).
cmap = 'vlag'
clusterGrid = sns.clustermap(data=subDf_scaled, cmap=cmap)

plt.show()

In [None]:
from sklearn.decomposition import PCA

# Project the scaled table onto its first two principal components and plot
# one point per row.
componentNames = ['PCA1', 'PCA2']
PCA2D = PCA(n_components=2)
projected = PCA2D.fit_transform(df_scaled)
X = pd.DataFrame(projected, index=df.index, columns=componentNames)

fig, ax = plt.subplots()
ax.scatter(X['PCA1'], X['PCA2'])
ax.set_xlabel(X.columns[0])
ax.set_ylabel(X.columns[1])
plt.show()

In [None]:
# Share of the total variance captured by each of the two fitted components.
PCA2D.explained_variance_ratio_

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

In [None]:
# Keep as many principal components as are needed to explain 95% of the
# variance of the scaled table.
# The fitted estimator must not be bound to the name `PCA`: doing so shadows
# the imported class with an instance, after which any further PCA(...) call
# (including simply re-running this cell) fails because the instance is not
# callable.
PCA95 = PCA(n_components=0.95)
# X is rebound here to the higher-dimensional projection, indexed like df_scaled.
X = pd.DataFrame(PCA95.fit_transform(df_scaled), index=df_scaled.index)

# Number of components actually retained.
PCA95.n_components_

In [None]:
# Agglomerative (Ward) clustering of the rows of X, drawn as a dendrogram.
sampleHierarchy = linkage(X, method='ward')

fig, ax = plt.subplots(figsize=(15, 4))
dendrogram(sampleHierarchy, ax=ax)
plt.show()

In [None]:
from scipy.cluster.hierarchy import leaves_list
from scipy.spatial.distance import pdist, squareform

# Square matrix of pairwise distances between the rows of X.
condensedDistances = pdist(X, metric='euclidean')  # or another metric
D = squareform(condensedDistances)

# Permute rows and columns into dendrogram leaf order so that clustered rows
# sit next to each other in the image.
sampleOrder = leaves_list(sampleHierarchy)
D_reordered = D[sampleOrder][:, sampleOrder]

fig, ax = plt.subplots(figsize=(15, 15))
image = ax.imshow(D_reordered, cmap='viridis')
ax.set_title('Distance Heatmap (Clustered Order)')
fig.colorbar(image, ax=ax)

plt.show()

In [None]:
# Transposed view of the scaled table: the columns of df_scaled become rows.
geneDF = df_scaled.T
print(geneDF.shape)
geneDF.head()

In [None]:
# Keep only the rows for the previously selected high-variance genes.
topGeneDF = geneDF.loc[mostVariantGenes]
topGeneDF.shape

In [None]:
# Agglomerative (Ward) clustering of the rows of topGeneDF, drawn as a dendrogram.
geneHierarchy = linkage(topGeneDF, method='ward')

fig, ax = plt.subplots(figsize=(15, 4))
dendrogram(geneHierarchy, ax=ax)
plt.show()

In [None]:
# Back to the original orientation (rows of topGeneDF become columns), plus the
# dendrogram leaf order of the gene clustering for later column reordering.
smallDF = topGeneDF.T
geneOrder = leaves_list(geneHierarchy)
print(smallDF.shape)


In [None]:
# Peek at the first five entries of the gene leaf ordering.
geneOrder[:5]

In [None]:
# Largest position in geneOrder; sanity check - presumably this should be the
# number of clustered genes minus one (TODO confirm).
max(geneOrder)

In [None]:
# Reorder rows by the sample dendrogram and columns by the gene dendrogram so
# that clustered samples and genes are adjacent in the image.
smallDF_reordered = smallDF.iloc[sampleOrder, geneOrder]

# Explicit figure/axes instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(20, 10))
image = ax.imshow(smallDF_reordered, cmap='viridis')
ax.set_xlabel('Genes')
ax.set_ylabel('Samples')
fig.colorbar(image, ax=ax)

# A single show() is enough; a second call has nothing left to draw.
plt.show()

In [None]:
# Clustered heatmap of smallDF; seaborn builds its own row and column
# dendrograms using Ward linkage on Euclidean distances.
clustermapOptions = dict(
    metric='euclidean',
    method='ward',
    cmap='bwr',
    figsize=(8, 8),
)
sns.clustermap(smallDF, **clustermapOptions)