In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from utils import *

In [2]:
# Load the preprocessed gene expression table and its label table.
gene_exp_df = read_dataframe_from_pickle("data/processed_data/gene_exp_data.pkl")
label_df = read_dataframe_from_pickle("data/processed_data/label_data.pkl")

# Separator rule followed by the row count of each frame, one item per line.
print(
    "--" * 80,
    f"Entries in Gene Expression Dataframe : {len(gene_exp_df)}",
    f"Entries in Label Dataframe : {len(label_df)}",
    sep="\n",
)

DataFrame successfully loaded from data/processed_data/gene_exp_data.pkl
DataFrame successfully loaded from data/processed_data/label_data.pkl
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Entries in Gene Expression Dataframe : 5268
Entries in Label Dataframe : 5268


In [3]:
# Dimensions of the expression frame as a (rows, columns) tuple.
gene_exp_df.shape

(5268, 20531)

## Feature Analysis

In [22]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Work on a copy so the loaded expression frame itself is never touched.
X = gene_exp_df.copy()

# Standardise every column before decomposing.
X_std = StandardScaler().fit_transform(X)

# Fit a PCA with no cap on the number of components.
pca = PCA()
pca.fit(X_std)

In [7]:

# Interactive scree plot of the cumulative explained variance.
# Only the cumulative curve is drawn; per-component bars are not plotted.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=[f"PC{rank}" for rank in range(1, len(pca.explained_variance_ratio_) + 1)],
        y=np.cumsum(pca.explained_variance_ratio_),
        name='Cumulative',
        mode='lines',
        line={'color': 'royalblue', 'width': 2},
        marker={'size': 8},
    )
)

# Dotted guide lines at the 95 %, 90 % and 80 % levels, added in that order.
for _variance_level, _variance_caption in (
    (0.95, "95% variance"),
    (0.90, "90% variance"),
    (0.80, "80% variance"),
):
    fig.add_hline(
        y=_variance_level,
        line_dash="dot",
        annotation_text=_variance_caption,
        annotation_position="bottom right",
    )

# Titles, hover behaviour and overall look of the figure.
fig.update_layout(
    title='PCA Explained Variance Ratio',
    xaxis_title='Principal Components',
    yaxis_title='Explained Variance Ratio',
    hovermode="x unified",
    template='plotly_white',
    height=600,
)

fig.show()

In [8]:
# Cumulative explained variance in percent, one entry per component.
cumulative_exp = np.cumsum(pca.explained_variance_ratio_) * 100

# For each target from 80 % to 95 %: np.argmax on the boolean mask yields the
# position of the first component at which the running total reaches the
# target; adding 1 turns that 0-based position into a component count.
expvar_to_pc_map = {
    target: np.argmax(cumulative_exp >= target) + 1 for target in range(80, 96)
}

for e, pc_count in expvar_to_pc_map.items():
    print(f"To explain {e}% of variance, you need {pc_count} principal components")

To explain 80% of variance, you need 734 principal components
To explain 81% of variance, you need 791 principal components
To explain 82% of variance, you need 851 principal components
To explain 83% of variance, you need 916 principal components
To explain 84% of variance, you need 986 principal components
To explain 85% of variance, you need 1061 principal components
To explain 86% of variance, you need 1142 principal components
To explain 87% of variance, you need 1230 principal components
To explain 88% of variance, you need 1326 principal components
To explain 89% of variance, you need 1430 principal components
To explain 90% of variance, you need 1545 principal components
To explain 91% of variance, you need 1671 principal components
To explain 92% of variance, you need 1812 principal components
To explain 93% of variance, you need 1970 principal components
To explain 94% of variance, you need 2150 principal components
To explain 95% of variance, you need 2358 principal componen

## Variance Captured by Top 50 PCs

In [9]:
(pca.explained_variance_ratio_[:50]) * 100

array([6.65349001, 5.50403137, 4.04139775, 3.18562812, 2.61580839,
       2.05604336, 1.94300532, 1.75712432, 1.59406238, 1.38367242,
       1.14863487, 0.98503309, 0.93884767, 0.87451376, 0.83760321,
       0.80330434, 0.75712757, 0.72021914, 0.66241382, 0.60456553,
       0.5443689 , 0.51036982, 0.49236273, 0.47378658, 0.43881903,
       0.42278562, 0.41153948, 0.39332629, 0.37703444, 0.37476062,
       0.36418417, 0.35828266, 0.34628837, 0.33735326, 0.31667147,
       0.30448083, 0.3021877 , 0.29668273, 0.27322987, 0.25953146,
       0.25844408, 0.25038285, 0.24727699, 0.23699087, 0.23412336,
       0.22922272, 0.22651818, 0.22170806, 0.21613766, 0.21204371])

### The amount of variance captured by each of these PCs is very low, so directly applying PCA to this dataset is not recommended.

### From the scree plot we can see that retaining at least 85% of the information requires shrinking the dimensionality from ~20k to ~1k components. But a dimensionality of ~1k is in itself still not desirable.

In [None]:
import plotly.express as px
import numpy as np

# Per-component explained variance ratios of the fitted `pca` object,
# together with their mean (used for the reference line below).
explained_variance_ratio = pca.explained_variance_ratio_
mean_val = np.mean(explained_variance_ratio)

# Histogram of the ratios.
fig = px.histogram(
    x=explained_variance_ratio,
    nbins=20,
    title='Distribution of PCA Explained Variance Ratios',
    labels={'x': 'Explained Variance Ratio', 'y': 'Count'},
    opacity=0.7,
    color_discrete_sequence=['#636EFA'],
)

# Dashed red vertical line marking the mean ratio.
fig.add_vline(
    x=mean_val,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Mean: {mean_val:.3f}",
    annotation_position="top right",
)

# White background, percent-formatted x ticks, no legend.
fig.update_layout(
    plot_bgcolor='white',
    xaxis={'tickformat': ".0%"},
    yaxis_title="Number of Components",
    showlegend=False,
)

# Horizontal grid lines only.
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey')

fig.show()

In [12]:
import plotly.figure_factory as ff

# Distribution plot (histogram, KDE curve and rug) of the explained variance
# ratios; relies on `explained_variance_ratio` from the histogram cell.
distplot_options = {
    'colors': ['#636EFA'],
    'bin_size': 0.01,
    'show_rug': True,
    'curve_type': 'kde',
}
fig = ff.create_distplot(
    [explained_variance_ratio],
    ['Explained Variance'],
    **distplot_options,
)

# Titles, percent-formatted x ticks and a white plot background.
fig.update_layout(
    title='Distribution of PCA Explained Variance Ratios',
    xaxis_title='Explained Variance Ratio',
    yaxis_title='Density',
    xaxis={'tickformat': ".0%"},
    plot_bgcolor='white',
)

fig.show()

### From the distribution of explained variance across all principal components we can observe that only 10-15 components each explain more than 1% of the variance.

In [5]:
# Summarise the explained variance of the leading principal components for
# component counts 10, 20, ..., 500 and collect one record per count.
result_001_list = []
X = gene_exp_df.copy()
# Standardize data first
X_std = StandardScaler().fit_transform(X)

# Fit PCA
pca = PCA().fit(X_std)
# Per-component explained variance expressed in percent.
explained_variance_list = pca.explained_variance_ratio_ * 100

for i in range(10, 501, 10):
    print("==" * 50)
    print(f"Starting for pc count : {i}")
    # Slice once; every statistic below describes the first `i` components.
    leading_variances = explained_variance_list[:i]
    result_001 = {
        "no_of_pc": i,
        "max_explained_var": np.max(leading_variances),
        "min_explained_var": np.min(leading_variances),
        "avg_explained_var": np.mean(leading_variances),
        "median_explained_var": np.median(leading_variances),
        "std_explained_var": np.std(leading_variances),
        "total_explained_variance_captured": np.sum(leading_variances),
    }
    result_001_list.append(result_001)
    # Log the count that was just processed. The earlier version advanced the
    # counter before this message, so it named the *next* count instead.
    print(f"Successfully Compiled the data for pc count : {i}")

# Drop the large intermediates from the kernel namespace; the cells visible
# below this one only read the summary frame.
del X, X_std, pca
result_001_df = pd.DataFrame(result_001_list)

Starting for pc count : 10
Successfully Compiled the data for pc count : 20
Starting for pc count : 20
Successfully Compiled the data for pc count : 30
Starting for pc count : 30
Successfully Compiled the data for pc count : 40
Starting for pc count : 40
Successfully Compiled the data for pc count : 50
Starting for pc count : 50
Successfully Compiled the data for pc count : 60
Starting for pc count : 60
Successfully Compiled the data for pc count : 70
Starting for pc count : 70
Successfully Compiled the data for pc count : 80
Starting for pc count : 80
Successfully Compiled the data for pc count : 90
Starting for pc count : 90
Successfully Compiled the data for pc count : 100
Starting for pc count : 100
Successfully Compiled the data for pc count : 110
Starting for pc count : 110
Successfully Compiled the data for pc count : 120
Starting for pc count : 120
Successfully Compiled the data for pc count : 130
Starting for pc count : 130
Successfully Compiled the data for pc count : 140
Sta

In [8]:
# Line chart of the total explained variance against the number of leading
# principal components, read from the summary frame `result_001_df`.
# (A stray column-selection expression whose result was discarded used to
# open this cell; it had no effect and has been removed.)
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=result_001_df["no_of_pc"],
    y=result_001_df["total_explained_variance_captured"],
    name='Total Explained Variance Vs #PCs',
    mode='lines',
    line=dict(color='royalblue', width=2),
))

# Give the figure a title and axis labels so it can be read on its own.
fig.update_layout(
    title='Total Explained Variance vs Number of PCs',
    xaxis_title='Number of Principal Components',
    yaxis_title='Total Explained Variance',
    template='plotly_white',
)

# Render explicitly instead of relying on the return value of add_trace
# being the last expression of the cell.
fig.show()

## Label Analysis

In [None]:
import plotly.express as px

# Number of samples per tumour site.
tumour_site_counts = label_df["tumour_site"].value_counts()

# Bar chart: one bar per tumour site, bar height = sample count.
fig = px.bar(
    x=tumour_site_counts.index,
    y=tumour_site_counts.values,
    title='Distribution of Tumour Sites',
    labels={'x': 'Tumour Site', 'y': 'Count'},
)

# Long site names: rotate the tick labels, shrink the font and reserve
# extra bottom margin so the labels are not cut off.
fig.update_layout(
    width=1000,
    height=600,
    margin={'b': 150},
    xaxis_tickangle=-90,
    xaxis_tickfont={'size': 10},
)

fig.show()

In [22]:

# Align the expression rows and the labels on case_id.

# Keep only the expression rows whose index appears in label_df.case_id.
relevant_df = gene_exp_df.loc[label_df.case_id.tolist()]

# Index the labels by case_id so both tables share the same key.
# NOTE(review): this mutates label_df in place; the case_id column itself is
# kept because the key is passed as a Series rather than a column name.
label_df.set_index(label_df["case_id"], inplace=True, drop=True)
sorted_labels = label_df["tumour_site"].sort_index()

# Sort the label-matched subset. The earlier version sorted the unfiltered
# gene_exp_df here, which left `relevant_df` unused and would silently
# misalign features and labels whenever gene_exp_df holds rows without a label.
sorted_gene_exp_df = relevant_df.sort_index()

# Features and labels must line up row for row before any modelling.
assert sorted_gene_exp_df.index.equals(sorted_labels.index), (
    "gene expression rows and tumour_site labels are not aligned on case_id"
)

In [44]:
# Sample count for each tumour site (see the table rendered below).
label_df["tumour_site"].value_counts()

tumour_site
Kidney                                                                    739
Prostate_gland                                                            658
Bronchus_and_lung                                                         566
Bladder                                                                   427
Colon                                                                     331
Cervix_uteri                                                              307
Lymph_nodes                                                               283
Connective,_subcutaneous_and_other_soft_tissues                           203
Corpus_uteri                                                              196
Pancreas                                                                  182
Skin                                                                      175
Hematopoietic_and_reticuloendothelial_systems                             173
Other_and_unspecified_parts_of_tongue               

In [45]:
# Tumour sites that occur more than 150 times in the label series.
high_count_labels = (
    sorted_labels.value_counts()
    .loc[lambda site_counts: site_counts > 150]
    .index
)

# Restrict the labels to samples belonging to one of those sites.
filtered_labels = sorted_labels[sorted_labels.isin(high_count_labels)]

In [47]:
# Number of distinct tumour sites left after filtering.
print(len(filtered_labels.value_counts()))

13


In [37]:
from sklearn.preprocessing import LabelEncoder
def encode_labels(label_series):
    """Integer-encode a collection of class labels.

    Args:
        label_series: Labels to encode; handed unchanged to
            ``LabelEncoder.fit_transform``.

    Returns:
        A ``(encoded_labels, label_encoder)`` tuple: the encoded values and
        the fitted ``LabelEncoder`` instance that produced them.
    """
    encoder = LabelEncoder()
    return encoder.fit_transform(label_series), encoder

In [39]:
# Encode the filtered tumour-site labels; the fitted encoder is kept alongside
# the encoded values.
encoded_label, label_encoder = encode_labels(filtered_labels)