##### Principal component analysis
This notebook is a PCA reproduction of Figure 9 in the paper `Diagnostic potential for a serum miRNA neural network for detection of ovarian cancer` by Kevin M Elias et al. It is a PCA visualization of supplementary dataset 1 in elife-28932-supp6-v2.xlsx (Supplementary file 6 of the original paper, which contains the TPM data from miRNA sequencing). The data are standardized before the PCA is computed.

Source: https://elifesciences.org/articles/28932#fig9

In [1]:
import plotly.express as px
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Load the table and flip it so that each row is one sample. After the
# transpose the former 'miRNA' column is a row (presumably the miRNA
# identifiers -- confirm against the CSV); it is not a sample, so drop it.
df = pd.read_csv("../data/PCA_raw.csv")
df = df.transpose()
df = df.drop(['miRNA'])

### Standardization (zero mean, unit variance per column).
# Rebuilding the frame from the scaled array keeps the column labels but
# replaces the row labels with a default 0..n-1 index.
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
###

# Row layout assumed by the labels below: the first N_PMP_NECC rows are
# labelled 'PMP' or 'NECC', the following N_ERASMOS rows 'ERASMOS'.
N_PMP_NECC = 119
N_ERASMOS = 60

# Fail early with an explicit message if the table does not have the row
# count the hard-coded labelling relies on.
if len(df) != N_PMP_NECC + N_ERASMOS:
    raise ValueError(
        f"Expected {N_PMP_NECC + N_ERASMOS} samples "
        f"({N_PMP_NECC} PMP/NECC + {N_ERASMOS} ERASMOS), found {len(df)}"
    )

# 1-based positions, within the first N_PMP_NECC rows, of the 'NECC' samples.
NECC_list = [1, 28, 29, 34, 38, 46, 52, 76, 78, 83, 84, 91, 98, 103]
necc_positions = set(NECC_list)

PMP_NECC = [
    'NECC' if position in necc_positions else 'PMP'
    for position in range(1, N_PMP_NECC + 1)
]
ERASMOS = ['ERASMOS'] * N_ERASMOS

# One label per row, in row order.
feature = PMP_NECC + ERASMOS

df["class"] = feature
# Feature matrix for PCA: every column except the label.
X = df.loc[:, df.columns != 'class']

In [2]:
# Project the standardized features onto the first three principal components.
# random_state is fixed because scikit-learn's automatic solver choice may
# select a randomized SVD for larger inputs, which would otherwise make the
# projection vary between runs; the seed is ignored by the exact solvers.
pca = PCA(n_components=3, random_state=0)
components = pca.fit_transform(X)

# Share of the total variance captured by the three retained components, in percent.
total_var = pca.explained_variance_ratio_.sum() * 100

# 3-D scatter of the component scores, colored by the per-sample class label.
fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=df['class'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'Factor 1', '1': 'Factor 2', '2': 'Factor 3'}
)
fig.show()