## Load data

In [63]:
import numpy as np

from src.random_forest.train import load_data, _get_metadata_columns
from pathlib import Path

# Folder holding the radiomics feature exports, relative to this notebook.
base_dir = Path("..") / ".." / ".." / "data"
radiomics_file_name = "radiomics_features_2022_12_10_11_20_08.csv"
radiomics_train_file_name = "train_" + radiomics_file_name
radiomics_test_file_name = "test_" + radiomics_file_name

# Read the export and strip the metadata columns in a single expression so
# that `train_dataframe` is only ever bound to the cleaned frame.
# NOTE(review): this reads the un-prefixed export, not
# `radiomics_train_file_name`, even though the result is named
# `train_dataframe` -- confirm which file is intended.
train_dataframe = load_data(base_dir / radiomics_file_name).pipe(
    lambda frame: frame.drop(columns=_get_metadata_columns(frame))
)


## PCA

In [42]:
import plotly.graph_objs as go
import plotly.offline as pyo

from sklearn.decomposition import PCA

# Project every sample onto the first two principal components and draw one
# scatter trace per label group to see whether the groups separate.
data = []

# The target must not take part in the decomposition; it is only used to
# split the projected points into traces below.
pca_features = train_dataframe.drop(columns="label")

# NOTE(review): features are fed to PCA unscaled; if the radiomics features
# live on very different scales, standardising first may be appropriate --
# confirm.
pca = PCA(n_components=2)
# fit_transform returns the per-sample scores, shape (n_samples, 2).
# `pca.components_.T` is the per-feature loading matrix, shape
# (n_features, 2), and cannot be indexed by sample.
transformed_values = pca.fit_transform(pca_features)

print(f"Explained variance: {pca.explained_variance_}")

for group in [1, 2, 3]:
    # A boolean mask is positional, so it stays aligned with the rows of
    # `transformed_values` whatever index the dataframe carries.
    group_mask = (train_dataframe['label'] == group).to_numpy()

    trace = go.Scatter(x=transformed_values[group_mask, 0],
                       y=transformed_values[group_mask, 1],
                       mode='markers',
                       name=str(group))
    data.append(trace)

# Layout of the plot
layout = go.Layout(title='Grouping',
                   xaxis=dict(title='PC 1'),
                   yaxis=dict(title='PC 2'))
fig = go.Figure(data=data, layout=layout)
fig.show()


'temp-plot.html'

## Correlation of features with label

In [54]:
import pandas as pd
import plotly.express as px

# Correlation of every column with the target, kept as a one-column frame so
# the bar chart picks up the column name as the series name.
df = train_dataframe.corr()[["label"]]

# Remove the target's own row (self-correlation) by name; slicing off the
# first row would only be right if "label" happened to be the first column.
fig = px.bar(df.drop(index="label"), title="Correlation of each feature with label")
pyo.plot(fig)

'temp-plot.html'

## Distribution of correlations between variables

In [67]:
# Histogram of the pairwise correlations between columns.
df = train_dataframe.corr()
correlation_values = df.to_numpy()

# The matrix is symmetric with each column's self-correlation on the
# diagonal. Taking only the strictly upper triangle counts every pair once
# and keeps the diagonal from piling up in the top bin.
pair_rows, pair_cols = np.triu_indices_from(correlation_values, k=1)
fig = px.histogram(correlation_values[pair_rows, pair_cols])
pyo.plot(fig)

'temp-plot.html'