# End-to-end prediction of (synthetic) transcriptome from sparse genes

## Imports

In [None]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.preprocessing import scale

from scipy.stats import pearsonr, spearmanr, kendalltau

import geneselection.solvers.elasticnet.pca as epca
import geneselection.solvers.elasticnet.utils as eutils
from geneselection.datasets.correlated_random_variables import hub_spoke_data
from geneselection.utils.data import tidy

import altair as alt
# Select Altair's "default" data transformer with max_rows=None so that the
# large data frames built below can be handed to alt.Chart without a row limit.
alt.data_transformers.enable("default", max_rows=None)

## Generate synthetic data

In [None]:
# Build the synthetic dataset with hub_spoke_data: 20000 samples,
# 100 groups of size 50, and 1500 singletons.
# NOTE(review): the keyword is spelled `n_singeltons` here, presumably to match
# hub_spoke_data's signature -- confirm before "correcting" the spelling.
adata_all = hub_spoke_data(
    n_samples=20000,
    n_groups=100,
    group_size=50,
    n_singeltons=1500,
    diagonal_weight=1 / np.e,
    off_diagonal_weight=1,
)

# Store the data matrix as float64 and give the variables string labels.
adata_all.X = adata_all.X.astype(np.float64)
adata_all.var.index = adata_all.var.index.astype(str)

### Split off a train and test set

In [None]:
# The first `n_train` samples form the training set; the rest are held out.
n_train = 15000
adata = adata_all[:n_train, :].copy()
adata_test = adata_all[n_train:, :].copy()

## Select predictive genes with elastic net PCA

In [None]:
# Fit a 250-component PCA (randomized SVD solver) on the training matrix.
# This fit feeds the explained-variance plot in the next cell; `pca` is
# re-created later with params["n_pcs"] components for the regression targets.
pca = PCA(n_components=250, svd_solver="randomized")
pca.fit(adata.X)

In [None]:
# One row per fitted component: its 0-based index and explained-variance ratio.
variance_ratios = pca.explained_variance_ratio_
df = pd.DataFrame(
    {
        "PC": list(range(len(variance_ratios))),
        "Explained Variance Ratio": variance_ratios,
    }
)

# Kept as the final expression so the notebook renders the chart.
alt.Chart(df).mark_point().encode(x="PC", y="Explained Variance Ratio")

### Parameters

In [None]:
# Settings shared by the elastic net PCA run and the result-inspection cells.
params = {
    # lambda path: 100 geometrically spaced values from 1 down to 0.01
    "lambda_path": np.geomspace(1, 0.01, num=100),
    # fraction of regularization devoted to the L1 penalty
    "alpha": 0.9,
    # number of PCs to predict with the multitask elastic net
    "n_pcs": 2,
    # relative importance in predicting PCs
    "pc_weights": "scaled",
    # number of bootstrap replicates
    "n_bootstraps": 100,
    # number of parallel processes to use
    "n_processes": 25,
    # selection thresholds for including genes: 100 values from 0.01 to 1
    "thresholds": np.linspace(0.01, 1, num=100),
}

### Run bootstrap replicates

In [None]:
# Forward the run settings from `params` to epca.parallel_runs as keyword
# arguments. params["thresholds"] is deliberately not forwarded; it is only
# used by the inspection/selection cells further down.
run_setting_names = (
    "n_processes",
    "n_bootstraps",
    "n_pcs",
    "alpha",
    "lambda_path",
    "pc_weights",
)
run_settings = {name: params[name] for name in run_setting_names}
results = epca.parallel_runs(adata, **run_settings)

### Inspect results

In [None]:
# Plot the bootstrap results over the same threshold grid and lambda path that
# are stored in `params`. Left as the final expression so the notebook shows
# whatever the helper returns.
# NOTE(review): what exactly is drawn is defined by eutils.thresh_lambda_plot.
eutils.thresh_lambda_plot(results,
                          adata,
                          thresholds=params["thresholds"],
                          lambdas=params["lambda_path"])

In [None]:
# Second diagnostic on the bootstrap results.
# NOTE(review): presumably shows how consistently hub genes are retained --
# confirm against eutils.hub_persistence_plot.
eutils.hub_persistence_plot(adata, results)

### Pick maximally informative sparse genes

In [None]:
# Extract the selected gene set at a single (lambda, threshold) grid point.
# Both indices are hard-coded to 70. They presumably index into
# params["lambda_path"] and params["thresholds"] (each has 100 entries), so
# they need revisiting whenever either grid changes.
# TODO confirm the indexing convention against eutils.get_selected_genes.
predictive_genes = eutils.get_selected_genes(results,
                                             adata,
                                             lambda_index=70,
                                             selection_threshold_index=70,
                                             thresholds=params["thresholds"])

### see how well we selected genes

In [None]:
# Ground truth: the gene is annotated as a hub. Prediction: the gene was selected.
is_hub_gene = np.array(adata.var["Type"] == "hub")
is_selected_gene = np.isin(adata.var.index, predictive_genes)
print(classification_report(is_hub_gene, is_selected_gene))

### See how well we predict PCs using selected genes

#### Fit pca to training data

In [None]:
# Refit PCA on the training matrix, this time keeping only params["n_pcs"]
# components; these component scores are the regression targets below.
# This replaces the 250-component `pca` fitted earlier.
pca = PCA(n_components=params["n_pcs"], svd_solver="randomized")
pca.fit(adata.X)

#### pcs are our targets to regress

In [None]:
# Training targets: PC scores of the training samples, standardized per
# component with sklearn's `scale`.
y_train = scale(pca.transform(adata.X))

#### regressors are the sparse genes

In [None]:
# The selected gene identifiers are cast to integers and used as column
# positions into the training matrix.
# NOTE(review): assumes gene labels are their own column positions -- confirm.
selected_columns = predictive_genes.astype(int)
X_train = adata.X[:, selected_columns]

#### fit the regression

In [None]:
# Ordinary least-squares regression from the selected genes (X_train) to the
# standardized PC scores (y_train). The fit call is the cell's final
# expression, so the notebook displays the fitted estimator.
reg = LinearRegression()
reg.fit(X_train, y_train)

#### predict on the training data

In [None]:
# In-sample predictions of the PC scores, used as the "Train" panel in the
# real-vs-predicted comparison further down.
y_pred_train = reg.predict(X_train)

#### predict on the test data

In [None]:
# Project the held-out samples onto the PCs learned from the training data.
y_test = pca.transform(adata_test.X)

# Standardize the test targets with the *training* mean and standard deviation.
# `reg` was fit against scale(pca.transform(adata.X)), so its predictions are in
# training-standardized units. Standardizing the test scores with their own
# statistics would put real and predicted values on different scales and would
# leak test-set statistics into the evaluation.
y_train_scores = pca.transform(adata.X)
y_train_mean = y_train_scores.mean(axis=0)
y_train_std = y_train_scores.std(axis=0)  # population std (ddof=0), as sklearn's scale uses
y_train_std[y_train_std == 0] = 1.0  # leave constant components unscaled
y_test = (y_test - y_train_mean) / y_train_std

# Same selected-gene columns as the training regressors.
X_test = adata_test.X[:, predictive_genes.astype(int)]
y_pred_test = reg.predict(X_test)

#### organize data

In [None]:
def _tidy_pc_values(values, value_column):
    """Tidy a score matrix, keep columns from label 1 onward, and name them.

    The kept columns are renamed to ["PC", value_column] and the PC index is
    shifted to start at 1.
    """
    frame = tidy(values).loc[:, 1:]
    frame.columns = ["PC", value_column]
    frame["PC"] += 1
    return frame


def _pair_real_and_predicted(real_frame, predicted_frame, split_label):
    """Join real and predicted frames side by side and tag them with a split.

    Column positions 0, 1 and 3 are kept, which drops the duplicate "PC"
    column contributed by the predicted frame.
    """
    paired = pd.concat([real_frame, predicted_frame], axis=1)
    paired = paired.iloc[:, [0, 1, 3]]
    paired["Split"] = split_label
    return paired


# Held-out samples
df_test_real = _tidy_pc_values(y_test, "Real Value")
df_test_pred = _tidy_pc_values(y_pred_test, "Predicted Value")
df_test = _pair_real_and_predicted(df_test_real, df_test_pred, "Test")

# Training samples
df_train_real = _tidy_pc_values(y_train, "Real Value")
df_train_pred = _tidy_pc_values(y_pred_train, "Predicted Value")
df_train = _pair_real_and_predicted(df_train_real, df_train_pred, "Train")

# Single long table for plotting, test rows first.
df_pc = pd.concat([df_test, df_train], ignore_index=True)

In [None]:
# Real vs. predicted PC scores; one panel per (PC, split) combination.
pc_scatter = (
    alt.Chart(df_pc, width=400, height=400)
    .mark_circle(size=10)
    .encode(x="Real Value", y="Predicted Value")
)

# Final expression so the notebook renders the faceted chart.
pc_scatter.facet(column="Split:N", row="PC:N")

In [None]:
# Test-set Pearson correlation between real and predicted scores, one line per PC.
# Looping over the columns (instead of hard-coding PCs 0 and 1) keeps this cell
# correct when params["n_pcs"] is changed; for two PCs the output is unchanged.
for pc_index in range(y_test.shape[1]):
    print(pearsonr(y_test[:, pc_index], y_pred_test[:, pc_index])[0])

## Model rest of genes using sparse gene set

### Subsets of genes based on type

In [None]:
# Boolean masks over the genes, one per annotated gene type.
gene_types = adata.var["Type"]
hub_genes = np.array(gene_types == "hub")
spoke_genes = np.array(gene_types == "spoke")
singleton_genes = np.array(gene_types == "singleton")

### Fit regression

In [None]:
# Ridge regression from the selected genes to every gene in the training matrix
# (multi-output: one target column per gene).
ridge = Ridge(alpha=1.0, tol=1e-5).fit(X_train, adata.X)

# Reconstruct all genes for both splits from the selected genes only.
y_pred_all_train = ridge.predict(X_train)
y_pred_all_test = ridge.predict(X_test)

### Organize results

In [None]:
def _per_gene_pearson(predicted, observed):
    """Return the Pearson r between matching columns of two 2-D arrays."""
    n_genes = observed.shape[1]
    return np.array(
        [pearsonr(predicted[:, col], observed[:, col])[0] for col in range(n_genes)]
    )


def _performance_frame(correlations, split_label):
    """Copy the gene annotations and attach per-gene scores plus a split label."""
    frame = adata.var.copy()
    frame["Pearson Correlation"] = correlations
    frame["Split"] = split_label
    return frame


# Per-gene agreement between predicted and actual values.
perf_train = _per_gene_pearson(y_pred_all_train, adata.X)
perf_test = _per_gene_pearson(y_pred_all_test, adata_test.X)

df_perf_train = _performance_frame(perf_train, "Train")
df_perf_test = _performance_frame(perf_test, "Test")

# Train rows first, then test rows; the gene index is kept (and so repeats).
df_perf = pd.concat([df_perf_train, df_perf_test])

# Plotting copy with a small Gaussian jitter (sd 0.001) added to the scores.
# NOTE(review): presumably there to separate identical values in the plots.
df_perf_plot = df_perf.copy()
jitter = 0.001 * np.random.randn(len(df_perf_plot["Pearson Correlation"]))
df_perf_plot["Pearson Correlation"] = df_perf_plot["Pearson Correlation"] + jitter

### Correlation histogram

In [None]:
# Overlaid (unstacked) step histograms of per-gene correlation, colored by gene type.
correlation_histogram = (
    alt.Chart(df_perf_plot)
    .mark_area(opacity=0.5, interpolate="step")
    .encode(
        alt.X(
            "Pearson Correlation",
            bin=alt.Bin(maxbins=100),
            scale=alt.Scale(domain=[-0.1, 1.1]),
        ),
        alt.Y("count()", stack=None),
        alt.Color("Type"),
    )
)

# Final expression so the notebook renders one panel per split.
correlation_histogram.facet(column="Split:N")

### Correlation by gene index / type

In [None]:
# Expose the frame's index as a regular column so it can be used as the x axis.
df_perf_plot["Gene Index"] = df_perf_plot.index.values

# Per-gene correlation against gene index, colored by gene type.
gene_index_scatter = (
    alt.Chart(df_perf_plot)
    .mark_point()
    .encode(x="Gene Index:Q", y="Pearson Correlation", color="Type")
)

# Final expression so the notebook renders one panel per split.
gene_index_scatter.facet(column="Split:N")