<a href="https://colab.research.google.com/github/Dowell-Lab/psea/blob/main/notebook_examples/PSEA_metrics_on_simulated_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import plotly.express as px
import numpy as np

# Pull in the raw data

In [2]:
# Raw CSV of simulated PSEA scores hosted in the Dowell-Lab/psea GitHub repository (testdata folder).
url="https://raw.githubusercontent.com/Dowell-Lab/psea/refs/heads/main/testdata/sim_psea_scores_20241015-122448.adjpval.csv"
# Download and parse the table; the first CSV column becomes the row index.
# Needs network access every time this cell is run.
resultsdf = pd.read_csv(url, index_col=0)


In [5]:
# Name of the p-value column that the rest of the notebook thresholds on.
# Keep exactly one assignment active; the commented-out lines are the other
# columns available in the result table. This global is read by
# confusion_matrix() and by the log-transform cell at the end of the notebook.
#which_pvalue_column = "p_value_BenjaminiHochberg"
#which_pvalue_column = "p_value_bonf"
#which_pvalue_column = "p_value_BenjaminiYekutieli"
which_pvalue_column = "p_value_holm"


# Label the simulated comorbidities as TRUE or FALSE

## functions

In [80]:
def parse_simulated_binary_att(row):
  """Decode the name of a simulated binary attribute.

  The name stored in ``row["binary_attribute"]`` is split on underscores.
  The first four pieces, re-joined with underscores, identify the gene the
  attribute was simulated from. Every piece after the first is then checked
  against a list of known markers; the first marker found inside a piece
  decides which simulation parameter that piece carries.

  Parameters
  ----------
  row : mapping
      Anything indexable with ``"binary_attribute"`` (for example a
      DataFrame row) whose value is the attribute name.

  Returns
  -------
  tuple
      ``(genename, extracted_values)`` where ``extracted_values`` is a dict
      holding only the parameters that were present in the name.

  Raises
  ------
  ValueError
      If a numeric marker is found but the remainder of the piece is not a
      valid number.
  """
  # (marker, output key, converter) -- order matters: the first marker that
  # occurs in a piece wins, exactly like an if/elif ladder.
  rules = (
    ('Truesamplesize', 'samples_true', int),
    ('biassamplesize', 'samples_true_bias', int),
    ('Zscorevaluebais', 'Zscore_valuebais', float),
    ('sigma', 'Zscore_valuebais_sigma', float),
    ('top', 'top_or_bottom', lambda _unused: "top"),
    ('bottom', 'top_or_bottom', lambda _unused: "bottom"),
    ('pba', 'percent_binary_attributes_thatarevaluebias', float),
  )

  tokens = row["binary_attribute"].split('_')
  gene_label = "_".join(tokens[:4])

  settings = {}
  for token in tokens[1:]:
    for marker, key, convert in rules:
      if marker in token:
        # Strip the marker text; what is left is the parameter value.
        settings[key] = convert(token.replace(marker, ''))
        break

  return gene_label, settings


def mark_actul_TRUE_FALSE_links(df, min_people_with_bias_transcription=0):
  """Label each row of a result table as a real ('TRUE') or spurious ('FALSE') link.

  The name in ``binary_attribute`` is decoded with
  ``parse_simulated_binary_att``: the gene it was simulated from becomes the
  ``genename`` column and every decoded simulation parameter becomes a column
  of its own. A row is labelled 'TRUE' when its ``value`` equals that gene
  name AND more than ``min_people_with_bias_transcription`` samples carried
  the bias (``samples_true_bias``). Every other row is labelled 'FALSE'.

  Parameters
  ----------
  df : pandas.DataFrame
      Result table containing at least the columns ``binary_attribute`` and
      ``value``. The frame is not modified.
  min_people_with_bias_transcription : number, default 0
      Strict lower bound on ``samples_true_bias`` for a link to count as TRUE.

  Returns
  -------
  pandas.DataFrame
      A new frame with the columns of ``df`` plus ``genename``, one column
      per decoded parameter, and ``Actual_Label`` ('TRUE' / 'FALSE').

  Raises
  ------
  KeyError
      If ``df`` lacks ``binary_attribute`` / ``value``, or if no attribute
      name yields a ``samples_true_bias`` value.
  """
  # The parser only reads row["binary_attribute"], so hand it light one-key
  # mappings instead of building a full Series per row.
  rows = df[["binary_attribute"]].to_dict("records")
  parsed = [parse_simulated_binary_att(row) for row in rows]
  genenames = [genename for genename, _ in parsed]
  # One column per decoded parameter, aligned on the caller's index.
  params = pd.DataFrame([values for _, values in parsed], index=df.index)

  # Columns left behind by an earlier labelling pass would otherwise show up
  # twice after the concat below.
  stale = [col for col in ["other_dict", *params.columns] if col in df.columns]
  # drop() and assign() both return new frames, so the caller's df stays untouched.
  labelled_input = df.drop(columns=stale).assign(genename=genenames)
  final_df = pd.concat([labelled_input, params], axis=1)

  final_df['Actual_Label'] = 'FALSE'  # default; only matching rows are promoted below
  # TRUE only when the tested value is the gene the attribute was simulated
  # from and enough samples actually carried the bias. Rows without a
  # samples_true_bias value compare as False and stay 'FALSE'.
  mask = (final_df['value'] == final_df['genename']) & (final_df['samples_true_bias'] > min_people_with_bias_transcription)
  final_df.loc[mask, 'Actual_Label'] = 'TRUE'
  return final_df

# Functions for creating the ROC curve and confusion matrix

In [89]:
 def confusion_matrix(df, cutoff=0.05, pvalue_column=None):
   """Tabulate actual vs. predicted labels at a single p-value cutoff.

   A row is predicted 'TRUE' when its p-value is less than or equal to
   ``cutoff`` and 'FALSE' otherwise (missing p-values therefore count as
   'FALSE').

   Parameters
   ----------
   df : pandas.DataFrame
       Must contain the p-value column and an ``Actual_Label`` column holding
       the strings 'TRUE' / 'FALSE'.
   cutoff : float, default 0.05
       Significance threshold applied to the p-value column.
   pvalue_column : str, optional
       Column holding the p-values. When omitted, the notebook-level
       ``which_pvalue_column`` setting is used.

   Returns
   -------
   tuple
       ``(cm, TPrate, FPrate)``. ``cm`` has one row per observed
       (Actual_Label, Predicted_Label) pair with the columns ``count`` and
       ``Confusion_Category``. ``TPrate`` is TP / (TP + FN) and ``FPrate`` is
       FP / (FP + TN); each is 0 when its denominator is 0.

   Side effects
   ------------
   When ``cutoff`` is 0.05 the 2x2 summary table is printed.
   """
   if pvalue_column is None:
     # Fall back to the notebook-wide selection so existing calls keep working.
     pvalue_column = which_pvalue_column

   confusiondf = df[[pvalue_column, "Actual_Label"]].copy()
   confusiondf["Predicted_Label"] = np.where(confusiondf[pvalue_column] <= cutoff, "TRUE", "FALSE")
   cm = confusiondf.groupby(["Actual_Label", "Predicted_Label"]).size().reset_index(name="count")

   # (actual, predicted) -> category; any other label combination is 'Unknown'.
   category_by_labels = {
     ('FALSE', 'TRUE'): 'False Positive',
     ('TRUE', 'TRUE'): 'True Positive',
     ('TRUE', 'FALSE'): 'False Negative',
     ('FALSE', 'FALSE'): 'True Negative',
   }
   cm['Confusion_Category'] = [
     category_by_labels.get(pair, 'Unknown')
     for pair in zip(cm["Actual_Label"], cm["Predicted_Label"])
   ]

   # A category that never occurred has no row in cm; count it as 0 instead of
   # hiding every possible error behind a bare except.
   counts = dict(zip(cm['Confusion_Category'], cm['count']))
   TP = counts.get('True Positive', 0)
   FN = counts.get('False Negative', 0)
   FP = counts.get('False Positive', 0)
   TN = counts.get('True Negative', 0)

   TPrate = TP/(TP+FN) if TP + FN != 0 else 0
   FPrate = FP/(FP+TN) if FP + TN != 0 else 0

   # Print the summary only at the conventional 0.05 threshold. isclose keeps
   # this working when the cutoff comes from float arithmetic in a sweep.
   if np.isclose(cutoff, 0.05):
     cmsummary = pd.DataFrame([[TP,FN],[FP,TN]])
     cmsummary.columns = ["Predicted_TRUE", "Predicted_FALSE"]
     cmsummary.index = ["Actual_TRUE", "Actual_FALSE"]
     print("cutoff", cutoff)
     print(cmsummary)
   return cm, TPrate, FPrate

def create_ROC_curve(df):
  """Sweep the p-value cutoff and collect the ROC coordinates.

  ``confusion_matrix`` is evaluated once for every cutoff produced by
  ``np.arange(0, 1.01, 0.01)``, and the true/false positive rates it returns
  are gathered into one table.

  Parameters
  ----------
  df : pandas.DataFrame
      Labelled result table, passed unchanged to ``confusion_matrix``.

  Returns
  -------
  pandas.DataFrame
      One row per cutoff with the columns ``TPrate``, ``FPrate`` and ``cutoff``.
  """
  thresholds = list(np.arange(0, 1.01, 0.01))
  # confusion_matrix returns (cm, TPrate, FPrate); only the two rates are kept.
  rates = [confusion_matrix(df, cutoff=threshold)[1:] for threshold in thresholds]
  return pd.DataFrame({
    "TPrate": [tp_rate for tp_rate, _ in rates],
    "FPrate": [fp_rate for _, fp_rate in rates],
    "cutoff": thresholds,
  })

def plot_ROC_curve(ROCdf, graphtitle="ROCcurve"):
  """Render a ROC table as an interactive plotly line chart.

  Parameters
  ----------
  ROCdf : pandas.DataFrame
      Table with the columns ``FPrate`` (x axis) and ``TPrate`` (y axis).
  graphtitle : str, default "ROCcurve"
      Title shown above the figure.

  Returns
  -------
  None
      The figure is displayed, not returned.
  """
  # Each update_* call hands back the figure, so the steps can be chained.
  (
    px.line(ROCdf, x="FPrate", y="TPrate", title=graphtitle)
    .update_layout(xaxis_range=[0, 1], yaxis_range=[0, 1])  # both axes fixed to [0, 1]
    .update_xaxes(title_text="False Positive Rate")
    .update_yaxes(title_text="True Positive Rate")
    .show()
  )


def plot_ROCdf_single_set_parmaters(df, min_people_with_bias_transcription=0):
  """Label a result table and show a single ROC curve computed over all of its rows.

  Parameters
  ----------
  df : pandas.DataFrame
      Result table accepted by ``mark_actul_TRUE_FALSE_links``.
  min_people_with_bias_transcription : number, default 0
      Forwarded to ``mark_actul_TRUE_FALSE_links``.

  Returns
  -------
  None
  """
  labelled = mark_actul_TRUE_FALSE_links(
    df,
    min_people_with_bias_transcription=min_people_with_bias_transcription,
  )
  plot_ROC_curve(create_ROC_curve(labelled))

In [90]:
# Label every row of the result table using the default threshold
# (min_people_with_bias_transcription=0).
final_df = mark_actul_TRUE_FALSE_links(resultsdf)

In [91]:
# Display the labelled table: original scores plus genename, the decoded
# simulation parameters and Actual_Label.
final_df

Unnamed: 0,binary_attribute,value,runpsea,NES,pval,p_value_bonf,p_value_holm,p_value_BenjaminiHochberg,p_value_BenjaminiYekutieli,genename,samples_true,samples_true_bias,Zscore_valuebais,Zscore_valuebais_sigma,top_or_bottom,percent_binary_attributes_thatarevaluebias,Actual_Label
49999,simulated_based_on_ENSG00000279648_Truesamples...,simulated_based_on_ENSG00000279648,included,16.701117,0.00000,0.0,0.0,0.00000,0.0,simulated_based_on_ENSG00000279648,141,113,3.0,0.5,bottom,0.8,TRUE
39711,simulated_based_on_ENSG00000156273_Truesamples...,simulated_based_on_ENSG00000156273,included,11.884497,0.00000,0.0,0.0,0.00000,0.0,simulated_based_on_ENSG00000156273,57,34,3.0,0.5,bottom,0.6,TRUE
39700,simulated_based_on_ENSG00000156265_Truesamples...,simulated_based_on_ENSG00000156265,included,9.513312,0.00000,0.0,0.0,0.00000,0.0,simulated_based_on_ENSG00000156265,57,34,3.0,0.5,bottom,0.6,TRUE
39599,simulated_based_on_ENSG00000279648_Truesamples...,simulated_based_on_ENSG00000279648,included,8.635665,0.00000,0.0,0.0,0.00000,0.0,simulated_based_on_ENSG00000279648,43,26,3.0,0.5,bottom,0.6,TRUE
39533,simulated_based_on_ENSG00000223692_Truesamples...,simulated_based_on_ENSG00000223692,included,7.599806,0.00000,0.0,0.0,0.00000,0.0,simulated_based_on_ENSG00000223692,43,26,3.0,0.5,bottom,0.6,TRUE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30102,simulated_based_on_ENSG00000156265_Truesamples...,simulated_based_on_ENSG00000170262,included,0.000000,0.96328,1.0,1.0,0.96328,1.0,simulated_based_on_ENSG00000156265,1,1,1.0,0.5,bottom,0.6,FALSE
4175,simulated_based_on_ENSG00000269950_Truesamples...,simulated_based_on_ENSG00000240755,included,0.000000,0.96328,1.0,1.0,0.96328,1.0,simulated_based_on_ENSG00000269950,1,0,2.0,0.5,bottom,0.0,FALSE
20197,simulated_based_on_ENSG00000279648_Truesamples...,simulated_based_on_ENSG00000269950,included,0.000000,0.96328,1.0,1.0,0.96328,1.0,simulated_based_on_ENSG00000279648,1,0,1.0,0.5,bottom,0.4,FALSE
35,simulated_based_on_ENSG00000223692_Truesamples...,simulated_based_on_ENSG00000240755,included,0.000000,0.96328,1.0,1.0,0.96328,1.0,simulated_based_on_ENSG00000223692,1,0,1.0,0.5,top,0.0,FALSE


In [92]:
# One ROC curve over the whole result table. The confusion matrix at
# cutoff 0.05 is printed as a side effect of the cutoff sweep.
plot_ROCdf_single_set_parmaters(resultsdf, min_people_with_bias_transcription=0)

cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE             1413             2387
Actual_FALSE               4            46196


In [102]:
def evaluate_simulation_parameters_neccesary(df, min_people_with_bias_transcription=0):
  """Plot one ROC curve for every simulated parameter combination present in the data.

  The table is labelled with ``mark_actul_TRUE_FALSE_links`` and then split by
  ``samples_true``, ``samples_true_bias``, ``Zscore_valuebais`` and
  ``top_or_bottom``. Rows whose ``samples_true_bias`` is 0 are left out. For
  each remaining combination, visited in ascending order of those four
  values, the title is printed and the ROC curve of that subset is shown.

  Parameters
  ----------
  df : pandas.DataFrame
      Result table accepted by ``mark_actul_TRUE_FALSE_links``.
  min_people_with_bias_transcription : number, default 0
      Forwarded to ``mark_actul_TRUE_FALSE_links``.

  Returns
  -------
  None
  """
  final_df = mark_actul_TRUE_FALSE_links(df, min_people_with_bias_transcription=min_people_with_bias_transcription)

  # Combinations without any biased sample are not evaluated.
  biased_rows = final_df[final_df["samples_true_bias"] != 0]

  # A single sorted groupby visits exactly the combinations that occur, in the
  # same lexicographic order as nested loops over the sorted unique values,
  # without rescanning the whole frame for every (mostly empty) combination.
  group_columns = ["samples_true", "samples_true_bias", "Zscore_valuebais", "top_or_bottom"]
  for (n_comorbid, n_comorbid_bias, Zscore_valuebais, direction), final_df_subset in biased_rows.groupby(group_columns, sort=True):
    # Translate the simulation keyword into the wording used in the plot title.
    direction_label = "low expression" if direction == "top" else "high expression"
    title = f"n_comorbid: {n_comorbid}, n_comorbid_bias: {n_comorbid_bias}, Zscore_valuebais: {Zscore_valuebais}, bais_direction: {direction_label}"
    print(title)
    ROCdf = create_ROC_curve(final_df_subset)
    plot_ROC_curve(ROCdf, graphtitle=title)

In [None]:
# One ROC curve per simulated parameter combination. Each curve is preceded
# by its printed title and the confusion matrix at cutoff 0.05.
evaluate_simulation_parameters_neccesary(resultsdf)

n_comorbid: 1, n_comorbid_bias: 1, Zscore_valuebais: 1.0, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 1, n_comorbid_bias: 1, Zscore_valuebais: 1.0, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 1, n_comorbid_bias: 1, Zscore_valuebais: 1.5, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 1, n_comorbid_bias: 1, Zscore_valuebais: 1.5, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 1, n_comorbid_bias: 1, Zscore_valuebais: 2.0, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 1, n_comorbid_bias: 1, Zscore_valuebais: 2.0, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 1, n_comorbid_bias: 1, Zscore_valuebais: 2.5, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 1, n_comorbid_bias: 1, Zscore_valuebais: 2.5, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 1, n_comorbid_bias: 1, Zscore_valuebais: 3.0, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 1, n_comorbid_bias: 1, Zscore_valuebais: 3.0, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 4, n_comorbid_bias: 1, Zscore_valuebais: 1.0, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


n_comorbid: 4, n_comorbid_bias: 1, Zscore_valuebais: 1.0, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


n_comorbid: 4, n_comorbid_bias: 1, Zscore_valuebais: 1.5, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


n_comorbid: 4, n_comorbid_bias: 1, Zscore_valuebais: 1.5, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


n_comorbid: 4, n_comorbid_bias: 1, Zscore_valuebais: 2.0, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


n_comorbid: 4, n_comorbid_bias: 1, Zscore_valuebais: 2.0, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


n_comorbid: 4, n_comorbid_bias: 1, Zscore_valuebais: 2.5, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


n_comorbid: 4, n_comorbid_bias: 1, Zscore_valuebais: 2.5, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


n_comorbid: 4, n_comorbid_bias: 1, Zscore_valuebais: 3.0, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


n_comorbid: 4, n_comorbid_bias: 1, Zscore_valuebais: 3.0, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


n_comorbid: 4, n_comorbid_bias: 2, Zscore_valuebais: 1.0, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 4, n_comorbid_bias: 2, Zscore_valuebais: 1.0, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 4, n_comorbid_bias: 2, Zscore_valuebais: 1.5, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 4, n_comorbid_bias: 2, Zscore_valuebais: 1.5, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 4, n_comorbid_bias: 2, Zscore_valuebais: 2.0, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 4, n_comorbid_bias: 2, Zscore_valuebais: 2.0, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 4, n_comorbid_bias: 2, Zscore_valuebais: 2.5, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 4, n_comorbid_bias: 2, Zscore_valuebais: 2.5, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 4, n_comorbid_bias: 2, Zscore_valuebais: 3.0, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 4, n_comorbid_bias: 2, Zscore_valuebais: 3.0, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               20
Actual_FALSE               0              180


n_comorbid: 4, n_comorbid_bias: 3, Zscore_valuebais: 1.0, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


n_comorbid: 4, n_comorbid_bias: 3, Zscore_valuebais: 1.0, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


n_comorbid: 4, n_comorbid_bias: 3, Zscore_valuebais: 1.5, bais_direction: high expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


n_comorbid: 4, n_comorbid_bias: 3, Zscore_valuebais: 1.5, bais_direction: low expression
cutoff 0.05
              Predicted_TRUE  Predicted_FALSE
Actual_TRUE                0               10
Actual_FALSE               0               90


In [None]:
# Add the natural log of the selected p-value column to final_df as a new
# column named "log_<column name>" (final_df is modified in place).
# NOTE(review): the table displayed earlier contains p-values of exactly 0.0;
# np.log maps those to -inf and emits a divide-by-zero RuntimeWarning --
# confirm that is acceptable for whatever consumes this column.
final_df["log_"+which_pvalue_column]=np.log(final_df[which_pvalue_column])

