<a href="https://colab.research.google.com/github/Dowell-Lab/psea/blob/main/notebook_examples/PSEA_metrics_on_simulated_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import plotly.express as px
import numpy as np

# Pull in the raw data

In [2]:
# Location of the simulated PSEA score table in the Dowell-Lab/psea repository
# (the file name suggests it already carries adjusted p-values -- confirm against the CSV header).
url = "https://raw.githubusercontent.com/Dowell-Lab/psea/refs/heads/main/testdata/sim_psea_scores_20241015-122448.adjpval.csv"
# The first CSV column becomes the DataFrame index.
resultsdf = pd.read_csv(filepath_or_buffer=url, index_col=0)


In [30]:
# Name of the p-value column used by the confusion-matrix / ROC code and the log
# transform further down. Uncomment exactly one line to switch columns
# (presumably each name is a column of resultsdf -- verify against the CSV header).
#which_pvalue_column = "p_value_BenjaminiHochberg"
#which_pvalue_column = "p_value_bonf"
#which_pvalue_column = "p_value_BenjaminiYekutieli"
which_pvalue_column = "p_value_holm"


# Label the simulated comorbidities as TRUE or FALSE

## functions

In [31]:
#this code splits the column name to tell us which gene it was simulated from and what parameters were used
def parse_simulated_binary_att(row):
  """Decode a simulated binary-attribute name into a gene name and its parameters.

  The string stored under ``row["binary_attribute"]`` is split on underscores.
  The first four pieces, re-joined with underscores, form the gene name. Every
  piece after the first is then checked against an ordered list of marker
  substrings; the first marker contained in a piece decides which parameter
  that piece sets.

  Args:
      row: any object that can be indexed with ``"binary_attribute"``.

  Returns:
      A ``(genename, extracted_values)`` tuple. ``extracted_values`` is a dict
      holding only the parameters whose markers were found.
  """
  pieces = row["binary_attribute"].split('_')
  genename = "_".join(pieces[:4])

  def strip_and_cast(marker, cast):
      # Build a converter that removes the marker text and casts the remainder
      return lambda piece: cast(piece.replace(marker, ''))

  # (marker, output key, converter) -- order matters: the first marker found in a piece wins
  rules = [
      ('Truesamplesize', 'samples_true', strip_and_cast('Truesamplesize', int)),
      ('biassamplesize', 'samples_true_bias', strip_and_cast('biassamplesize', int)),
      ('Zscorevaluebais', 'Zscore_valuebais', strip_and_cast('Zscorevaluebais', float)),
      ('sigma', 'Zscore_valuebais_sigma', strip_and_cast('sigma', float)),
      ('top', 'top_or_bottom', lambda piece: "top"),
      ('bottom', 'top_or_bottom', lambda piece: "bottom"),
      ('pba', 'percent_binary_attributes_thatarevaluebias', strip_and_cast('pba', float)),
  ]

  extracted_values = {}
  for piece in pieces[1:]:
      for marker, key, convert in rules:
          if marker in piece:
              extracted_values[key] = convert(piece)
              break

  return genename, extracted_values


def label_True_False(df, min_people_with_bias_transcription=0):
  """Add an 'Actual_Label' column marking which rows are true simulated links.

  A row is labelled 'TRUE' when its 'value' equals its 'genename' and its
  'samples_true_bias' is strictly greater than
  ``min_people_with_bias_transcription``; every other row is labelled 'FALSE'.

  Args:
      df: DataFrame with 'value', 'genename' and 'samples_true_bias' columns.
          The label column is written onto this frame in place.
      min_people_with_bias_transcription: exclusive lower bound on
          'samples_true_bias' for a row to count as 'TRUE'. Defaults to 0.

  Returns:
      The same DataFrame object, now carrying 'Actual_Label'.
  """
  df['Actual_Label'] = 'FALSE'  # default label; matching rows are flipped to 'TRUE' below
  # The threshold is an explicit argument: it used to be read from an undefined global.
  mask = (df['value'] == df['genename']) & (df['samples_true_bias'] > min_people_with_bias_transcription)
  df.loc[mask, 'Actual_Label'] = 'TRUE'
  return df

def mark_actul_TRUE_FALSE_links(df, min_people_with_bias_transcription=0):
  """Parse the simulated attribute names in ``df`` and label each row TRUE or FALSE.

  Each row is passed to ``parse_simulated_binary_att``; the returned gene name
  goes into a 'genename' column and every key of the returned dict becomes its
  own column. The rows are then labelled by ``label_True_False``.

  Args:
      df: DataFrame with the columns ``parse_simulated_binary_att`` reads, plus
          the 'value' column that ``label_True_False`` compares to 'genename'.
          The caller's frame is not modified; the work is done on a copy.
      min_people_with_bias_transcription: more than this many people must have
          the bias for a row to count as a TRUE gene-comorbid linkage.

  Returns:
      A new DataFrame: the input columns, 'genename', one column per parsed
      parameter, and 'Actual_Label'.
  """
  # Copy first so re-running the cell never changes the raw input frame.
  parsed = df.copy()
  parsed[["genename", "other_dict"]] = parsed.apply(parse_simulated_binary_att, axis=1, result_type="expand")
  # Use the frame that was passed in (not the notebook-level resultsdf) and
  # spread each parameter dict across its own columns.
  final_df = pd.concat([parsed.drop(columns=['other_dict']), parsed['other_dict'].apply(pd.Series)], axis=1)
  # Forward the threshold; it was previously accepted but never used.
  return label_True_False(final_df, min_people_with_bias_transcription=min_people_with_bias_transcription)

# functions for creating ROC curve and confusion matrix

In [18]:
 def confusion_matrix(df, cutoff=0.05, pvalue_column=None):
  """Build a confusion table and the true/false positive rates for one cutoff.

  A row is predicted "TRUE" when its p-value is <= ``cutoff`` and "FALSE"
  otherwise (a missing p-value compares as False, so it is predicted "FALSE").
  Predictions are compared with the 'Actual_Label' column.

  Args:
      df: DataFrame holding the p-value column and an 'Actual_Label' column
          whose values are the strings 'TRUE' / 'FALSE'.
      cutoff: significance threshold applied to the p-value column.
      pvalue_column: name of the p-value column. When None, the notebook-level
          ``which_pvalue_column`` setting is used.

  Returns:
      (cm, TPrate, FPrate). ``cm`` has one row per observed
      (Actual_Label, Predicted_Label) pair with its 'count' and
      'Confusion_Category'. A rate is NaN when its denominator is zero
      (no actual positives for TPrate, no actual negatives for FPrate).
  """
  if pvalue_column is None:
    pvalue_column = which_pvalue_column  # fall back to the notebook-wide choice

  confusiondf = df[[pvalue_column, "Actual_Label"]].copy()
  confusiondf["Predicted_Label"] = np.where(confusiondf[pvalue_column] <= cutoff, "TRUE", "FALSE")
  cm = confusiondf.groupby(["Actual_Label", "Predicted_Label"]).size().reset_index(name="count")

  # (actual, predicted) -> category; any other label combination stays 'Unknown'
  categories = {
    ('FALSE', 'TRUE'): 'False Positive',
    ('TRUE', 'TRUE'): 'True Positive',
    ('TRUE', 'FALSE'): 'False Negative',
    ('FALSE', 'FALSE'): 'True Negative',
  }
  cm['Confusion_Category'] = [
    categories.get(pair, 'Unknown')
    for pair in zip(cm['Actual_Label'], cm['Predicted_Label'])
  ]

  def count_of(category):
    # A category absent from cm simply sums to 0, so no exception handling is needed
    return int(cm.loc[cm['Confusion_Category'] == category, 'count'].sum())

  TP = count_of('True Positive')
  FN = count_of('False Negative')
  FP = count_of('False Positive')
  TN = count_of('True Negative')

  # A rate is undefined when its class is absent; report NaN instead of raising
  actual_negatives = FP + TN
  actual_positives = TP + FN
  FPrate = FP / actual_negatives if actual_negatives else float("nan")
  TPrate = TP / actual_positives if actual_positives else float("nan")
  return cm, TPrate, FPrate

def create_ROC_curve(df):
  """Sweep the p-value cutoff from 0 to 1 and collect the ROC points.

  ``confusion_matrix`` is evaluated on ``df`` at 101 evenly spaced cutoffs
  (0.00, 0.01, ..., 1.00).

  Args:
      df: labelled DataFrame accepted by ``confusion_matrix``.

  Returns:
      DataFrame with one row per cutoff and the columns 'TPrate', 'FPrate'
      and 'cutoff'.
  """
  # linspace fixes the number of points and includes the 1.0 endpoint exactly,
  # which a float-step arange does not guarantee.
  cutoffs = np.linspace(0.0, 1.0, 101)
  TPrates = []
  FPrates = []
  for cutoff in cutoffs:
    # Evaluate the frame passed in, not a notebook-level final_df
    _, TPrate, FPrate = confusion_matrix(df, cutoff=cutoff)
    TPrates.append(TPrate)
    FPrates.append(FPrate)
  return pd.DataFrame({"TPrate": TPrates, "FPrate": FPrates, "cutoff": cutoffs})

def plot_ROC_curve(ROCdf):
  """Show the ROC curve stored in ROCdf as a Plotly line chart.

  'FPrate' is drawn on the x axis and 'TPrate' on the y axis; both axes are
  fixed to the range [0, 1].
  """
  roc_figure = px.line(ROCdf, title='ROC Curve', y="TPrate", x="FPrate")
  # Axis titles first, then the fixed unit-square ranges
  roc_figure.update_xaxes(title_text="False Positive Rate")
  roc_figure.update_yaxes(title_text="True Positive Rate")
  roc_figure.update_layout(yaxis_range=[0, 1], xaxis_range=[0, 1])
  roc_figure.show()

def plot_ROCdf_single_min_people_with_bias_transcription(df, min_people_with_bias_transcription):
  """Label ``df`` with one bias threshold, then compute and plot its ROC curve.

  This is the single definition of the function. An earlier duplicate that
  ignored both arguments (and was shadowed by this one) has been removed.

  Args:
      df: raw results DataFrame to label via ``mark_actul_TRUE_FALSE_links``.
      min_people_with_bias_transcription: threshold forwarded to
          ``mark_actul_TRUE_FALSE_links``.
  """
  final_df = mark_actul_TRUE_FALSE_links(df, min_people_with_bias_transcription=min_people_with_bias_transcription)
  ROCdf = create_ROC_curve(final_df)
  plot_ROC_curve(ROCdf)

Unnamed: 0,TPrate,FPrate,cutoff
0,0.075263,0.000000,0.00
1,0.467895,0.003095,0.01
2,0.487368,0.005065,0.02
3,0.501579,0.006472,0.03
4,0.508947,0.007922,0.04
...,...,...,...
96,0.999737,0.999177,0.96
97,1.000000,1.000000,0.97
98,1.000000,1.000000,0.98
99,1.000000,1.000000,0.99


In [None]:
# Store the natural log of the selected p-value column in a new "log_<column>" column.
# NOTE(review): a p-value of exactly 0 yields -inf here -- confirm that is acceptable downstream.
final_df[f"log_{which_pvalue_column}"] = np.log(final_df[which_pvalue_column])

