In [None]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import seaborn as sns
from statsmodels.stats.multitest import multipletests
from scipy.stats import ranksums

In [None]:
# Load the CRISPR gene-effect table and index it by the values of its first column.
crispr_raw = pd.read_csv("/Volumes/SSD_Yue/DepMap_Downloads/CRISPR_gene_effect_22Q2.csv")
df_crispr = crispr_raw.drop(columns='DepMap_ID')
df_crispr.index = crispr_raw.iloc[:, 0]
# Keep only the text before the first " (" in each column header.
df_crispr.columns = [header.partition(' (')[0] for header in df_crispr.columns]

In [None]:
# Read the two tab-separated NMF result tables (h.tsv and w.tsv); the first column of each is the index.
h_path = "/Volumes/SSD_Yue/NMF_Onco_gps_results/NMF_using_524_genes_selected_using_pan_cancer_cell_lines_including_MYC_on_30_updated_TNBC_cell_lines_normalized_for_samples_12222022/7/h.tsv"
w_path = "/Volumes/SSD_Yue/NMF_Onco_gps_results/NMF_using_524_genes_selected_using_pan_cancer_cell_lines_including_MYC_on_30_updated_TNBC_cell_lines_normalized_for_samples_12222022/7/w.tsv"
h = pd.read_csv(h_path, index_col=0, sep='\t')
w = pd.read_csv(w_path, index_col=0, sep='\t')

In [None]:
# Cell-line IDs used to subset the CRISPR table to the TNBC panel.
# One group per line; the state labels are a reviewer's reading of the ID lists
# used by the box-plot cells further down -- TODO confirm.
cell_lines_order = [
    # presumably F0
    'ACH-000691', 'ACH-000196', 'ACH-000111', 'ACH-000276',
    # presumably F1
    'ACH-000721', 'ACH-000148', 'ACH-001389', 'ACH-000621',
    'ACH-000288', 'ACH-000573', 'ACH-000699', 'ACH-001391',
    # presumably F2
    'ACH-001392', 'ACH-001819', 'ACH-000910',
    # presumably F3
    'ACH-000374', 'ACH-001388', 'ACH-001390', 'ACH-001394', 'ACH-000643',
    'ACH-000624', 'ACH-000857', 'ACH-000223', 'ACH-000668',
    # state not stated in this notebook -- verify
    'ACH-000258',
    # presumably F5
    'ACH-000856', 'ACH-000212', 'ACH-000768',
    # state not stated in this notebook -- verify
    'ACH-000849', 'ACH-000536',
]

In [None]:
# For every column of h, record the row position of its largest value.
df1 = h.apply(lambda scores: scores.argmax(), axis=0).to_frame()
df1.columns = ['state']
df1['state'] = df1['state'].astype(str)
# Translate the position "k" into the label "Fk" (positions 0 through 8).
df1['states'] = df1['state'].map({str(k): f'F{k}' for k in range(9)})
# Group the h column labels by assigned label, keyed by the columns of w.
dict_cell_lines = {
    factor: df1[df1.states == factor].index.tolist()
    for factor in w.columns.tolist()
}

In [None]:
# Keep only the CRISPR rows whose index is one of the listed cell lines (row order follows df_crispr).
df_crispr_tnbc = df_crispr[df_crispr.index.isin(cell_lines_order)]

In [None]:
# z-score within each column (axis=0) so that the computed z-scores match h.html
h2 = pd.DataFrame(
    sp.stats.zscore(h, axis=0),
    index=h.index,
    columns=h.columns,
)

In [None]:
#For F0
# z-scored F0 row of h2, restricted to cell lines present in the CRISPR subset, as a one-column frame.
df_f0 = h2.loc[h2.index == "F0", h2.columns.isin(df_crispr_tnbc.index)].T
df_merged_f0 = df_crispr_tnbc.merge(df_f0, left_index=True, right_index=True)

last_col = df_merged_f0.iloc[:, -1]

# Pearson correlation of every column except the last one against the F0 score,
# computed on the rows where both values are finite.
correlations = {}
for gene in df_merged_f0.columns[:-1]:
    pair = df_merged_f0[[gene, "F0"]].replace([np.inf, -np.inf], np.nan).dropna()
    correlations[gene] = pearsonr(pair[gene], pair["F0"])

# One row per tested column; the "Drug" column holds the CRISPR column label.
correlation_df_f0 = pd.DataFrame(
    [(gene, result[0], result[1]) for gene, result in correlations.items()],
    columns=['Drug', 'Correlation', 'P-value'],
)

In [None]:
# Tag every F0 result row with the state label, its description and the same list of representative genes.
correlation_df_f0 = correlation_df_f0.assign(
    CCLE_model_state="F0",
    State_description="partial-EMT",
    Salient_representative_genes=[["ELF3", "GRHL2", "CLDN4", "DSP", "CGN", "DSC2", "ALDH1A3","JAG2", "NOTCH1", "HES4", "HEY2", "NOTCH2", "NOTCH3"]] * len(correlation_df_f0),
)

In [None]:
# Show every row: rows with P-value below 0.05 and correlation below -0.6.
pd.set_option('display.max_rows', None)
correlation_df_f0[correlation_df_f0['P-value'].lt(0.05) & correlation_df_f0['Correlation'].lt(-0.6)]

In [None]:
# Box plot of CRISPR scores for selected genes, split by membership in the ID list below
# (Group 1 = listed cell lines, Group 0 = every other line in df_merged_f0).
group_ids = ["ACH-000691", "ACH-000196", "ACH-000276", "ACH-000111"]

# Build one long-format frame per gene and concatenate once:
# DataFrame.append was removed in pandas 2.0, and building fresh frames avoids
# writing into a slice of df_merged_f0 (SettingWithCopyWarning).
frames = []
for i in ["RBPJ", "MIB1"]:
    frames.append(pd.DataFrame({
        "Value": df_merged_f0[i],
        "Gene": i,
        "Group": df_merged_f0.index.isin(group_ids).astype(int),
    }))
df_tmpt_all = pd.concat(frames, ignore_index=True)

# A single figure; the previous per-gene plt.figure() calls only left empty figures behind.
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_tmpt_all, x='Gene', y='Value', hue='Group')
plt.title('F0 essential genes')
plt.xlabel('Gene')
plt.ylabel('CRISPR score')
plt.grid(False)
#plt.savefig("/Volumes/SSD_Yue/TNBC_paper_Fig2_10252024/F0_CRISPR_essential_genes_MIB1_11032024.svg", format="svg")

In [None]:
#For F1
# z-scored F1 row of h2, restricted to cell lines present in the CRISPR subset, as a one-column frame.
df_f1 = h2.loc[h2.index == "F1", h2.columns.isin(df_crispr_tnbc.index)].T
df_merged_f1 = df_crispr_tnbc.merge(df_f1, left_index=True, right_index=True)

last_col = df_merged_f1.iloc[:, -1]

# Pearson correlation of every column except the last one against the F1 score,
# computed on the rows where both values are finite.
correlations = {}
for gene in df_merged_f1.columns[:-1]:
    pair = df_merged_f1[[gene, "F1"]].replace([np.inf, -np.inf], np.nan).dropna()
    correlations[gene] = pearsonr(pair[gene], pair["F1"])

# One row per tested column; the "Drug" column holds the CRISPR column label.
correlation_df_f1 = pd.DataFrame(
    [(gene, result[0], result[1]) for gene, result in correlations.items()],
    columns=['Drug', 'Correlation', 'P-value'],
)

In [None]:
# Tag every F1 result row with the state label, its description and the same list of representative genes.
correlation_df_f1 = correlation_df_f1.assign(
    CCLE_model_state="F1",
    State_description="MYC-high & EMT & hypoxia | glycolysis",
    Salient_representative_genes=[["MYC", "TGFBI", "SNAI2", "MMP2", "FN1", "CDH2", "VIM", "GSTP1", "LDHB"]] * len(correlation_df_f1),
)

In [None]:
# Rows with P-value below 0.05 and correlation below -0.5.
correlation_df_f1[correlation_df_f1['P-value'].lt(0.05) & correlation_df_f1['Correlation'].lt(-0.5)]

In [None]:
# Box plot of CRISPR scores for selected genes, split by membership in the ID list below
# (Group 1 = listed cell lines, Group 0 = every other line in df_merged_f1).
group_ids = ["ACH-000721", "ACH-000148", "ACH-001389", "ACH-000621", "ACH-000288", "ACH-000573", "ACH-000699", "ACH-001391"]

# Build one long-format frame per gene and concatenate once:
# DataFrame.append was removed in pandas 2.0, and building fresh frames avoids
# writing into a slice of df_merged_f1 (SettingWithCopyWarning).
frames = []
for i in ["CDH2", "ITGAV"]:
    frames.append(pd.DataFrame({
        "Value": df_merged_f1[i],
        "Gene": i,
        "Group": df_merged_f1.index.isin(group_ids).astype(int),
    }))
df_tmpt_all = pd.concat(frames, ignore_index=True)

# A single figure; the previous per-gene plt.figure() calls only left empty figures behind.
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_tmpt_all, x='Gene', y='Value', hue='Group')
plt.title('F1 essential genes')
plt.xlabel('Gene')
plt.ylabel('CRISPR score')
plt.grid(False)
plt.savefig("/Volumes/SSD_Yue/TNBC_paper_Fig2_10252024/F1_CRISPR_essential_genes_10252024.svg", format="svg")

In [None]:
#For F2
# z-scored F2 row of h2, restricted to cell lines present in the CRISPR subset, as a one-column frame.
df_f2 = h2.loc[h2.index=="F2", h2.columns.isin(df_crispr_tnbc.index)].transpose()
# The previous lookup used "F2_y", which only exists when the CRISPR table already
# has a column named "F2". With the default suffixes that CRISPR column was relabelled
# "F2_x" and exported under that name. An empty left suffix keeps it as "F2", while the
# state score is still exposed as "F2_y".
df_merged_f2 = pd.merge(df_crispr_tnbc, df_f2, left_index=True, right_index=True, suffixes=("", "_y"))

# The state score is the last merged column whether or not a suffix was applied.
state_col = df_merged_f2.columns[-1]

# Pearson correlation of every other column against the state score,
# computed on the rows where both values are finite.
correlations = {}
for col in df_merged_f2.columns[:-1]:
    df_tmpt = df_merged_f2[[col, state_col]].replace([np.inf, -np.inf], np.nan).dropna()
    correlations[col] = pearsonr(df_tmpt[col], df_tmpt[state_col])

# One row per tested column; the "Drug" column holds the CRISPR column label.
correlation_df_f2 = pd.DataFrame(
    [(gene, result[0], result[1]) for gene, result in correlations.items()],
    columns=['Drug', 'Correlation', 'P-value'],
)

In [None]:
# Tag every F2 result row with the state label, its description and the same list of representative genes.
correlation_df_f2 = correlation_df_f2.assign(
    CCLE_model_state="F2",
    State_description="luminal-like & PI3K-Akt signaling",
    Salient_representative_genes=[["CD24", "KRT19", "NRG3", "ERBB3", "TSPAN1", "FGFR4"]] * len(correlation_df_f2),
)

In [None]:
# Box plot of CRISPR scores for selected genes, split by membership in the ID list below
# (Group 1 = listed cell lines, Group 0 = every other line in df_merged_f2).
# NOTE(review): the first ID used to read "ACH-001932", which is not in cell_lines_order
# and therefore could never match a row of the TNBC subset. cell_lines_order contains
# "ACH-001392" next to the other two IDs, so this is treated as a digit transposition -- confirm.
group_ids = ["ACH-001392", "ACH-001819", "ACH-000910"]

# Build one long-format frame per gene and concatenate once:
# DataFrame.append was removed in pandas 2.0, and building fresh frames avoids
# writing into a slice of df_merged_f2 (SettingWithCopyWarning).
frames = []
for i in ["PIK3CA","FOXA1", "GATA3"]:
    frames.append(pd.DataFrame({
        "Value": df_merged_f2[i],
        "Gene": i,
        "Group": df_merged_f2.index.isin(group_ids).astype(int),
    }))
df_tmpt_all = pd.concat(frames, ignore_index=True)

# A single figure; the previous per-gene plt.figure() calls only left empty figures behind.
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_tmpt_all, x='Gene', y='Value', hue='Group')
plt.title('F2 essential genes')
plt.xlabel('Gene')
plt.ylabel('CRISPR score')
plt.grid(False)
plt.savefig("/Volumes/SSD_Yue/TNBC_paper_Fig2_10252024/F2_CRISPR_essential_genes_10252024.svg", format="svg")

In [None]:
#For F3
# z-scored F3 row of h2, restricted to cell lines present in the CRISPR subset, as a one-column frame.
df_f3 = h2.loc[h2.index=="F3", h2.columns.isin(df_crispr_tnbc.index)].transpose()
# The previous lookup used "F3_y", which only exists when the CRISPR table already
# has a column named "F3". With the default suffixes that CRISPR column was relabelled
# "F3_x" and exported under that name. An empty left suffix keeps it as "F3", while the
# state score is still exposed as "F3_y".
df_merged_f3 = pd.merge(df_crispr_tnbc, df_f3, left_index=True, right_index=True, suffixes=("", "_y"))

# The state score is the last merged column whether or not a suffix was applied.
state_col = df_merged_f3.columns[-1]

# Pearson correlation of every other column against the state score,
# computed on the rows where both values are finite.
correlations = {}
for col in df_merged_f3.columns[:-1]:
    df_tmpt = df_merged_f3[[col, state_col]].replace([np.inf, -np.inf], np.nan).dropna()
    correlations[col] = pearsonr(df_tmpt[col], df_tmpt[state_col])

# One row per tested column; the "Drug" column holds the CRISPR column label.
correlation_df_f3 = pd.DataFrame(
    [(gene, result[0], result[1]) for gene, result in correlations.items()],
    columns=['Drug', 'Correlation', 'P-value'],
)

In [None]:
# Tag every F3 result row with the state label, its description and the same list of representative genes.
correlation_df_f3 = correlation_df_f3.assign(
    CCLE_model_state="F3",
    State_description="EGFR signaling & NF-kB signaling",
    Salient_representative_genes=[["AREG", "TGFA", "BTC", "EPGN", "EFEMP1", "TNS4", "GPR87", "IL18", "TNIP2", "TNF", "TNFAIP2"]] * len(correlation_df_f3),
)

In [None]:
# Show every row: rows with P-value below 0.05 and correlation below -0.5.
pd.set_option('display.max_rows', None)
correlation_df_f3[correlation_df_f3['P-value'].lt(0.05) & correlation_df_f3['Correlation'].lt(-0.5)]

In [None]:
# Box plot of CRISPR scores for selected genes, split by membership in the ID list below
# (Group 1 = listed cell lines, Group 0 = every other line in df_merged_f3).
group_ids = ["ACH-000374", "ACH-001388", "ACH-001390", "ACH-001394", "ACH-000643","ACH-000624", "ACH-000857", "ACH-000223", "ACH-000668"]

# Build one long-format frame per gene and concatenate once:
# DataFrame.append was removed in pandas 2.0, and building fresh frames avoids
# writing into a slice of df_merged_f3 (SettingWithCopyWarning).
frames = []
for i in ["EGFR", "TSG101", "TRAF2"]:
    frames.append(pd.DataFrame({
        "Value": df_merged_f3[i],
        "Gene": i,
        "Group": df_merged_f3.index.isin(group_ids).astype(int),
    }))
df_tmpt_all = pd.concat(frames, ignore_index=True)

# A single figure; the previous per-gene plt.figure() calls only left empty figures behind.
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_tmpt_all, x='Gene', y='Value', hue='Group')
plt.title('F3 essential genes')
plt.xlabel('Gene')
plt.ylabel('CRISPR score')
plt.grid(False)
plt.savefig("/Volumes/SSD_Yue/TNBC_paper_Fig2_10252024/F3_CRISPR_essential_genes_10252024.svg", format="svg")

In [None]:
#For F5
# z-scored F5 row of h2, restricted to cell lines present in the CRISPR subset, as a one-column frame.
df_f5 = h2.loc[h2.index=="F5", h2.columns.isin(df_crispr_tnbc.index)].transpose()
# The previous lookup used "F5_y", which only exists when the CRISPR table already
# has a column named "F5". With the default suffixes that CRISPR column was relabelled
# "F5_x" and exported under that name. An empty left suffix keeps it as "F5", while the
# state score is still exposed as "F5_y".
df_merged_f5 = pd.merge(df_crispr_tnbc, df_f5, left_index=True, right_index=True, suffixes=("", "_y"))

# The state score is the last merged column whether or not a suffix was applied.
state_col = df_merged_f5.columns[-1]

# Pearson correlation of every other column against the state score,
# computed on the rows where both values are finite.
correlations = {}
for col in df_merged_f5.columns[:-1]:
    df_tmpt = df_merged_f5[[col, state_col]].replace([np.inf, -np.inf], np.nan).dropna()
    correlations[col] = pearsonr(df_tmpt[col], df_tmpt[state_col])

# One row per tested column; the "Drug" column holds the CRISPR column label.
correlation_df_f5 = pd.DataFrame(
    [(gene, result[0], result[1]) for gene, result in correlations.items()],
    columns=['Drug', 'Correlation', 'P-value'],
)

In [None]:
# Tag every F5 result row with the state label, its description and the same list of representative genes.
correlation_df_f5 = correlation_df_f5.assign(
    CCLE_model_state="F5",
    State_description="DNA damage response",
    Salient_representative_genes=[["SLFN12", "ZNF185", "GSDME", "ERCC2"]] * len(correlation_df_f5),
)

In [None]:
# Show every row: rows with P-value below 0.05 and correlation below -0.5.
pd.set_option('display.max_rows', None)
correlation_df_f5[correlation_df_f5['P-value'].lt(0.05) & correlation_df_f5['Correlation'].lt(-0.5)]

In [None]:
# Box plot of CRISPR scores for selected genes, split by membership in the ID list below
# (Group 1 = listed cell lines, Group 0 = every other line in df_merged_f5).
group_ids = ["ACH-000856", "ACH-000212", "ACH-000768"]

# Build one long-format frame per gene and concatenate once:
# DataFrame.append was removed in pandas 2.0, and building fresh frames avoids
# writing into a slice of df_merged_f5 (SettingWithCopyWarning).
frames = []
for i in ["PPAT", "PAICS", "ATIC"]:
    frames.append(pd.DataFrame({
        "Value": df_merged_f5[i],
        "Gene": i,
        "Group": df_merged_f5.index.isin(group_ids).astype(int),
    }))
df_tmpt_all = pd.concat(frames, ignore_index=True)

# A single figure; the previous per-gene plt.figure() calls only left empty figures behind.
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_tmpt_all, x='Gene', y='Value', hue='Group')
plt.title('F5 essential genes')
plt.xlabel('Gene')
plt.ylabel('CRISPR score')
plt.grid(False)
plt.savefig("/Volumes/SSD_Yue/TNBC_paper_Fig2_10252024/F5_CRISPR_essential_genes_10252024.svg", format="svg")

In [None]:
#For F6
# z-scored F6 row of h2, restricted to cell lines present in the CRISPR subset, as a one-column frame.
df_f6 = h2.loc[h2.index == "F6", h2.columns.isin(df_crispr_tnbc.index)].T
df_merged_f6 = df_crispr_tnbc.merge(df_f6, left_index=True, right_index=True)

last_col = df_merged_f6.iloc[:, -1]

# Pearson correlation of every column except the last one against the F6 score,
# computed on the rows where both values are finite.
correlations = {}
for gene in df_merged_f6.columns[:-1]:
    pair = df_merged_f6[[gene, "F6"]].replace([np.inf, -np.inf], np.nan).dropna()
    correlations[gene] = pearsonr(pair[gene], pair["F6"])

# One row per tested column; the "Drug" column holds the CRISPR column label.
correlation_df_f6 = pd.DataFrame(
    [(gene, result[0], result[1]) for gene, result in correlations.items()],
    columns=['Drug', 'Correlation', 'P-value'],
)

In [None]:
# Tag every F6 result row with the state label, its description and the same list of representative genes.
correlation_df_f6 = correlation_df_f6.assign(
    CCLE_model_state="F6",
    State_description="MAGE protein family",
    Salient_representative_genes=[["MAGEA2", "MAGEA2B", "MAGEA3", "MAGEA6", "MAGEA12"]] * len(correlation_df_f6),
)

In [None]:
# Rows with P-value below 0.05 and correlation below -0.6.
correlation_df_f6[correlation_df_f6['P-value'].lt(0.05) & correlation_df_f6['Correlation'].lt(-0.6)]

In [None]:
# Stack the per-state correlation tables (F0, F1, F2, F3, F5, F6) into one frame with a fresh 0..n-1 index.
per_state_tables = [
    correlation_df_f0,
    correlation_df_f1,
    correlation_df_f2,
    correlation_df_f3,
    correlation_df_f5,
    correlation_df_f6,
]
df_concat = pd.concat(per_state_tables, ignore_index=True)

In [None]:
# Write the combined table (row index included) to the current working directory.
df_concat.to_csv(
    "CCLE_NMF_state_score_association_with_DepMap_CRISPR_09172024.csv",
    index=True,
)