In [None]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import seaborn as sns
import statsmodels.api as sm
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist, squareform
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score
from scipy.stats import t

In [None]:
# Load the CTD^2 drug-sensitivity AUC table, use its first column as the row index,
# and strip everything from the first " (" onwards in each column label.
ctrp_raw = pd.read_csv("../../Breast_cancer_heterogeneity_project/Drug_sensitivity_AUC_(CTD^2).csv")
df_ctrp = (
    ctrp_raw
    .set_index(ctrp_raw.iloc[:, 0])
    .drop(columns='Unnamed: 0')
    .rename(columns=lambda label: label.split(' (')[0])
)

In [None]:
# Tab-separated table read with its first column as the index; later cells select
# rows "F0".."F6" from it and match its columns against the CTRP row labels.
h = pd.read_csv(
    "/Volumes/SSD_Yue/NMF_Onco_gps_results/NMF_using_524_genes_selected_using_pan_cancer_cell_lines_including_MYC_on_30_updated_TNBC_cell_lines_normalized_for_samples_12222022/7/h.tsv",
    sep='\t',
    index_col=0,
)

In [None]:
# Identifiers (ACH-...) of the 30 cell lines used to subset the CTRP table below.
cell_lines_order = [
    'ACH-000691', 'ACH-000196', 'ACH-000111', 'ACH-000276', 'ACH-000721',
    'ACH-000148', 'ACH-001389', 'ACH-000621', 'ACH-000288', 'ACH-000573',
    'ACH-000699', 'ACH-001391', 'ACH-001392', 'ACH-001819', 'ACH-000910',
    'ACH-000374', 'ACH-001388', 'ACH-001390', 'ACH-001394', 'ACH-000643',
    'ACH-000624', 'ACH-000857', 'ACH-000223', 'ACH-000668', 'ACH-000258',
    'ACH-000856', 'ACH-000212', 'ACH-000768', 'ACH-000849', 'ACH-000536',
]

In [None]:
# Keep only the rows for the selected cell lines.
in_panel = df_ctrp.index.isin(cell_lines_order)
df_ctrp_tnbc = df_ctrp.loc[in_panel]

# Count missing values per compound and retain compounds with at most 10 of them.
nan_counts = df_ctrp_tnbc.isna().sum()
columns_to_keep = nan_counts[nan_counts <= 10].index
df_ctrp_tnbc_cleaned = df_ctrp_tnbc[columns_to_keep]

In [None]:
# z-score each column of h (axis=0), so that the computed z-scores match h.html;
# the original row and column labels are carried over unchanged.
h2 = pd.DataFrame(
    np.asarray(stats.zscore(h, axis=0)),
    index=h.index,
    columns=h.columns,
)

In [None]:
# For F0: Pearson correlation between each compound's AUC and the z-scored F0 row of h2,
# computed over the cell lines present in both tables.
df_f0 = h2.loc[h2.index == "F0", h2.columns.isin(df_ctrp_tnbc.index)].transpose()
df_merged_f0 = pd.merge(df_ctrp_tnbc_cleaned, df_f0, left_index=True, right_index=True)

# Pick the compound columns by name rather than assuming the state column is last.
correlations = {}
for col in df_merged_f0.columns.drop("F0"):
    # Pairwise-complete observations only: infinities are treated as missing.
    df_tmpt = df_merged_f0[[col, "F0"]].replace([np.inf, -np.inf], np.nan).dropna()
    if len(df_tmpt) < 2:
        # pearsonr raises ValueError below two paired points; record NaN instead
        # of aborting the whole loop for one sparse compound.
        correlations[col] = (np.nan, np.nan)
        continue
    stat, pval = pearsonr(df_tmpt[col], df_tmpt["F0"])
    correlations[col] = (stat, pval)

# One row per compound: name, Pearson r, two-sided p-value (not multiplicity-adjusted).
correlation_df_f0 = pd.DataFrame(
    [(drug, stat, pval) for drug, (stat, pval) in correlations.items()],
    columns=['Drug', 'Correlation', 'P-value'],
)

In [None]:
# Attach the state label, its description and its representative gene list to every F0 row.
f0_genes = ["ELF3", "GRHL2", "CLDN4", "DSP", "CGN", "DSC2", "ALDH1A3","JAG2", "NOTCH1", "HES4", "HEY2", "NOTCH2", "NOTCH3"]
correlation_df_f0 = correlation_df_f0.assign(
    CCLE_model_state="F0",
    State_description="partial-EMT & NOTCH signaling",
    Salient_representative_genes=[f0_genes] * len(correlation_df_f0),
)

In [None]:
# Compounds whose AUC is negatively correlated with the F0 score at p < 0.05.
correlation_df_f0[
    correlation_df_f0['P-value'].lt(0.05) & correlation_df_f0['Correlation'].lt(0)
]

In [None]:
# Scatter of CIL70 AUC (x) against the z-scored F0 score (y) with a least-squares line.
df_tmpt = df_merged_f0[["CIL70", "F0"]].replace([np.inf, -np.inf], np.nan).dropna()
x, y = df_tmpt.iloc[:, 0], df_tmpt.iloc[:, 1]

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x, y, color='black', marker='o', s=200)

# Only the slope and intercept of the fit are needed to draw the line.
slope, intercept, *_ = stats.linregress(x, y)
regression_line = slope * x + intercept
ax.plot(x, regression_line, color='red', label="Regression Line", linewidth=5)

ax.set_xlabel('compound AUC')
ax.set_ylabel('F0 expression')
ax.grid(False)
#plt.savefig('/Volumes/SSD_Yue/TNBC_paper_Fig2_10252024/CIL70_AUC_vs_F0_zscore_12112024.svg',dpi=300)
plt.show()

In [None]:
# For F1: Pearson correlation between each compound's AUC and the z-scored F1 row of h2,
# computed over the cell lines present in both tables.
df_f1 = h2.loc[h2.index == "F1", h2.columns.isin(df_ctrp_tnbc.index)].transpose()
df_merged_f1 = pd.merge(df_ctrp_tnbc_cleaned, df_f1, left_index=True, right_index=True)

# Pick the compound columns by name rather than assuming the state column is last.
correlations = {}
for col in df_merged_f1.columns.drop("F1"):
    # Pairwise-complete observations only: infinities are treated as missing.
    df_tmpt = df_merged_f1[[col, "F1"]].replace([np.inf, -np.inf], np.nan).dropna()
    if len(df_tmpt) < 2:
        # pearsonr raises ValueError below two paired points; record NaN instead
        # of aborting the whole loop for one sparse compound.
        correlations[col] = (np.nan, np.nan)
        continue
    stat, pval = pearsonr(df_tmpt[col], df_tmpt["F1"])
    correlations[col] = (stat, pval)

# One row per compound: name, Pearson r, two-sided p-value (not multiplicity-adjusted).
correlation_df_f1 = pd.DataFrame(
    [(drug, stat, pval) for drug, (stat, pval) in correlations.items()],
    columns=['Drug', 'Correlation', 'P-value'],
)

In [None]:
# Attach the state label, its description and its representative gene list to every F1 row.
f1_genes = ["MYC", "TGFBI", "SNAI2", "MMP2", "FN1", "CDH2", "VIM", "GSTP1", "LDHB"]
correlation_df_f1 = correlation_df_f1.assign(
    CCLE_model_state="F1",
    State_description="MYC-high & EMT & hypoxia | glycolysis",
    Salient_representative_genes=[f1_genes] * len(correlation_df_f1),
)

In [None]:
# Scatter of CBB-1007 AUC (x) against the z-scored F1 score (y) with a least-squares line.
df_tmpt = df_merged_f1[["CBB-1007", "F1"]].replace([np.inf, -np.inf], np.nan).dropna()
x, y = df_tmpt.iloc[:, 0], df_tmpt.iloc[:, 1]

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x, y, color='black', marker='o', s=200)

# Only the slope and intercept of the fit are needed to draw the line.
slope, intercept, *_ = stats.linregress(x, y)
regression_line = slope * x + intercept
ax.plot(x, regression_line, color='red', label="Regression Line", linewidth=5)

ax.set_xlabel('compound AUC')
ax.set_ylabel('F1 expression')
ax.grid(False)
#plt.savefig('/Volumes/SSD_Yue/TNBC_paper_Fig2_10252024/CBB-1007_AUC_vs_F1_zscore_12112024.svg',dpi=300)
plt.show()

In [None]:
# For F2: Pearson correlation between each compound's AUC and the z-scored F2 row of h2,
# computed over the cell lines present in both tables.
df_f2 = h2.loc[h2.index == "F2", h2.columns.isin(df_ctrp_tnbc.index)].transpose()
df_merged_f2 = pd.merge(df_ctrp_tnbc_cleaned, df_f2, left_index=True, right_index=True)

# Pick the compound columns by name rather than assuming the state column is last.
correlations = {}
for col in df_merged_f2.columns.drop("F2"):
    # Pairwise-complete observations only: infinities are treated as missing.
    df_tmpt = df_merged_f2[[col, "F2"]].replace([np.inf, -np.inf], np.nan).dropna()
    if len(df_tmpt) < 2:
        # pearsonr raises ValueError below two paired points; record NaN instead
        # of aborting the whole loop for one sparse compound.
        correlations[col] = (np.nan, np.nan)
        continue
    stat, pval = pearsonr(df_tmpt[col], df_tmpt["F2"])
    correlations[col] = (stat, pval)

# One row per compound: name, Pearson r, two-sided p-value (not multiplicity-adjusted).
correlation_df_f2 = pd.DataFrame(
    [(drug, stat, pval) for drug, (stat, pval) in correlations.items()],
    columns=['Drug', 'Correlation', 'P-value'],
)

In [None]:
# Attach the state label, its description and its representative gene list to every F2 row.
f2_genes = ["CD24", "KRT19", "NRG3", "ERBB3", "TSPAN1", "FGFR4"]
correlation_df_f2 = correlation_df_f2.assign(
    CCLE_model_state="F2",
    State_description="luminal-like & PI3K-Akt signaling",
    Salient_representative_genes=[f2_genes] * len(correlation_df_f2),
)

In [None]:
# Lift the row-display limit so the whole filtered table is shown.
# The full option name is used: abbreviated keys such as 'display.max_row' only work
# through pandas' pattern matching and can break if a similarly named option is added.
# Note that this setting is global and stays in effect for every later cell.
pd.set_option('display.max_rows', None)

# Compounds whose AUC is negatively correlated with the F2 score at p < 0.05.
correlation_df_f2.loc[(correlation_df_f2['P-value'] < 0.05) & (correlation_df_f2['Correlation'] < 0),]

In [None]:
# Scatter of PIK-93 AUC (x) against the z-scored F2 score (y) with a least-squares line.
df_tmpt = df_merged_f2[["PIK-93", "F2"]].replace([np.inf, -np.inf], np.nan).dropna()
x, y = df_tmpt.iloc[:, 0], df_tmpt.iloc[:, 1]

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x, y, color='blue', marker='o', s=200)

# Only the slope and intercept of the fit are needed to draw the line.
slope, intercept, *_ = stats.linregress(x, y)
regression_line = slope * x + intercept
ax.plot(x, regression_line, color='red', label="Regression Line", linewidth=5)

ax.set_xlabel('compound AUC')
ax.set_ylabel('F2 score')
ax.grid(False)
# NOTE(review): the disabled output name below says PI-103, but this cell plots PIK-93 — confirm before re-enabling.
#plt.savefig('/Volumes/SSD_Yue/TNBC_paper_Fig2_10252024/PI-103_CCLE_F2_score_10272024.svg',dpi=300)
plt.show()

In [None]:
# Names of the compounds negatively correlated with the F2 score at p < 0.05.
correlation_df_f2.loc[
    correlation_df_f2['P-value'].lt(0.05) & correlation_df_f2['Correlation'].lt(0),
    'Drug',
].tolist()

In [None]:
# For F3: Pearson correlation between each compound's AUC and the z-scored F3 row of h2,
# computed over the cell lines present in both tables.
df_f3 = h2.loc[h2.index == "F3", h2.columns.isin(df_ctrp_tnbc.index)].transpose()
df_merged_f3 = pd.merge(df_ctrp_tnbc_cleaned, df_f3, left_index=True, right_index=True)

# Pick the compound columns by name rather than assuming the state column is last.
correlations = {}
for col in df_merged_f3.columns.drop("F3"):
    # Pairwise-complete observations only: infinities are treated as missing.
    df_tmpt = df_merged_f3[[col, "F3"]].replace([np.inf, -np.inf], np.nan).dropna()
    if len(df_tmpt) < 2:
        # pearsonr raises ValueError below two paired points; record NaN instead
        # of aborting the whole loop for one sparse compound.
        correlations[col] = (np.nan, np.nan)
        continue
    stat, pval = pearsonr(df_tmpt[col], df_tmpt["F3"])
    correlations[col] = (stat, pval)

# One row per compound: name, Pearson r, two-sided p-value (not multiplicity-adjusted).
correlation_df_f3 = pd.DataFrame(
    [(drug, stat, pval) for drug, (stat, pval) in correlations.items()],
    columns=['Drug', 'Correlation', 'P-value'],
)

In [None]:
# Attach the state label, its description and its representative gene list to every F3 row.
f3_genes = ["AREG", "TGFA", "BTC", "EPGN", "EFEMP1", "TNS4", "GPR87", "IL18", "TNIP2", "TNF", "TNFAIP2"]
correlation_df_f3 = correlation_df_f3.assign(
    CCLE_model_state="F3",
    State_description="EGFR signaling & NF-kB signaling",
    Salient_representative_genes=[f3_genes] * len(correlation_df_f3),
)

In [None]:
# Compounds whose AUC is negatively correlated with the F3 score at p < 0.1.
# NOTE(review): the cut-off here is 0.1 while the other states use 0.05 — confirm this is intended.
correlation_df_f3[
    correlation_df_f3['P-value'].lt(0.1) & correlation_df_f3['Correlation'].lt(0)
]

In [None]:
# Scatter of erlotinib AUC (x) against the z-scored F3 score (y) with a least-squares line.
df_tmpt = df_merged_f3[["erlotinib", "F3"]].replace([np.inf, -np.inf], np.nan).dropna()
x, y = df_tmpt.iloc[:, 0], df_tmpt.iloc[:, 1]

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x, y, color='black', marker='o', s=200)

# Only the slope and intercept of the fit are needed to draw the line.
slope, intercept, *_ = stats.linregress(x, y)
regression_line = slope * x + intercept
ax.plot(x, regression_line, color='red', label="Regression Line", linewidth=5)

ax.set_xlabel('compound AUC')
ax.set_ylabel('F3 expression')
ax.grid(False)
# NOTE(review): the disabled output name below mentions EGFR expression, but the y-axis here is the F3 score — confirm before re-enabling.
#plt.savefig('/Volumes/SSD_Yue/TNBC_paper_Fig2_10252024/Erlotinib_AUC_vs_EGFR_expression_11152024.svg',dpi=300)
plt.show()

In [None]:
# Scatter of vandetanib AUC (x) against the z-scored F3 score (y) with a least-squares line.
df_tmpt = df_merged_f3[["vandetanib", "F3"]].replace([np.inf, -np.inf], np.nan).dropna()
x, y = df_tmpt.iloc[:, 0], df_tmpt.iloc[:, 1]

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x, y, color='blue', marker='o', s=200)

# Only the slope and intercept of the fit are needed to draw the line.
slope, intercept, *_ = stats.linregress(x, y)
regression_line = slope * x + intercept
ax.plot(x, regression_line, color='red', label="Regression Line", linewidth=5)

ax.set_xlabel('compound AUC')
ax.set_ylabel('F3 score')
ax.grid(False)
# NOTE(review): writes to a hard-coded absolute path; this cell fails if that directory is not available.
fig.savefig('/Volumes/SSD_Yue/TNBC_paper_Fig2_10252024/vandetanib_CCLE_F3_score_10272024.svg', dpi=300)
plt.show()

In [None]:
# For F5: Pearson correlation between each compound's AUC and the z-scored F5 row of h2,
# computed over the cell lines present in both tables.
df_f5 = h2.loc[h2.index == "F5", h2.columns.isin(df_ctrp_tnbc.index)].transpose()
df_merged_f5 = pd.merge(df_ctrp_tnbc_cleaned, df_f5, left_index=True, right_index=True)

# Pick the compound columns by name rather than assuming the state column is last.
correlations = {}
for col in df_merged_f5.columns.drop("F5"):
    # Pairwise-complete observations only: infinities are treated as missing.
    df_tmpt = df_merged_f5[[col, "F5"]].replace([np.inf, -np.inf], np.nan).dropna()
    if len(df_tmpt) < 2:
        # pearsonr raises ValueError below two paired points; record NaN instead
        # of aborting the whole loop for one sparse compound.
        correlations[col] = (np.nan, np.nan)
        continue
    stat, pval = pearsonr(df_tmpt[col], df_tmpt["F5"])
    correlations[col] = (stat, pval)

# One row per compound: name, Pearson r, two-sided p-value (not multiplicity-adjusted).
correlation_df_f5 = pd.DataFrame(
    [(drug, stat, pval) for drug, (stat, pval) in correlations.items()],
    columns=['Drug', 'Correlation', 'P-value'],
)

In [None]:
# Attach the state label, its description and its representative gene list to every F5 row.
f5_genes = ["SLFN12", "ZNF185", "GSDME", "ERCC2"]
correlation_df_f5 = correlation_df_f5.assign(
    CCLE_model_state="F5",
    State_description="EMT & DNA damage response",
    Salient_representative_genes=[f5_genes] * len(correlation_df_f5),
)

In [None]:
# Names of the compounds negatively correlated with the F5 score at p < 0.05.
correlation_df_f5.loc[
    correlation_df_f5['P-value'].lt(0.05) & correlation_df_f5['Correlation'].lt(0),
    'Drug',
].tolist()

In [None]:
# Scatter of thalidomide AUC (x) against the z-scored F5 score (y) with a least-squares line.
df_tmpt = df_merged_f5[["thalidomide", "F5"]].replace([np.inf, -np.inf], np.nan).dropna()
x, y = df_tmpt.iloc[:, 0], df_tmpt.iloc[:, 1]

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x, y, color='black', marker='o', s=200)

# Only the slope and intercept of the fit are needed to draw the line.
slope, intercept, *_ = stats.linregress(x, y)
regression_line = slope * x + intercept
ax.plot(x, regression_line, color='red', label="Regression Line", linewidth=5)

ax.set_xlabel('compound AUC')
ax.set_ylabel('F5 expression')
ax.grid(False)
# NOTE(review): writes to a hard-coded absolute path; this cell fails if that directory is not available.
fig.savefig('/Volumes/SSD_Yue/TNBC_paper_Fig2_10252024/Thalidomide_AUC_vs_F5_zscore_12112024.svg', dpi=300)
plt.show()

In [None]:
# For F6: Pearson correlation between each compound's AUC and the z-scored F6 row of h2,
# computed over the cell lines present in both tables.
df_f6 = h2.loc[h2.index == "F6", h2.columns.isin(df_ctrp_tnbc.index)].transpose()
df_merged_f6 = pd.merge(df_ctrp_tnbc_cleaned, df_f6, left_index=True, right_index=True)

# Pick the compound columns by name rather than assuming the state column is last.
correlations = {}
for col in df_merged_f6.columns.drop("F6"):
    # Pairwise-complete observations only: infinities are treated as missing.
    df_tmpt = df_merged_f6[[col, "F6"]].replace([np.inf, -np.inf], np.nan).dropna()
    if len(df_tmpt) < 2:
        # pearsonr raises ValueError below two paired points; record NaN instead
        # of aborting the whole loop for one sparse compound.
        correlations[col] = (np.nan, np.nan)
        continue
    stat, pval = pearsonr(df_tmpt[col], df_tmpt["F6"])
    correlations[col] = (stat, pval)

# One row per compound: name, Pearson r, two-sided p-value (not multiplicity-adjusted).
correlation_df_f6 = pd.DataFrame(
    [(drug, stat, pval) for drug, (stat, pval) in correlations.items()],
    columns=['Drug', 'Correlation', 'P-value'],
)

In [None]:
# Attach the state label, its description and its representative gene list to every F6 row.
f6_genes = ["MAGEA2", "MAGEA2B", "MAGEA3", "MAGEA6", "MAGEA12"]
correlation_df_f6 = correlation_df_f6.assign(
    CCLE_model_state="F6",
    State_description="MAGE protein family",
    Salient_representative_genes=[f6_genes] * len(correlation_df_f6),
)

In [None]:
# Names of the compounds correlated with the F6 score at p < 0.05 (either sign).
correlation_df_f6.loc[correlation_df_f6['P-value'].lt(0.05), 'Drug'].tolist()

In [None]:
# Stack the per-state association tables (F0-F3, F5, F6) into one frame with a fresh 0..n-1 index.
state_tables = [
    correlation_df_f0,
    correlation_df_f1,
    correlation_df_f2,
    correlation_df_f3,
    correlation_df_f5,
    correlation_df_f6,
]
df_concat = pd.concat(state_tables, ignore_index=True)

In [None]:
# Persist the combined association table to the working directory (row index included).
df_concat.to_csv(
    path_or_buf="CCLE_NMF_state_score_association_with_CTRP_AUC_09172024.csv",
)

In [None]:
# Reload the saved association table. The file was written with its row index as the
# first (unnamed) column, so read that column back as the index instead of letting it
# appear as a spurious 'Unnamed: 0' data column.
# Note: 'Salient_representative_genes' comes back as the text form of each list, not a list.
df_concat = pd.read_csv(
    'CCLE_NMF_state_score_association_with_CTRP_AUC_09172024.csv',
    index_col=0,
)