In [138]:
import pandas as pd
from itertools import combinations
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler
import networkx as nx

from networkx.algorithms import bipartite

In [139]:

# Load the small and the large drug-pair tables.
df_small = pd.read_csv("training_data/df_small[drug-protein].csv")
df_large = pd.read_csv("training_data/df_large[drug-protein].csv")

# Every drug name that occurs on either side of a pair, per table.
avail_drugs_small = set(df_small["Drug1"].unique()).union(df_small["Drug2"].unique())
avail_drugs_large = set(df_large["Drug1"].unique()).union(df_large["Drug2"].unique())
# Distinct cell lines of the small table, in order of first appearance.
avail_cells_small = df_small["Cell line"].drop_duplicates().tolist()


In [140]:
# Drug / cell-line graph: one node per drug and per cell line; every row of
# df_small links both of its drugs to that row's cell line.
DC = nx.Graph()
for node_group in (avail_drugs_small, avail_cells_small):
    DC.add_nodes_from(node_group)

edges1 = list(zip(df_small["Drug1"], df_small["Cell line"]))
edges2 = list(zip(df_small["Drug2"], df_small["Cell line"]))

DC.add_edges_from(edges1 + edges2)

In [177]:
# Inspect the column names and dtypes of df_small.
df_small.dtypes

Drug1                   object
Drug2                   object
Cell line               object
ZIP                    float64
Bliss                  float64
Loewe                  float64
HSA                    float64
classification          object
combination_id          object
chemical_1              object
chemical_2              object
z_score_sum            float64
min_max_score_sum      float64
degree_centrality_1    float64
degree_centrality_2    float64
clustering_1           float64
clustering_2           float64
pagerank_1             float64
pagerank_2             float64
closeness_1            float64
closeness_2            float64
betweenness_1          float64
betweenness_2          float64
dtype: object

In [143]:
def get_all_stats(DC, avail_drugs_small, df_og):
    """Return a copy of ``df_og`` extended with drug-level network statistics.

    ``DC`` is projected onto the nodes in ``avail_drugs_small`` and five node
    metrics are computed on that projection: degree centrality, clustering
    coefficient, PageRank, closeness centrality and betweenness centrality.
    Each metric is looked up for ``Drug1`` and for ``Drug2`` and stored in the
    columns ``dc_<metric>_1`` and ``dc_<metric>_2``. Drugs without an entry in
    a metric get NaN. ``df_og`` itself is not modified.
    """
    drug_graph = bipartite.projected_graph(DC, avail_drugs_small)

    # Dict insertion order determines the order of the appended columns.
    node_metrics = {
        "degree_centrality": nx.degree_centrality(drug_graph),
        "clustering": nx.clustering(drug_graph),
        "pagerank": nx.pagerank(drug_graph),
        "closeness": nx.closeness_centrality(drug_graph),
        "betweenness": nx.betweenness_centrality(drug_graph),
    }

    enriched = df_og.copy()
    for metric_name, value_by_drug in node_metrics.items():
        for drug_column, suffix in (("Drug1", "_1"), ("Drug2", "_2")):
            enriched["dc_" + metric_name + suffix] = enriched[drug_column].map(
                value_by_drug
            )
    return enriched


# Build a copy of df_small that carries the dc_* network-statistic columns.
df_small_extra = get_all_stats(DC, avail_drugs_small, df_small)

chemical_1    0
dtype: int64

In [133]:
# Write the augmented table to disk, leaving the index out of the file.
df_small_extra.to_csv("training_data/df_small[both].csv",index=False)

In [144]:
# Enumerate drug pairs that df_small does not contain yet.
#
# The drugs are sorted before pairing so that every generated pair is in
# alphabetical order, the same normalised form used for `existing_pairs`.
# Pairing straight from the unordered set can emit a pair reversed, in which
# case it would not match its sorted counterpart and an already-tested pair
# would be reported as new. Sorting also makes the output file identical
# from run to run.
all_pairs = list(combinations(sorted(avail_drugs_small), 2))

# A pair counts as existing when df_small holds it for any cell line,
# in either drug order.
existing_pairs = {
    tuple(sorted(pair)) for pair in zip(df_small["Drug1"], df_small["Drug2"])
}
new_pairs = [pair for pair in all_pairs if pair not in existing_pairs]

# Cross every unseen pair with every cell line (cell line varies slowest).
new_data = [
    {"Drug1": drug1, "Drug2": drug2, "Cell line": cell_line}
    for cell_line in avail_cells_small
    for drug1, drug2 in new_pairs
]

# Create a new dataframe from the new pairs; the explicit column list keeps
# the schema intact even when there are no new pairs at all.
df_new_combinations = pd.DataFrame(new_data, columns=["Drug1", "Drug2", "Cell line"])

df_new_combinations.to_csv("new_combinations.csv",index=False)

In [250]:
# Score table keyed by chemical; the code below reads its "chemical",
# "z_score" and "min_max_score" columns.
df_drug_protein = pd.read_csv("filtered_data/df_drug_protein_2(normalized).csv")
# Drug-name lookup table; the code below reads its "Drug" and "cIds" columns.
df_drug_cid = pd.read_csv("filtered_data/df_drug_cid.csv")


def _mean_score(chemical, column):
    """Return the mean of ``column`` over the df_drug_protein rows of ``chemical``.

    The result is NaN when no row matches ``chemical``.
    """
    matches = df_drug_protein.loc[df_drug_protein["chemical"] == chemical, column]
    return matches.mean()


def get_avg_z(row):
    """Average the mean ``z_score`` of the two chemicals in ``row``.

    ``row[0]`` and ``row[1]`` are the two chemical identifiers, looked up in
    the notebook-level ``df_drug_protein`` frame.
    NOTE(review): this indexes by 0/1 while ``get_avg_mm`` indexes by
    "chemical_1"/"chemical_2" -- confirm which row shape callers pass.

    Each chemical is reduced to a scalar before the two are combined. Adding
    the two filtered Series directly aligns them on their row labels, which
    never coincide for different chemicals, so the sum came out as all-NaN.

    Returns a float; NaN if either chemical has no rows in ``df_drug_protein``.
    """
    return (_mean_score(row[0], "z_score") + _mean_score(row[1], "z_score")) / 2


def get_avg_mm(row):
    """Average the mean ``min_max_score`` of ``row``'s two chemicals.

    The chemicals are read from ``row["chemical_1"]`` and ``row["chemical_2"]``
    and looked up in the notebook-level ``df_drug_protein`` frame. Each one is
    reduced to a scalar first, for the same label-alignment reason as in
    ``get_avg_z``.

    Returns a float; NaN if either chemical has no rows in ``df_drug_protein``.
    """
    first = _mean_score(row["chemical_1"], "min_max_score")
    second = _mean_score(row["chemical_2"], "min_max_score")
    return (first + second) / 2


def get_test_data(df_new_comb, DC, avail_drugs, sample_size=100, random_state=None):
    """Sample unseen drug/cell-line combinations and build their feature rows.

    Reads three notebook-level frames: ``df_drug_cid`` (columns ``Drug``,
    ``cIds``), ``df_drug_protein`` (columns ``chemical``, ``z_score``,
    ``min_max_score``) and ``df_small`` (source of the per-drug network
    feature columns listed in ``c1`` / ``c2`` below).

    Parameters
    ----------
    df_new_comb : DataFrame
        Candidate rows with the columns ``Drug1``, ``Drug2``, ``Cell line``.
    DC, avail_drugs
        Accepted for call compatibility; not used. The network features are
        copied from ``df_small`` rather than recomputed from the graph.
    sample_size : int, default 100
        Number of candidate rows to draw; capped at ``len(df_new_comb)``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so a draw can be reproduced.

    Returns
    -------
    DataFrame
        Columns ``Drug1``, ``Drug2``, ``Cell line``, ``z_score_sum``,
        ``min_max_score_sum``, then the ``*_1`` and the ``*_2`` network
        features. Rows with any missing value are dropped and the index is
        renumbered from 0.
    """
    c1 = [
        "Drug1",
        "degree_centrality_1",
        "clustering_1",
        "pagerank_1",
        "closeness_1",
        "betweenness_1",
    ]
    c2 = [
        "Drug2",
        "degree_centrality_2",
        "clustering_2",
        "pagerank_2",
        "closeness_2",
        "betweenness_2",
    ]

    # Never ask for more rows than exist; sample() raises in that case.
    n_rows = min(sample_size, len(df_new_comb))
    sampled = df_new_comb.sample(n=n_rows, random_state=random_state)
    features = sampled[["Drug1", "Drug2", "Cell line"]].reset_index(drop=True)

    # Attach the chemical id of each drug. The lookup columns are renamed up
    # front so that neither merge can produce clashing column names.
    # NOTE(review): as before, Drug1 uses a left join and Drug2 an inner join,
    # so rows whose Drug2 has no id are dropped here -- confirm this asymmetry
    # is intended.
    cid_lookup = df_drug_cid[["Drug", "cIds"]]
    features = features.merge(
        cid_lookup.rename(columns={"Drug": "Drug1", "cIds": "chemical_1"}),
        on="Drug1",
        how="left",
    )
    features = features.merge(
        cid_lookup.rename(columns={"Drug": "Drug2", "cIds": "chemical_2"}),
        on="Drug2",
        how="inner",
    )

    # Mean score per chemical, each score column selected exactly once.
    score_means = df_drug_protein.groupby("chemical", as_index=False)[
        ["z_score", "min_max_score"]
    ].mean()
    for side in ("1", "2"):
        # Renaming to side-specific names avoids the suffix collision with the
        # existing chemical_2 column that a plain suffixed merge runs into.
        side_scores = score_means.rename(
            columns={
                "chemical": "chemical_" + side,
                "z_score": "z_score_" + side,
                "min_max_score": "min_max_score_" + side,
            }
        )
        features = features.merge(side_scores, on="chemical_" + side, how="left")

    # Row-wise mean of the two drugs' scores; a missing side is skipped.
    features["z_score_sum"] = features[["z_score_1", "z_score_2"]].mean(axis=1)
    features["min_max_score_sum"] = features[
        ["min_max_score_1", "min_max_score_2"]
    ].mean(axis=1)
    features = features.drop(
        columns=[
            "chemical_1",
            "chemical_2",
            "z_score_1",
            "z_score_2",
            "min_max_score_1",
            "min_max_score_2",
        ]
    )

    # Copy the network features from df_small. De-duplicating the lookup
    # before the merge gives the same distinct rows without first multiplying
    # every sampled row by the number of times its drug occurs in df_small.
    # NOTE(review): a drug that appears in df_small only as Drug2 has no *_1
    # features, so rows using it as Drug1 are removed by dropna() below (and
    # vice versa) -- confirm that is acceptable.
    for key, columns in (("Drug1", c1), ("Drug2", c2)):
        per_drug = df_small[columns].drop_duplicates()
        features = features.merge(per_drug, on=key, how="left").drop_duplicates()

    return features.dropna().reset_index(drop=True)

    


# Print a one-line summary of the drug / cell-line graph.
print(DC)
# Build feature rows for a sample (default size) of the unseen combinations.
test_data = get_test_data(df_new_combinations, DC, avail_drugs_small)

# Columns that df_small has and test_data does not.
set(df_small.columns) -set(test_data.columns) 

Graph with 292 nodes and 2647 edges


{'Bliss',
 'HSA',
 'Loewe',
 'ZIP',
 'chemical_1',
 'chemical_2',
 'classification',
 'combination_id'}

In [252]:
# Stack the rows of df_small on top of the sampled test rows, then discard
# the columns that test_data does not provide.
df_combined = pd.concat([df_small, test_data])
df_combined = df_combined.drop(
    [
        "Bliss",
        "HSA",
        "Loewe",
        "ZIP",
        "chemical_1",
        "chemical_2",
        "classification",
        "combination_id",
    ],
    axis="columns",
)

df_combined

Unnamed: 0,Drug1,Drug2,Cell line,z_score_sum,min_max_score_sum,degree_centrality_1,degree_centrality_2,clustering_1,clustering_2,pagerank_1,pagerank_2,closeness_1,closeness_2,betweenness_1,betweenness_2
0,(+)-jq1,camptothecin,U-HO1,-0.026083,0.009509,0.045662,0.215373,0.864407,0.587700,0.000525,0.001798,0.498479,0.555321,0.000059,0.000485
1,(+)-jq1,zolinza,DIPG25,-0.138251,0.013349,0.045662,0.245053,0.864407,0.513438,0.000525,0.002088,0.498479,0.566855,0.000059,0.001591
2,117048-62-1,chloroquine,TC-32,0.425315,0.009691,0.009893,0.273212,0.782051,0.477412,0.000216,0.002280,0.440782,0.575573,0.000007,0.001300
3,2-methoxyestradiol,camptothecin,TC-71,0.422009,0.010783,0.094368,0.215373,0.801600,0.587700,0.000870,0.001798,0.518813,0.555321,0.000135,0.000485
4,200484-11-3,cycloheximide,U-HO1,4.787841,0.073546,0.335616,0.391172,0.383076,0.324732,0.002839,0.003343,0.598448,0.619094,0.002718,0.003470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5643,chembl3348887,thalidomide,OVCAR3,0.104317,0.209374,0.216895,0.257230,0.589795,0.535933,0.001772,0.002071,0.557685,0.571555,0.000481,0.000560
6219,artesunate,verapamil,T-47D,0.133562,0.215059,0.073820,0.298326,0.933634,0.403087,0.000690,0.002596,0.513525,0.584562,0.000021,0.002849
6226,selumetinib,lestaurtinib,IGROV1,0.891263,0.362373,0.090563,0.253425,0.896026,0.375285,0.000820,0.002812,0.518402,0.567838,0.000020,0.006876
6234,hydroxyurea,clopidogrel,COLO 205,0.010436,0.191121,0.238204,0.214612,0.512042,0.524040,0.002066,0.001854,0.563927,0.555793,0.001252,0.001332


In [203]:
# Take an independent copy of the sampled test rows.
df2 = test_data.copy()



Unnamed: 0,Drug1,Drug2,Cell line,z_score_sum,min_max_score_sum
0,exemestane,tranexamic acid,UACC62,-0.263524,0.137857
1,cyanein,2-fluoro ara-a,HCT-15,-0.031262,0.183014
2,fingolimod,tacrolimus,SF-268,0.222219,0.232296
3,artemether,tacrine,MDA-MB-231,-0.047768,0.179805
4,crizotinib,orlistat,SK-MEL-5,0.037784,0.196438
5,vincristine,lestaurtinib,SF-268,1.450062,0.471015
6,topiramate,sulfadoxine,LNCAP,-0.060105,0.177406
7,testosterone,nutlin-3,NCIH1650,-0.21397,0.147492
8,ethinyl estradiol,sulfadoxine,A549,-0.223907,0.14556
9,thiotepa,panobinostat,CAKI-1,-0.030577,0.183147


In [198]:
# Display the working copy.
# NOTE(review): the saved output beneath lists chemical_1 / chemical_2 /
# chemical_2.1 columns, which looks like an earlier intermediate frame rather
# than a copy of test_data -- re-run the notebook top to bottom to confirm.
df2

Unnamed: 0,Drug1,Drug2,Cell line,chemical_1,chemical_2,chemical_2.1,z_score_sum
0,exemestane,tranexamic acid,UACC62,CIDs00060198,CIDs00005526,CIDs00005526,-0.263524
1,cyanein,2-fluoro ara-a,HCT-15,CIDs05287620,CIDs00657237,CIDs00657237,-0.031262
2,fingolimod,tacrolimus,SF-268,CIDs00107969,CIDs00445643,CIDs00445643,0.222219
3,artemether,tacrine,MDA-MB-231,CIDs00068911,CIDs00001935,CIDs00001935,-0.047768
4,crizotinib,orlistat,SK-MEL-5,CIDs11626560,CIDs03034010,CIDs03034010,0.037784
5,vincristine,lestaurtinib,SF-268,CIDs60138149,CIDs00126565,CIDs00126565,1.450062
6,topiramate,sulfadoxine,LNCAP,CIDs05284627,CIDs00017134,CIDs00017134,-0.060105
7,testosterone,nutlin-3,NCIH1650,CIDs00006013,CIDs00216345,CIDs00216345,-0.21397
8,ethinyl estradiol,sulfadoxine,A549,CIDs00005991,CIDs00017134,CIDs00017134,-0.223907
9,thiotepa,panobinostat,CAKI-1,CIDs00005453,CIDs06918837,CIDs06918837,-0.030577
