In [1]:
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd

In [2]:
# --- Input tables -----------------------------------------------------------
# Drug-combination records; the "Drug1" / "Drug2" columns are used below.
df_combinations = pd.read_csv(
    "filtered_data/df_combinations_3(classification).csv"
)

# Drug lookup table; the "Drug" and "cIds" columns are used for filtering below.
df_drug_cid = pd.read_csv("filtered_data/df_drug_cid.csv")

# Tab-separated drug-protein links, keyed by the "chemical" column.
df_drug_protein = pd.read_csv("data/drug_protein_links.tsv", delimiter="\t")

In [3]:
# Keep only the combinations in which BOTH drugs are listed in df_drug_cid.
cid_comb_filter = (df_combinations["Drug1"].isin(df_drug_cid["Drug"])) & (
    df_combinations["Drug2"].isin(df_drug_cid["Drug"])
)
# .copy() detaches the result from the unfiltered frame: a later cell adds a
# "combination_id" column to df_combinations, and doing that on a boolean-mask
# slice triggers pandas' SettingWithCopyWarning.
df_combinations = df_combinations[cid_comb_filter].copy()

In [4]:
# Keep only the drug-protein links whose chemical id is listed in df_drug_cid.
cid_protein_filter = df_drug_protein["chemical"].isin(df_drug_cid["cIds"])
# .copy() detaches the result from the unfiltered frame: the following cells
# add columns to df_drug_protein, and doing that on a boolean-mask slice
# triggers pandas' SettingWithCopyWarning.
df_drug_protein = df_drug_protein[cid_protein_filter].copy()

### Processing drug-protein interaction data

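A combined score that is backed by multiple sources should be given a boost, since we have more "proof".
Therefore we'll normalize the score by dividing it by the number of sources, and then apply a **logarithmic boost**: `adjusted_score = (combined_score / source_count) * (1 + log10(source_count))`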


In [5]:
import joblib 
def calculate_adjusted_score(row):
    """Return the per-source average score with a logarithmic multi-source boost.

    The result is ``(combined_score / source_count) * (1 + log10(source_count))``.

    Parameters
    ----------
    row : mapping-like (e.g. one row of a DataFrame passed by ``apply(axis=1)``)
        Must provide the keys ``"combined_score"`` and ``"source_count"``.

    Returns
    -------
    float
        The adjusted score. When ``source_count`` is below 1 the row is treated
        as having a single source, so ``combined_score`` is returned unchanged
        (no averaging, no boost).
    """
    source_count = row["source_count"]
    if source_count < 1:
        # Zero sources would divide by zero and take log10(0) = -inf, producing
        # an exception or inf/NaN scores that poison the scaling steps that
        # follow. Fall back to "one source", for which the boost factor is 1.
        source_count = 1
    average_score = row["combined_score"] / source_count
    return average_score * (1 + np.log10(source_count))


# Per-source score columns; a non-zero value counts as "this source contributed".
# NOTE(review): astype(bool) also maps NaN to True - confirm these columns have
# no missing values.
sources = ["experimental", "prediction", "database", "textmining"]

# Derived columns are built with .assign(), which returns a new frame, instead
# of setting columns on df_drug_protein directly: the frame is a boolean-mask
# slice at this point and direct assignment raises SettingWithCopyWarning.
df_drug_protein = df_drug_protein.assign(
    source_count=lambda frame: frame[sources].astype(bool).sum(axis=1)
)
df_drug_protein = df_drug_protein.assign(
    adjusted_score=lambda frame: frame.apply(calculate_adjusted_score, axis=1)
)

# Keep only the identifier columns and the final score. .copy() makes the
# subset an independent frame so later cells can add columns to it safely.
df_drug_protein = df_drug_protein[["chemical", "protein", "adjusted_score"]].copy()

df_drug_protein.to_csv(
    "filtered_data/df_drug_protein_1(adjusted_score).csv", index=False
)

In [6]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Fit both scalers on the single "adjusted_score" column. fit_transform returns
# an (n_rows, 1) array, which is flattened before being stored as a column.
min_max_scaler = MinMaxScaler()
min_max_values = min_max_scaler.fit_transform(df_drug_protein[["adjusted_score"]])

standard_scaler = StandardScaler()
z_values = standard_scaler.fit_transform(df_drug_protein[["adjusted_score"]])

# .assign() returns a new frame, so no column is ever set on a slice of an
# earlier frame (which would raise SettingWithCopyWarning).
df_drug_protein = df_drug_protein.assign(
    min_max_score=min_max_values.ravel(),
    z_score=z_values.ravel(),
)

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before persisting the fitted scalers.
Path("scalers").mkdir(parents=True, exist_ok=True)
joblib.dump(min_max_scaler, 'scalers/2_minmax_adjustedscore.save')
joblib.dump(standard_scaler, 'scalers/2_zscore_adjustedscore.save')

df_drug_protein.to_csv("filtered_data/df_drug_protein_2(normalized).csv", index=False)

In [7]:
# Preview the first five rows to sanity-check the two normalized score columns.
df_drug_protein.head(n=5)

Unnamed: 0,chemical,protein,adjusted_score,min_max_score,z_score
7439087,CIDs91754554,9606.ENSP00000263640,217.0,0.133221,-0.287369
7444691,CIDs91617630,9606.ENSP00000250457,221.0,0.137684,-0.264418
7444692,CIDs91617630,9606.ENSP00000262306,260.0,0.18119,-0.040645
7444693,CIDs91617630,9606.ENSP00000272321,259.0,0.180074,-0.046383
7515421,CIDs90241673,9606.ENSP00000215829,168.0,0.07856,-0.568519


### More data for drug-drug combinations


In [8]:
def create_combination_id(row):
    """Build an order-independent identifier for a drug pair.

    Each of ``row["Drug1"]`` and ``row["Drug2"]`` is truncated to its first
    10 characters; the two abbreviations are sorted and joined with ``"_"``,
    so (A, B) and (B, A) yield the same id.

    NOTE(review): two different drug names that share their first 10
    characters produce the same abbreviation and therefore the same id.
    """
    first, second = sorted(row[column][:10] for column in ("Drug1", "Drug2"))
    return first + "_" + second


# Tag every row with its order-independent pair id. .assign() returns a new
# frame, so the column is never set on a filtered slice (which would raise
# SettingWithCopyWarning). Only the two columns create_combination_id reads
# are passed to the row-wise apply, so each row object is smaller.
df_combinations = df_combinations.assign(
    combination_id=lambda frame: frame[["Drug1", "Drug2"]].apply(
        create_combination_id, axis=1
    )
)

In [9]:
# Persist the filtered combinations (now including combination_id), without the index.
df_combinations.to_csv(
    path_or_buf="filtered_data/df_combinations_4(filtered).csv",
    index=False,
)

In [10]:
# Mean of the ZIP, Bliss, Loewe and HSA columns per combination_id.
# Named aggregation ties every output column name to its source column, so the
# result does not rely on a positional rename that breaks if the order changes.
df_avg_scores = (
    df_combinations.groupby("combination_id")
    .agg(
        avg_zip_score=("ZIP", "mean"),
        avg_bliss_score=("Bliss", "mean"),
        avg_loewe_score=("Loewe", "mean"),
        avg_hsa_score=("HSA", "mean"),
    )
    .reset_index()
)

# index=False matches the other exports in this notebook: after reset_index()
# the index is a plain 0..n-1 counter and would otherwise be written as an
# extra unnamed column.
df_avg_scores.to_csv("filtered_data/df_comb_avg_scores.csv", index=False)

### Graph construction
