# DrugComb Analysis

Analyzing drug combinations' efficacy on cancer cell treatment.


In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import os
import networkx as nx
import chardet
import matplotlib.pyplot as plt
from scipy.stats import zscore
import dask.dataframe as dd


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## DATA PREPROCESSING

Read the data from the files and show the first 5 rows of the data.


In [2]:
path_combinations = "./data/drugcombs_scored.csv"
path_drug_info = "./data/drug_chemical_info.csv"

# detect encoding of a file - used first time to determine encoding of the files
# with open(path_combinations, 'rb') as f:
#    encoding_1 = chardet.detect(f.read())
# print(encoding_1)
# with open(path_drug_info, 'rb') as f:
#    encoding_2 = chardet.detect(f.read())
# print(encoding_2)

# load main data
df_combinations = pd.read_csv(path_combinations)

# load additional drug information data
df_drug_info = pd.read_csv(path_drug_info, encoding="ISO-8859-1")

metric_columns = ["ZIP", "Bliss", "Loewe", "HSA"]

In [3]:
df_combinations["Drug1"] = df_combinations["Drug1"].str.lower()
df_combinations["Drug2"] = df_combinations["Drug2"].str.lower()

df_drug_info["drugName"] = df_drug_info["drugName"].str.lower()
df_drug_info["drugNameOfficial"] = df_drug_info["drugNameOfficial"].str.lower()

In [4]:
# show the first 5 rows of the data
print(df_combinations.head())
print(df_drug_info.head())

# show dimensions of the data
print(df_combinations.shape)

# remove ID column, not needed
df_combinations.drop(columns=["ID"], inplace=True)

   ID Drug1    Drug2 Cell line    ZIP  Bliss  Loewe    HSA
0   1  5-fu  abt-888     A2058   1.72   6.26  -2.75   5.54
1   2  5-fu  abt-888     A2058   5.88  12.33   3.33  11.61
2   3  5-fu  abt-888     A2058   3.59  11.66   2.65  10.94
3   4  5-fu  abt-888     A2058  -0.85   5.15  -3.86   4.43
4   5  5-fu  azd1775     A2058  12.29  15.77  10.40  18.66
       drugName          cIds drugNameOfficial  molecularWeight  \
0  bendamustine  CIDs00065628     bendamustine        358.26284   
1    lonidamine  CIDs00039562       lonidamine        321.15810   
2  lenalidomide  CIDs00216326     lenalidomide        259.26062   
3    cladribine  CIDs00020279       cladribine        285.68698   
4   pentostatin  CIDs00439693      pentostatin        268.26914   

                                      smilesString  
0       CN1C2=C(C=C(C=C2)N(CCCl)CCCl)N=C1CCCC(=O)O  
1  C1=CC=C2C(=C1)C(=NN2CC3=C(C=C(C=C3)Cl)Cl)C(=O)O  
2            C1CC(=O)NC(=O)C1N2CC3=C(C2=O)C=CC=C3N  
3             C1C(C(OC1N2C=NC3=

### Handle missing values in the data.


In [5]:
# are there any missing values?
print(df_combinations.isnull().sum())
print(df_drug_info.isnull().sum())

# remove rows with missing values
df_combinations = df_combinations.dropna()
# this is still useful for cids
df_drug_info = df_drug_info.dropna(subset=["cIds"])


# how many rows are left?
print(df_combinations.shape)

Drug1        2
Drug2        8
Cell line    0
ZIP          0
Bliss        0
Loewe        1
HSA          0
dtype: int64
drugName              0
cIds                  0
drugNameOfficial    454
molecularWeight       0
smilesString        454
dtype: int64
(498854, 7)


In [6]:
def remove_outliers_iqr(df, column_name):
    Q1 = df[column_name].quantile(0.25)
    Q3 = df[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)]


to_filter = ["ZIP", "Bliss", "Loewe", "HSA"]
for col in to_filter:
    df_combinations = remove_outliers_iqr(df_combinations, col)

df_combinations.to_csv("filtered_data/df_combinations_1(outliers).csv", index=False)

In [7]:
df_combinations = df_combinations.groupby(
    ["Drug1", "Drug2", "Cell line"], as_index=False
).agg({"ZIP": "mean", "Bliss": "mean", "Loewe": "mean", "HSA": "mean"})

df_combinations.to_csv("filtered_data/df_combinations_2(avg).csv", index=False)

In [8]:
def voting_classification(row):
    positive_counts = (row[metric_columns] > 0).sum()
    negative_counts = (row[metric_columns] <= 0).sum()

    if positive_counts > negative_counts:
        return "synergy"
    else:
        return "antagonism"


# SLOW: Use if needed
df_combinations["classification"] = df_combinations.apply(voting_classification, axis=1)
df_combinations.to_csv(
    "filtered_data/df_combinations_3(classification).csv", index=False
)

ORIGINAL PAPER DOES NOT USE Z-SCORES, BUT WE COULD TRY THEM LATER IF NEEDED - NOW COMMENTED OUT
Calculate z-scores for the drug combinations' efficacy scores and replace the original scores with the z-scores.


In [9]:
"""
# calculate z-scores for the drug combinations' efficacy scores
df_combinations['ZIP_zscore'] = zscore(df_combinations['ZIP'])
df_combinations['BLISS_zscore'] = zscore(df_combinations['Bliss'])
df_combinations['HSA_zscore'] = zscore(df_combinations['HSA'])
df_combinations['Loewe_zscore'] = zscore(df_combinations['Loewe'])

# drop the original scores
df_combinations = df_combinations.drop(columns=['ZIP', 'Bliss', 'HSA', 'Loewe'])
"""

"\n# calculate z-scores for the drug combinations' efficacy scores\ndf_combinations['ZIP_zscore'] = zscore(df_combinations['ZIP'])\ndf_combinations['BLISS_zscore'] = zscore(df_combinations['Bliss'])\ndf_combinations['HSA_zscore'] = zscore(df_combinations['HSA'])\ndf_combinations['Loewe_zscore'] = zscore(df_combinations['Loewe'])\n\n# drop the original scores\ndf_combinations = df_combinations.drop(columns=['ZIP', 'Bliss', 'HSA', 'Loewe'])\n"

Similarly to the original article decide the threshold for synergy and antagonism on each metric.
Upper 25% of the scores are considered as synergy, lower 25% as antagonism, and the rest as no effect / not sure.

update: used IQR for more robust method to remove outliers. More suitable for skewed distributions.


Standardize case so we can match with drug info


In [10]:
print(df_combinations.head())

df_drug_info.head()

     Drug1       Drug2     Cell line      ZIP    Bliss  Loewe    HSA  \
0  (+)-jq1     (+)-jq1            RD  11.6800  11.3200  1.280  2.340   
1  (+)-jq1     (+)-jq1       SMS-CTR   4.5600   2.0600 -3.030 -3.200   
2  (+)-jq1     (+)-jq1  SU-DIPG-XIII   9.2800  11.2300  1.530  1.970   
3  (+)-jq1     (+)-jq1          TMD8   1.5175   1.8475 -1.590  1.885   
4  (+)-jq1  10356-76-0         U-HO1   3.5900   1.9600 -4.015  2.770   

  classification  
0        synergy  
1     antagonism  
2        synergy  
3        synergy  
4        synergy  


Unnamed: 0,drugName,cIds,drugNameOfficial,molecularWeight,smilesString
0,bendamustine,CIDs00065628,bendamustine,358.26284,CN1C2=C(C=C(C=C2)N(CCCl)CCCl)N=C1CCCC(=O)O
1,lonidamine,CIDs00039562,lonidamine,321.1581,C1=CC=C2C(=C1)C(=NN2CC3=C(C=C(C=C3)Cl)Cl)C(=O)O
2,lenalidomide,CIDs00216326,lenalidomide,259.26062,C1CC(=O)NC(=O)C1N2CC3=C(C2=O)C=CC=C3N
3,cladribine,CIDs00020279,cladribine,285.68698,C1C(C(OC1N2C=NC3=C2N=C(N=C3N)Cl)CO)O
4,pentostatin,CIDs00439693,pentostatin,268.26914,C1C(C(OC1N2C=NC3=C2NC=NCC3O)CO)O


### Add additional information about the drugs to the main data.


In [11]:
unique_drugs = pd.DataFrame(
    list(set(df_combinations["Drug1"]).union(set(df_combinations["Drug2"]))),
    columns=["Drug"],
)

merged_df = unique_drugs.merge(
    df_drug_info[["drugName", "cIds"]], left_on="Drug", right_on="drugName", how="left"
)[["Drug", "cIds"]]

unmatched_drugs = merged_df[merged_df["cIds"].isna()][["Drug"]]

second_merge = unmatched_drugs.merge(
    df_drug_info[["drugNameOfficial", "cIds"]],
    left_on="Drug",
    right_on="drugNameOfficial",
    how="left",
    suffixes=("", "_official"),
)[["Drug", "cIds"]]

merged_df.update(second_merge)

print(merged_df["cIds"].isna().sum())

1830


Step3: fuzzy matching


In [12]:
from fuzzywuzzy import process

matched_cids = merged_df[merged_df["cIds"].notna()]["cIds"]

df_remaining_info = df_drug_info[~df_drug_info["cIds"].isin(matched_cids)]

unmatched_drugs = merged_df[merged_df["cIds"].isna()]

drug_names = pd.concat(
    [df_remaining_info["drugName"], df_remaining_info["drugNameOfficial"]]
).unique()
drug_names = drug_names[drug_names != "none"]


def get_best_match(drug, lb=89):
    best_match, score = process.extractOne(drug, drug_names)
    if score >= lb:
        return best_match
    else:
        return None


matches = unmatched_drugs["Drug"].apply(get_best_match)

In [13]:
matches.dropna(inplace=True)

matches_df = matches.to_frame("match")

matches_df["Drug"] = merged_df.iloc[matches_df.index]["Drug"]

final1 = matches_df.merge(
    df_remaining_info[["drugNameOfficial", "cIds"]],
    left_on="match",
    right_on="drugNameOfficial",
    how="left",
)[["match", "Drug", "cIds"]]
final1.update(
    matches_df.merge(
        df_remaining_info[["drugName", "cIds"]],
        left_on="match",
        right_on="drugName",
        how="left",
    )[["match", "Drug", "cIds"]]
)

matches_df["cIds"] = final1.set_index(matches_df.index)["cIds"]

merged_df.update(matches_df[["Drug", "cIds"]])

merged_df.dropna(inplace=True)

In [14]:
merged_df.to_csv("filtered_data/df_drug_cid.csv", index=False)

### REFER TO data_processing.ipynb FROM HERE ON OUT


## GRAPH CREATION AND NETWORK ANALYSIS

DONE: Create a subgraph for every cell line and calculate network statistics for each subgraph.


In [31]:
# print(df_combinations.head())
# are there rows with same Drug1, Drug2 but different cell lines?
grouped = (
    df_combinations.groupby(["Drug1", "Drug2"])["Cell line"]
    .nunique()
    .reset_index(name="count")
)
print(len(grouped[grouped["count"] > 1]))

10393


In [32]:
cell_lines = df_combinations["Cell line"].unique()

In [33]:
# go over all cell lines
cell_lines = df_combinations["Cell line"].unique()

for index, cell_line in enumerate(cell_lines):
    # create a subgraph for the cell line
    G = nx.Graph()
    df_cell_line = df_combinations[df_combinations["Cell line"] == cell_line]
    for i, row in df_cell_line.iterrows():
        G.add_edge(
            row["Drug1"],
            row["Drug2"],
            cell_line=row["Cell line"],
            ZIP=row["ZIP"],
            Bliss=row["Bliss"],
            Loewe=row["Loewe"],
            HSA=row["HSA"],
        )

    # calculate network statistics
    degree_centrality = nx.degree_centrality(G)
    closeness_centrality = nx.closeness_centrality(G)
    betweenness_centrality = nx.betweenness_centrality(G)
    eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)
    clustering_coefficient = nx.clustering(G)
    pagerank = nx.pagerank(G)
    hubs, authority = nx.hits(G)
    avg_neighbor_degree = nx.average_neighbor_degree(G)
    # edge_betweenness = nx.edge_betweenness_centrality(G)

    # visualize the subgraph
    # nx.draw(G, with_labels=True)
    # plt.title(cell_line)
    # plt.show()

    df_combinations_subset = df_combinations[
        df_combinations["Cell line"] == cell_line
    ].copy()

    # add network features to the filtered dataframe
    df_combinations_subset["degree_centrality_Drug1"] = df_combinations_subset[
        "Drug1"
    ].map(degree_centrality)
    df_combinations_subset["degree_centrality_Drug2"] = df_combinations_subset[
        "Drug2"
    ].map(degree_centrality)
    df_combinations_subset["closeness_centrality_Drug1"] = df_combinations_subset[
        "Drug1"
    ].map(closeness_centrality)
    df_combinations_subset["closeness_centrality_Drug2"] = df_combinations_subset[
        "Drug2"
    ].map(closeness_centrality)
    df_combinations_subset["betweenness_centrality_Drug1"] = df_combinations_subset[
        "Drug1"
    ].map(betweenness_centrality)
    df_combinations_subset["betweenness_centrality_Drug2"] = df_combinations_subset[
        "Drug2"
    ].map(betweenness_centrality)
    df_combinations_subset["eigenvector_centrality_Drug1"] = df_combinations_subset[
        "Drug1"
    ].map(eigenvector_centrality)
    df_combinations_subset["eigenvector_centrality_Drug2"] = df_combinations_subset[
        "Drug2"
    ].map(eigenvector_centrality)
    df_combinations_subset["clustering_coefficient_Drug1"] = df_combinations_subset[
        "Drug1"
    ].map(clustering_coefficient)
    df_combinations_subset["clustering_coefficient_Drug2"] = df_combinations_subset[
        "Drug2"
    ].map(clustering_coefficient)
    df_combinations_subset["pagerank_Drug1"] = df_combinations_subset["Drug1"].map(
        pagerank
    )
    df_combinations_subset["pagerank_Drug2"] = df_combinations_subset["Drug2"].map(
        pagerank
    )
    df_combinations_subset["hubs_Drug1"] = df_combinations_subset["Drug1"].map(hubs)
    df_combinations_subset["hubs_Drug2"] = df_combinations_subset["Drug2"].map(hubs)
    df_combinations_subset["authority_Drug1"] = df_combinations_subset["Drug1"].map(
        authority
    )
    df_combinations_subset["authority_Drug2"] = df_combinations_subset["Drug2"].map(
        authority
    )
    df_combinations_subset["avg_neighbor_degree_Drug1"] = df_combinations_subset[
        "Drug1"
    ].map(avg_neighbor_degree)
    df_combinations_subset["avg_neighbor_degree_Drug2"] = df_combinations_subset[
        "Drug2"
    ].map(avg_neighbor_degree)

    # remove this rows from the main dataframe
    df_combinations = df_combinations[df_combinations["Cell line"] != cell_line]
    # add the new rows to the main dataframe
    df_combinations = pd.concat(
        [df_combinations, df_combinations_subset], ignore_index=True
    )

    print(f"Finished computing for {index+1}/{len(cell_lines)}")

Finished computing for 1/122
Finished computing for 2/122
Finished computing for 3/122
Finished computing for 4/122
Finished computing for 5/122
Finished computing for 6/122


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x79e19cd8f470>>
Traceback (most recent call last):
  File "/home/nermin/.venv/arpi/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


Following cells are not used, because we split data into subgraphs by cell lines.


In [None]:
"""
# initialize graph
G = nx.Graph()

# populate graph with nodes and edges
for i, row in df_combinations.iterrows():
    G.add_edge(row['Drug1'], row['Drug2'], cell_line=row['Cell line'], ZIP=row['ZIP'], Bliss=row['Bliss'], Loewe=row['Loewe'], HSA=row['HSA'])
    #G.nodes[row['Drug1']]['molecular_weight'] = row['molecularWeight_Drug1']
    #G.nodes[row['Drug2']]['molecular_weight'] = row['molecularWeight_Drug2']

# visualize the graph
nx.draw(G, with_labels=True)
plt.show()
"""

Calculate some basic network statistics.


In [None]:
"""
# number of nodes and edges
print('Number of nodes:', G.number_of_nodes())
print('Number of edges:', G.number_of_edges())

# average degree
degrees = [val for (node, val) in G.degree()]
print('Average degree:', np.mean(degrees))

# average clustering coefficient
print('Average clustering coefficient:', nx.average_clustering(G))

# average shortest path length
print('Average shortest path length:', nx.average_shortest_path_length(G))

# density
print('Density:', nx.density(G))

# plot degree distribution, log-log scale
# not histogram
degrees = [degree for node, degree in G.degree()]
degree_count = np.bincount(degrees)
degree = np.arange(len(degree_count))

plt.figure()
plt.scatter(degree, degree_count, color='blue', alpha=0.7)
plt.yscale('log')
plt.xscale('log')
plt.xlabel('Degree')
plt.ylabel('Count')
plt.title('Degree distribution')
plt.show()
"""

Now calculate all network statistics, which will serve as features for the machine learning model.


In [None]:
"""
# calculate network features
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality(G)
clustering_coefficient = nx.clustering(G)
pagerank = nx.pagerank(G)
hubs, authority = nx.hits(G)
avg_neighbor_degree = nx.average_neighbor_degree(G)

edge_betweenness = nx.edge_betweenness_centrality(G)
"""

In [None]:
"""
# add network features to the dataframe
df_combinations['degree_centrality_Drug1'] = df_combinations['Drug1'].map(degree_centrality)
df_combinations['degree_centrality_Drug2'] = df_combinations['Drug2'].map(degree_centrality)
df_combinations['closeness_centrality_Drug1'] = df_combinations['Drug1'].map(closeness_centrality)
df_combinations['closeness_centrality_Drug2'] = df_combinations['Drug2'].map(closeness_centrality)
df_combinations['betweenness_centrality_Drug1'] = df_combinations['Drug1'].map(betweenness_centrality)
df_combinations['betweenness_centrality_Drug2'] = df_combinations['Drug2'].map(betweenness_centrality)
df_combinations['eigenvector_centrality_Drug1'] = df_combinations['Drug1'].map(eigenvector_centrality)
df_combinations['eigenvector_centrality_Drug2'] = df_combinations['Drug2'].map(eigenvector_centrality)
df_combinations['clustering_coefficient_Drug1'] = df_combinations['Drug1'].map(clustering_coefficient)
df_combinations['clustering_coefficient_Drug2'] = df_combinations['Drug2'].map(clustering_coefficient)
df_combinations['pagerank_Drug1'] = df_combinations['Drug1'].map(pagerank)
df_combinations['pagerank_Drug2'] = df_combinations['Drug2'].map(pagerank)
df_combinations['hubs_Drug1'] = df_combinations['Drug1'].map(hubs)
df_combinations['hubs_Drug2'] = df_combinations['Drug2'].map(hubs)
df_combinations['authority_Drug1'] = df_combinations['Drug1'].map(authority)
df_combinations['authority_Drug2'] = df_combinations['Drug2'].map(authority)
df_combinations['avg_neighbor_degree_Drug1'] = df_combinations['Drug1'].map(avg_neighbor_degree)
df_combinations['avg_neighbor_degree_Drug2'] = df_combinations['Drug2'].map(avg_neighbor_degree)

"""

"""
# separate df for edge features
df_edge_features = pd.DataFrame({
    'Drug1': [edge[0] for edge in edge_betweenness.keys()],
    'Drug2': [edge[1] for edge in edge_betweenness.keys()],
    'edge_betweenness': list(edge_betweenness.values())
})

# merge edge features with the main dataframe
df_combinations = df_combinations.merge(df_edge_features, on=['Drug1', 'Drug2'], how='left')
"""

Again check for missing values in the data.


In [None]:
# are there any missing values?
print(df_combinations.isnull().sum())

In [None]:
# show the first 5 rows of the data
print(df_combinations.head())

# show dimensions of the data
print(df_combinations.shape)

## MACHINE LEARNING MODEL

First prepare the data for the machine learning model.


In [None]:
# 4 target metrics for classification
y_zip = df_combinations["ZIP"]
y_bliss = df_combinations["Bliss"]
y_hsa = df_combinations["HSA"]
y_loewe = df_combinations["Loewe"]

# all other features are X
# one-hot encode Drug1, Drug2, and Cell line TO-DO
# df_combinations = pd.get_dummies(df_combinations, columns=['Drug1', 'Drug2'])
# df_combinations = pd.get_dummies(df_combinations, columns=['Cell line'])


X = df_combinations.drop(
    columns=["ID", "ZIP", "Bliss", "HSA", "Loewe", "Cell line", "Drug1", "Drug2"]
)

Split the data into training and testing sets.


In [None]:
X_train, X_test, y_zip_train, y_zip_test = train_test_split(
    X, y_zip, test_size=0.2, random_state=42
)
_, _, y_bliss_train, y_bliss_test = train_test_split(
    X, y_bliss, test_size=0.2, random_state=42
)
_, _, y_hsa_train, y_hsa_test = train_test_split(
    X, y_hsa, test_size=0.2, random_state=42
)
_, _, y_loewe_train, y_loewe_test = train_test_split(
    X, y_loewe, test_size=0.2, random_state=42
)

# show dimensions of the data
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_zip_train: {y_zip_train.shape}, y_zip_test: {y_zip_test.shape}")
print(f"y_bliss_train: {y_bliss_train.shape}, y_bliss_test: {y_bliss_test.shape}")
print(f"y_hsa_train: {y_hsa_train.shape}, y_hsa_test: {y_hsa_test.shape}")
print(f"y_loewe_train: {y_loewe_train.shape}, y_loewe_test: {y_loewe_test.shape}")

Normalize the features.


In [None]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Now train different machine learning models to predict the efficacy of drug combinations


In [None]:
# import necessary libraries for models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_squared_error

Linear regression model


In [None]:
# linear regression model
model_zip = LinearRegression()
model_zip.fit(X_train, y_zip_train)
y_zip_pred = model_zip.predict(X_test)
mse_zip = mean_squared_error(y_zip_test, y_zip_pred)
print("MSE ZIP:", mse_zip)

model_bliss = LinearRegression()
model_bliss.fit(X_train, y_bliss_train)
y_bliss_pred = model_bliss.predict(X_test)
mse_bliss = mean_squared_error(y_bliss_test, y_bliss_pred)
print("MSE Bliss:", mse_bliss)

model_hsa = LinearRegression()
model_hsa.fit(X_train, y_hsa_train)
y_hsa_pred = model_hsa.predict(X_test)
mse_hsa = mean_squared_error(y_hsa_test, y_hsa_pred)
print("MSE HSA:", mse_hsa)

model_loewe = LinearRegression()
model_loewe.fit(X_train, y_loewe_train)
y_loewe_pred = model_loewe.predict(X_test)
mse_loewe = mean_squared_error(y_loewe_test, y_loewe_pred)
print("MSE Loewe:", mse_loewe)

Random forest model


In [None]:
# random forest model
model_zip = RandomForestRegressor()
model_zip.fit(X_train, y_zip_train)
y_zip_pred = model_zip.predict(X_test)
mse_zip = mean_squared_error(y_zip_test, y_zip_pred)
print("MSE ZIP:", mse_zip)

model_bliss = RandomForestRegressor()
model_bliss.fit(X_train, y_bliss_train)
y_bliss_pred = model_bliss.predict(X_test)
mse_bliss = mean_squared_error(y_bliss_test, y_bliss_pred)
print("MSE Bliss:", mse_bliss)

model_hsa = RandomForestRegressor()
model_hsa.fit(X_train, y_hsa_train)
y_hsa_pred = model_hsa.predict(X_test)
mse_hsa = mean_squared_error(y_hsa_test, y_hsa_pred)
print("MSE HSA:", mse_hsa)

model_loewe = RandomForestRegressor()
model_loewe.fit(X_train, y_loewe_train)
y_loewe_pred = model_loewe.predict(X_test)
mse_loewe = mean_squared_error(y_loewe_test, y_loewe_pred)
print("MSE Loewe:", mse_loewe)

SVM


In [None]:
# support vector machine
model_zip = SVR()
model_zip.fit(X_train, y_zip_train)
y_zip_pred = model_zip.predict(X_test)
mse_zip = mean_squared_error(y_zip_test, y_zip_pred)
print("MSE ZIP:", mse_zip)

model_bliss = SVR()
model_bliss.fit(X_train, y_bliss_train)
y_bliss_pred = model_bliss.predict(X_test)
mse_bliss = mean_squared_error(y_bliss_test, y_bliss_pred)
print("MSE Bliss:", mse_bliss)

model_hsa = SVR()
model_hsa.fit(X_train, y_hsa_train)
y_hsa_pred = model_hsa.predict(X_test)
mse_hsa = mean_squared_error(y_hsa_test, y_hsa_pred)
print("MSE HSA:", mse_hsa)

model_loewe = SVR()
model_loewe.fit(X_train, y_loewe_train)
y_loewe_pred = model_loewe.predict(X_test)
mse_loewe = mean_squared_error(y_loewe_test, y_loewe_pred)
print("MSE Loewe:", mse_loewe)

NN model


In [None]:
# neural network model
model_zip = MLPRegressor()
model_zip.fit(X_train, y_zip_train)
y_zip_pred = model_zip.predict(X_test)
mse_zip = mean_squared_error(y_zip_test, y_zip_pred)
print("MSE ZIP:", mse_zip)

model_bliss = MLPRegressor()
model_bliss.fit(X_train, y_bliss_train)
y_bliss_pred = model_bliss.predict(X_test)
mse_bliss = mean_squared_error(y_bliss_test, y_bliss_pred)
print("MSE Bliss:", mse_bliss)

model_hsa = MLPRegressor()
model_hsa.fit(X_train, y_hsa_train)
y_hsa_pred = model_hsa.predict(X_test)
mse_hsa = mean_squared_error(y_hsa_test, y_hsa_pred)
print("MSE HSA:", mse_hsa)

model_loewe = MLPRegressor()
model_loewe.fit(X_train, y_loewe_train)
y_loewe_pred = model_loewe.predict(X_test)
mse_loewe = mean_squared_error(y_loewe_test, y_loewe_pred)
print("MSE Loewe:", mse_loewe)