### Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score,
    classification_report, confusion_matrix
)
import os

### Loading Data

In [3]:
drug_interaction=pd.read_csv("drug_id_5693.csv")

cell_feature=pd.read_csv("cell_feature.csv")

drug_feature=pd.read_csv("feature197_300.csv")

### Drug Interaction Dataset Overview

- This dataset contains records of drug pairs tested on specific cell lines.
- Columns:
  - `g_id1`: Identifier for the first drug in the pair.
  - `g_id2`: Identifier for the second drug in the pair.
  - `cell`: The cell line where the drug pair was tested (e.g., `MDAMB468`, `BT549`).
  - `label`: Synergy status of the drug pair on the given cell line.
    - `0` indicates **synergistic** interaction.
    - `1` indicates **non-synergistic** interaction.

In [5]:
drug_interaction.head(3)

Unnamed: 0,g_id1,g_id2,cell,label
0,192,115,MDAMB468,1
1,16,50,BT549,0
2,162,93,BT549,0


In [6]:
drug_interaction.shape  #The `drug_interaction` dataset contains 5693 rows ie 5693 different interaction dataset 


(5693, 4)

### Class Distribution of Synergy Status

In [8]:
drug_interaction.label.value_counts()

label
0    4349
1    1344
Name: count, dtype: int64

### Cell Line Feature Data

- This table contains molecular features associated with each **cell line**.
- The first column identifies the **cell line** (e.g., `MDAMB468`).
- Subsequent columns correspond to gene expression levels 

In [10]:
cell_feature.head(3)

Unnamed: 0,cell,ENSG00000116237,ENSG00000162413,ENSG00000171603,ENSG00000160049,ENSG00000065526,ENSG00000117118,ENSG00000053371,ENSG00000076864,ENSG00000070831,...,ENSG00000156299,ENSG00000142166,ENSG00000159228,ENSG00000159231,ENSG00000183527,ENSG00000182093,ENSG00000182240,ENSG00000157617,ENSG00000160208,ENSG00000141959
0,MDAMB468,32.57,7.69,19.51,11.42,12.93,82.54,19.83,0.07,1.25,...,10.26,15.01,100.27,23.4,7.32,13.72,2.9,5.94,35.27,92.34
1,BT549,40.52,17.08,59.1,32.71,15.08,131.08,26.65,0.0,4.8,...,3.71,6.37,90.65,26.82,2.1,11.53,0.03,1.53,27.28,37.67
2,BT549,40.52,17.08,59.1,32.71,15.08,131.08,26.65,0.0,4.8,...,3.71,6.37,90.65,26.82,2.1,11.53,0.03,1.53,27.28,37.67


### Drug Feature Dataset

- This dataset contains numerical features representing chemical or structural properties of drugs.
- Columns labeled generically (e.g., `Column1.1` to `Column1.300`) correspond to various extracted features or descriptors.
- Each row represents a single drug’s feature vector.

In [12]:
drug_feature.head(3)

Unnamed: 0,Column1.1,Column1.2,Column1.3,Column1.4,Column1.5,Column1.6,Column1.7,Column1.8,Column1.9,Column1.10,...,Column1.291,Column1.292,Column1.293,Column1.294,Column1.295,Column1.296,Column1.297,Column1.298,Column1.299,Column1.300
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,1,0,1,1,0,...,0,0,0,1,1,0,1,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Dataset Summary: Unique Drugs and Cell Lines

- **Total unique drugs:** The count of distinct drugs involved in all drug pairs, combining both drug identifiers (`g_id1` and `g_id2`).
- **Total unique cell lines:** The count of distinct cell lines used in experiments.

In [14]:
# Calculate total unique drugs across both drug columns
total_unique_drugs = len(np.unique(np.concatenate([drug_interaction['g_id1'].unique(), drug_interaction['g_id2'].unique()])))

# Calculate total unique cell lines
total_unique_cells = len(cell_feature['cell'].unique())

# Print the results
print(f"Total unique drugs in dataset: {total_unique_drugs}")
print(f"Total unique cell lines in dataset: {total_unique_cells}")


Total unique drugs in dataset: 197
Total unique cell lines in dataset: 12


### Scaling Cell Features Using Min-Max Normalization

In [16]:
cell_features = cell_feature.drop_duplicates(subset='cell').set_index('cell')
scaler = MinMaxScaler()

cell_feature_scaled = scaler.fit_transform(cell_features)


In [17]:
cell_features = pd.DataFrame(cell_feature_scaled, index=cell_features.index, columns=cell_features.columns)
cell_features

Unnamed: 0_level_0,ENSG00000116237,ENSG00000162413,ENSG00000171603,ENSG00000160049,ENSG00000065526,ENSG00000117118,ENSG00000053371,ENSG00000076864,ENSG00000070831,ENSG00000133216,...,ENSG00000156299,ENSG00000142166,ENSG00000159228,ENSG00000159231,ENSG00000183527,ENSG00000182093,ENSG00000182240,ENSG00000157617,ENSG00000160208,ENSG00000141959
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MDAMB468,0.234642,0.134128,0.166276,0.177675,0.545106,0.359715,0.144047,0.02439,0.075521,0.116279,...,1.0,1.0,0.695484,0.746379,0.561847,0.435328,0.810056,1.0,0.968367,0.426126
BT549,0.460751,0.495004,0.630838,1.0,0.75144,1.0,0.297547,0.0,1.0,0.0,...,0.360976,0.20442,0.627923,0.856453,0.107143,0.310541,0.00838,0.226316,0.696599,0.133679
ZR751,0.181172,0.019985,0.0,0.0,0.007678,0.312096,0.235877,0.0,0.861979,0.0,...,0.211707,0.259669,0.0,0.258449,0.0,0.431339,0.0,0.092982,0.0,0.126725
KPL1,0.114903,0.044581,0.189862,0.199691,0.802303,0.408785,0.669818,1.0,0.450521,0.0,...,0.705366,0.065378,0.373622,0.18571,0.270035,0.403419,0.273743,0.0,0.346599,0.763454
HS578T,1.0,0.369716,1.0,0.049054,0.854127,0.62683,1.0,0.10453,0.239583,0.0,...,0.129756,0.0,1.0,1.0,0.122822,0.969801,0.240223,0.666667,0.169388,1.0
HUH7,0.471274,0.330899,0.215912,0.419081,0.330134,0.237436,0.274814,0.0,0.028646,0.0,...,0.0,0.205341,0.281551,0.0,0.179443,0.444444,0.0,0.092982,0.629252,0.331871
MCF7,0.0,0.009224,0.102675,0.099652,0.245681,0.452843,0.55368,0.310105,0.369792,0.023256,...,0.405854,0.316759,0.329869,0.224976,0.722997,0.683191,0.0,0.073684,0.437415,0.897614
A549,0.123151,0.0,0.186107,0.152955,0.0,0.497296,0.081702,0.0,0.5,0.116279,...,0.00878,0.41989,0.899712,0.141294,0.526132,0.674644,0.114525,0.257895,0.59898,0.0
UO31,0.189135,0.368563,0.530861,0.550792,0.21977,0.462076,0.673644,0.0,0.020833,1.0,...,0.21561,0.226519,0.45874,0.153202,0.085366,1.0,0.726257,0.375439,0.444218,0.266396
HCC1187,0.403584,1.0,0.132246,0.060255,1.0,0.0,0.0,0.114983,0.0,0.255814,...,0.774634,0.186924,0.482267,0.752173,0.135889,0.0,0.187151,0.707018,0.540136,0.552798


### Preparing Cell Features for Integration with Drug Interaction Data


In [19]:
cell_feature_dict = cell_features.to_dict(orient='index')

In [20]:
cell_feature_dict.keys()

dict_keys(['MDAMB468', 'BT549', 'ZR751', 'KPL1', 'HS578T', 'HUH7', 'MCF7', 'A549', 'UO31', 'HCC1187', 'ACHN', 'X786O'])

### Appending Cell Features to Drug Interaction Dataset
- Merge or join cell line feature data with the drug interaction dataset.
- The merging key is the `cell` identifier common to both datasets.

In [22]:
drug_interaction['cell_features'] = drug_interaction['cell'].map(cell_feature_dict)

In [23]:
drug_interaction

Unnamed: 0,g_id1,g_id2,cell,label,cell_features
0,192,115,MDAMB468,1,"{'ENSG00000116237': 0.23464163822525597, 'ENSG..."
1,16,50,BT549,0,"{'ENSG00000116237': 0.4607508532423209, 'ENSG0..."
2,162,93,BT549,0,"{'ENSG00000116237': 0.4607508532423209, 'ENSG0..."
3,55,10,BT549,0,"{'ENSG00000116237': 0.4607508532423209, 'ENSG0..."
4,24,20,BT549,0,"{'ENSG00000116237': 0.4607508532423209, 'ENSG0..."
...,...,...,...,...,...
5688,194,45,BT549,0,"{'ENSG00000116237': 0.4607508532423209, 'ENSG0..."
5689,44,91,BT549,0,"{'ENSG00000116237': 0.4607508532423209, 'ENSG0..."
5690,33,86,BT549,0,"{'ENSG00000116237': 0.4607508532423209, 'ENSG0..."
5691,69,68,HS578T,0,"{'ENSG00000116237': 1.0, 'ENSG00000162413': 0...."


### Expanding and Setting Cell Features in Drug Interaction Dataset

- The `cell_features` column in the `drug_interaction` dataframe contains nested data (e.g., a list or dictionary).
- `apply(pd.Series)` is used to expand each element of `cell_features` into separate columns.
- The original `cell_features` column is dropped.

In [25]:
cell_features_expanded = drug_interaction['cell_features'].apply(pd.Series)

drug_interaction = pd.concat([drug_interaction.drop(columns=['cell_features']), cell_features_expanded], axis=1)
drug_interaction

Unnamed: 0,g_id1,g_id2,cell,label,ENSG00000116237,ENSG00000162413,ENSG00000171603,ENSG00000160049,ENSG00000065526,ENSG00000117118,...,ENSG00000156299,ENSG00000142166,ENSG00000159228,ENSG00000159231,ENSG00000183527,ENSG00000182093,ENSG00000182240,ENSG00000157617,ENSG00000160208,ENSG00000141959
0,192,115,MDAMB468,1,0.234642,0.134128,0.166276,0.177675,0.545106,0.359715,...,1.000000,1.00000,0.695484,0.746379,0.561847,0.435328,0.810056,1.000000,0.968367,0.426126
1,16,50,BT549,0,0.460751,0.495004,0.630838,1.000000,0.751440,1.000000,...,0.360976,0.20442,0.627923,0.856453,0.107143,0.310541,0.008380,0.226316,0.696599,0.133679
2,162,93,BT549,0,0.460751,0.495004,0.630838,1.000000,0.751440,1.000000,...,0.360976,0.20442,0.627923,0.856453,0.107143,0.310541,0.008380,0.226316,0.696599,0.133679
3,55,10,BT549,0,0.460751,0.495004,0.630838,1.000000,0.751440,1.000000,...,0.360976,0.20442,0.627923,0.856453,0.107143,0.310541,0.008380,0.226316,0.696599,0.133679
4,24,20,BT549,0,0.460751,0.495004,0.630838,1.000000,0.751440,1.000000,...,0.360976,0.20442,0.627923,0.856453,0.107143,0.310541,0.008380,0.226316,0.696599,0.133679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5688,194,45,BT549,0,0.460751,0.495004,0.630838,1.000000,0.751440,1.000000,...,0.360976,0.20442,0.627923,0.856453,0.107143,0.310541,0.008380,0.226316,0.696599,0.133679
5689,44,91,BT549,0,0.460751,0.495004,0.630838,1.000000,0.751440,1.000000,...,0.360976,0.20442,0.627923,0.856453,0.107143,0.310541,0.008380,0.226316,0.696599,0.133679
5690,33,86,BT549,0,0.460751,0.495004,0.630838,1.000000,0.751440,1.000000,...,0.360976,0.20442,0.627923,0.856453,0.107143,0.310541,0.008380,0.226316,0.696599,0.133679
5691,69,68,HS578T,0,1.000000,0.369716,1.000000,0.049054,0.854127,0.626830,...,0.129756,0.00000,1.000000,1.000000,0.122822,0.969801,0.240223,0.666667,0.169388,1.000000


In [26]:
drug_interaction.shape

(5693, 958)

### Mapping Drug Features to Drug Interaction Dataset and Replacing IDs with Corresponding Features

In [28]:
drug_feature_dict = drug_feature.to_dict(orient='index')

drug_interaction['g_id1_features'] = drug_interaction['g_id1'].map(drug_feature_dict)
drug_interaction['g_id2_features'] = drug_interaction['g_id2'].map(drug_feature_dict)


In [29]:
drug_interaction.head()

Unnamed: 0,g_id1,g_id2,cell,label,ENSG00000116237,ENSG00000162413,ENSG00000171603,ENSG00000160049,ENSG00000065526,ENSG00000117118,...,ENSG00000159228,ENSG00000159231,ENSG00000183527,ENSG00000182093,ENSG00000182240,ENSG00000157617,ENSG00000160208,ENSG00000141959,g_id1_features,g_id2_features
0,192,115,MDAMB468,1,0.234642,0.134128,0.166276,0.177675,0.545106,0.359715,...,0.695484,0.746379,0.561847,0.435328,0.810056,1.0,0.968367,0.426126,"{'Column1.1': 0, 'Column1.2': 0, 'Column1.3': ...","{'Column1.1': 1, 'Column1.2': 1, 'Column1.3': ..."
1,16,50,BT549,0,0.460751,0.495004,0.630838,1.0,0.75144,1.0,...,0.627923,0.856453,0.107143,0.310541,0.00838,0.226316,0.696599,0.133679,"{'Column1.1': 0, 'Column1.2': 0, 'Column1.3': ...","{'Column1.1': 0, 'Column1.2': 0, 'Column1.3': ..."
2,162,93,BT549,0,0.460751,0.495004,0.630838,1.0,0.75144,1.0,...,0.627923,0.856453,0.107143,0.310541,0.00838,0.226316,0.696599,0.133679,"{'Column1.1': 0, 'Column1.2': 0, 'Column1.3': ...","{'Column1.1': 0, 'Column1.2': 1, 'Column1.3': ..."
3,55,10,BT549,0,0.460751,0.495004,0.630838,1.0,0.75144,1.0,...,0.627923,0.856453,0.107143,0.310541,0.00838,0.226316,0.696599,0.133679,"{'Column1.1': 1, 'Column1.2': 0, 'Column1.3': ...","{'Column1.1': 0, 'Column1.2': 0, 'Column1.3': ..."
4,24,20,BT549,0,0.460751,0.495004,0.630838,1.0,0.75144,1.0,...,0.627923,0.856453,0.107143,0.310541,0.00838,0.226316,0.696599,0.133679,"{'Column1.1': 0, 'Column1.2': 0, 'Column1.3': ...","{'Column1.1': 0, 'Column1.2': 0, 'Column1.3': ..."


### Expanding and Setting Cell Features in Drug Interaction Dataset


In [32]:
g_id1_features_expanded = drug_interaction['g_id1_features'].apply(pd.Series).add_prefix('g1_')

g_id2_features_expanded = drug_interaction['g_id2_features'].apply(pd.Series).add_prefix('g2_')

drug_interaction = pd.concat([drug_interaction.drop(columns=['g_id1_features', 'g_id2_features']), 
                       g_id1_features_expanded, 
                       g_id2_features_expanded], axis=1)


In [33]:
drug_interaction.head()

Unnamed: 0,g_id1,g_id2,cell,label,ENSG00000116237,ENSG00000162413,ENSG00000171603,ENSG00000160049,ENSG00000065526,ENSG00000117118,...,g2_Column1.291,g2_Column1.292,g2_Column1.293,g2_Column1.294,g2_Column1.295,g2_Column1.296,g2_Column1.297,g2_Column1.298,g2_Column1.299,g2_Column1.300
0,192,115,MDAMB468,1,0.234642,0.134128,0.166276,0.177675,0.545106,0.359715,...,1,1,0,0,0,0,0,1,0,0
1,16,50,BT549,0,0.460751,0.495004,0.630838,1.0,0.75144,1.0,...,0,0,0,0,0,0,0,0,0,0
2,162,93,BT549,0,0.460751,0.495004,0.630838,1.0,0.75144,1.0,...,1,1,0,0,0,1,0,1,0,0
3,55,10,BT549,0,0.460751,0.495004,0.630838,1.0,0.75144,1.0,...,0,0,0,0,0,0,1,0,1,0
4,24,20,BT549,0,0.460751,0.495004,0.630838,1.0,0.75144,1.0,...,0,0,0,1,1,0,1,0,0,0


In [43]:
drug_interaction.drop(columns=['g_id1','g_id2','cell'],inplace=True)
drug_interaction.head()

Unnamed: 0,label,ENSG00000116237,ENSG00000162413,ENSG00000171603,ENSG00000160049,ENSG00000065526,ENSG00000117118,ENSG00000053371,ENSG00000076864,ENSG00000070831,...,g2_Column1.291,g2_Column1.292,g2_Column1.293,g2_Column1.294,g2_Column1.295,g2_Column1.296,g2_Column1.297,g2_Column1.298,g2_Column1.299,g2_Column1.300
0,1,0.234642,0.134128,0.166276,0.177675,0.545106,0.359715,0.144047,0.02439,0.075521,...,1,1,0,0,0,0,0,1,0,0
1,0,0.460751,0.495004,0.630838,1.0,0.75144,1.0,0.297547,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,0,0.460751,0.495004,0.630838,1.0,0.75144,1.0,0.297547,0.0,1.0,...,1,1,0,0,0,1,0,1,0,0
3,0,0.460751,0.495004,0.630838,1.0,0.75144,1.0,0.297547,0.0,1.0,...,0,0,0,0,0,0,1,0,1,0
4,0,0.460751,0.495004,0.630838,1.0,0.75144,1.0,0.297547,0.0,1.0,...,0,0,0,1,1,0,1,0,0,0


### Splitting Features and Target Variable

- `X` contains all columns except the `label`, i.e., it includes drug features and cell features used for prediction.
- `y` stores the `label` column, which represents the synergy class — 0 for synergistic and 1 for non-synergistic interactions.


In [49]:
X = drug_interaction.drop(columns=['label'])
y = drug_interaction['label']

### Splitting Dataset into Training and Testing Sets

- Split the feature matrix `X` and target vector `y` into training and testing subsets.


In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Training with Logistic Regression Model and Evaluating via Stratified K-Fold Cross-Validation

In [53]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracy_list = []
precision_list = []
recall_list = []
f1_list = []


### Logistic Regression Training and Evaluation Using Stratified K-Fold Cross-Validation

This code performs 5-fold stratified cross-validation to evaluate the Logistic Regression model with balanced class weights.

- The dataset is split into 5 folds maintaining the original class distribution in each fold.
- For each fold:
  - Train on 4 folds and test on the remaining fold.
  - Calculate performance metrics: Accuracy, Weighted Precision, Weighted Recall, and Weighted F1-Score.
  - Display the confusion matrix with a heatmap visualization.
  - Print a detailed classification report to analyze per-class performance.
- This approach helps provide a robust estimate of model performance while addressing potential class imbalance.

In [42]:
output_dir = 'img_(drug_synergy_prediction without Feature extraction(without quantile only)'
os.makedirs(output_dir, exist_ok=True) 
fold = 1
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = LogisticRegression(C=0.01, class_weight='balanced', max_iter=500)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"\nFold {fold}:")
    print(f"  Accuracy           = {acc:.4f}")
    print(f"  Weighted Precision = {prec:.4f}")
    print(f"  Weighted Recall    = {rec:.4f}")
    print(f"  Weighted F1-score  = {f1:.4f}")

    cm = confusion_matrix(y_test, y_pred)
    print("  Confusion Matrix:")
    print(cm)

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(f'Confusion Matrix - Fold {fold}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
     

    
    plt.savefig(os.path.join(output_dir, f'confusion_matrix_fold_{fold}_before_tuning.png'))    
    plt.close()
    

    print("\nDetailed Classification Report:")
    print(classification_report(y_test, y_pred, digits=4))

    fold += 1
metrics_df = pd.DataFrame({
    'Fold': range(1, fold),
    'Accuracy': accuracy_list,
    'Weighted Precision': precision_list,
    'Weighted Recall': recall_list,
    'Weighted F1-score': f1_list
})

print("\n=== Summary of Cross-Validation Metrics ===")
print(metrics_df)

# Optionally display average metrics
print("\n=== Average Metrics Across Folds ===")
print(metrics_df.mean().to_frame(name='Average').T)


Fold 1:
  Accuracy           = 0.6646
  Weighted Precision = 0.7350
  Weighted Recall    = 0.6646
  Weighted F1-score  = 0.6866
  Confusion Matrix:
[[595 275]
 [107 162]]

Detailed Classification Report:
              precision    recall  f1-score   support

           0     0.8476    0.6839    0.7570       870
           1     0.3707    0.6022    0.4589       269

    accuracy                         0.6646      1139
   macro avg     0.6091    0.6431    0.6080      1139
weighted avg     0.7350    0.6646    0.6866      1139


Fold 2:
  Accuracy           = 0.6945
  Weighted Precision = 0.7599
  Weighted Recall    = 0.6945
  Weighted F1-score  = 0.7141
  Confusion Matrix:
[[615 255]
 [ 93 176]]

Detailed Classification Report:
              precision    recall  f1-score   support

           0     0.8686    0.7069    0.7795       870
           1     0.4084    0.6543    0.5029       269

    accuracy                         0.6945      1139
   macro avg     0.6385    0.6806    0.6412  

### Calculating and Printing Average Performance Metrics Across All 5 Folds

In [44]:
print("\n=== Average Across 5 Folds ===")
print(f"wt.Avg Accuracy       = {np.mean(accuracy_list):.4f}")
print(f"wt.Avg Macro Precision= {np.mean(precision_list):.4f}")
print(f"wt.Avg Macro Recall   = {np.mean(recall_list):.4f}")
print(f"wt.Avg Macro F1-score = {np.mean(f1_list):.4f}")


=== Average Across 5 Folds ===
wt.Avg Accuracy       = 0.6768
wt.Avg Macro Precision= 0.7456
wt.Avg Macro Recall   = 0.6768
wt.Avg Macro F1-score = 0.6977


### Hyperparameter Tuning for Logistic Regression using GridSearchCV
-     `class_weight`: balances the penalty of misclassifying each class.
- `C`: inverse of regularization strength; smaller values specify stronger regularization.

In [111]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'class_weight': [{0:1, 1:w} for w in [1, 2, 3, 4, 5]],
    'C': [0.01, 0.1, 1, 10]
}

clf = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, scoring='f1', cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)

{'C': 0.01, 'class_weight': {0: 1, 1: 3}}


### Training and Evaluating Logistic Regression with Optimized Hyperparameters


In [113]:
fold = 1
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = LogisticRegression(C=0.01, class_weight={0: 1, 1: 3}, max_iter=500)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"\nFold {fold}:")
    print(f"  Accuracy           = {acc:.4f}")
    print(f"  Weighted Precision = {prec:.4f}")
    print(f"  Weighted Recall    = {rec:.4f}")
    print(f"  Weighted F1-score  = {f1:.4f}")

    cm = confusion_matrix(y_test, y_pred)
    print("  Confusion Matrix:")
    print(cm)

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(f'Confusion Matrix - Fold {fold}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
      

    
    plt.savefig(os.path.join(output_dir, f'confusion_matrix_fold_{fold}_after_tuning.png'))    
    plt.close()

    print("\nDetailed Classification Report:")
    print(classification_report(y_test, y_pred, digits=4))

    fold += 1


Fold 1:
  Accuracy           = 0.6769
  Weighted Precision = 0.7349
  Weighted Recall    = 0.6769
  Weighted F1-score  = 0.6963
  Confusion Matrix:
[[615 255]
 [113 156]]

Detailed Classification Report:
              precision    recall  f1-score   support

           0     0.8448    0.7069    0.7697       870
           1     0.3796    0.5799    0.4588       269

    accuracy                         0.6769      1139
   macro avg     0.6122    0.6434    0.6143      1139
weighted avg     0.7349    0.6769    0.6963      1139


Fold 2:
  Accuracy           = 0.7138
  Weighted Precision = 0.7622
  Weighted Recall    = 0.7138
  Weighted F1-score  = 0.7298
  Confusion Matrix:
[[644 226]
 [100 169]]

Detailed Classification Report:
              precision    recall  f1-score   support

           0     0.8656    0.7402    0.7980       870
           1     0.4278    0.6283    0.5090       269

    accuracy                         0.7138      1139
   macro avg     0.6467    0.6842    0.6535  

### Calculating and Printing Average Performance Metrics Across All 5 Folds

In [115]:
print("\n=== Average Across 5 Folds ===")
print(f"wt.Avg Accuracy       = {np.mean(accuracy_list):.4f}")
print(f"wt.Avg Macro Precision= {np.mean(precision_list):.4f}")
print(f"wt.Avg Macro Recall   = {np.mean(recall_list):.4f}")
print(f"wt.Avg Macro F1-score = {np.mean(f1_list):.4f}")


=== Average Across 5 Folds ===
wt.Avg Accuracy       = 0.6867
wt.Avg Macro Precision= 0.7456
wt.Avg Macro Recall   = 0.6867
wt.Avg Macro F1-score = 0.7056


### Calculation of ROC AUC and Precision-Recall AUC (PR AUC)

In [64]:
model = LogisticRegression(C=0.01, class_weight={0: 1, 1: 3}, max_iter=1000)
model.fit(X_train, y_train)

In [65]:
y_pred = model.predict(X_test)
y_probs = model.predict_proba(X_test)[:, 1]

In [66]:
from sklearn.metrics import roc_auc_score

roc_auc = roc_auc_score(y_test, y_probs)
print("ROC-AUC Score:", roc_auc)


ROC-AUC Score: 0.7044935230045012


In [67]:
output_dir = 'img_(drug_synergy_prediction without Feature extraction(without quantile only)'

from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

fpr, tpr, thresholds = roc_curve(y_test, y_probs)

plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, label=f'Logistic Regression (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(output_dir, f'rocauc.png'))    
plt.close()



In [68]:
from sklearn.metrics import average_precision_score

pr_auc = average_precision_score(y_test, y_probs)
print("PR-AUC Score:", pr_auc)


PR-AUC Score: 0.4550264311334503


In [69]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

plt.figure(figsize=(8,6))
plt.plot(recall, precision, label=f'Logistic Regression (PR-AUC = {pr_auc:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(output_dir, f'prauc.png'))    
plt.close()
