In [None]:
!pip install -q "torch==2.2.1" "transformers==4.41.2" "datasets==2.20.0"


In [None]:
from datasets import load_dataset
import pandas as pd

# Fetch the "train" split of the BindingDB_filtered configuration of the
# BALM benchmark and convert it to a pandas DataFrame.
ds = load_dataset(
    path="BALM/BALM-benchmark",
    name="BindingDB_filtered",
    split="train",
)
df = ds.to_pandas()

# Draw a fixed-seed random subset of 5000 rows and write it to disk; the
# later cells read this CSV back instead of re-downloading the dataset.
df_5k = df.sample(n=5000, random_state=42)
df_5k.to_csv("bindingdb_5000.csv", index=False)

# Cell output: a preview of the subset together with its shape.
(df_5k.head(), df_5k.shape)

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, RobertaModel
import pandas as pd

# Seed PyTorch's global RNG before any module is built so that the random
# initialisation of the regression head (and later draws from the global
# generator) is repeatable between runs. 42 matches the random_state used
# when the 5000-row subset was sampled.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Training data: the CSV subset written by the sampling cell.
df = pd.read_csv("bindingdb_5000.csv")

drug_model_name = "DeepChem/ChemBERTa-77M-MTR"
# Used both as the character cap applied to each drug string in the dataset
# and as the tokenizer's max_length in the training loop.
MAX_LEN = 256

tokenizer = AutoTokenizer.from_pretrained(drug_model_name)
backbone = RobertaModel.from_pretrained(drug_model_name)

# Freeze the backbone: none of its parameters receive gradients, so only the
# head defined below is trained.
for p in backbone.parameters():
    p.requires_grad = False

# Two-layer regression head mapping one hidden vector of the backbone
# (width H) to a single scalar.
H = backbone.config.hidden_size
head = nn.Sequential(
    nn.Linear(H, 256),
    nn.ReLU(),
    nn.Linear(256, 1),
)

backbone.to(device)
head.to(device)

class BindingDB5000(Dataset):
    """Map-style dataset over a DataFrame that has "Drug" and "Y" columns.

    Each item is a ``(drug_string, label)`` pair: the "Drug" value cast to
    ``str`` and cut to at most ``MAX_LEN`` characters, and the "Y" value as a
    float32 scalar tensor.
    """

    def __init__(self, frame):
        # Re-number rows 0..N-1 so positional lookups line up with len().
        self.df = frame.reset_index(drop=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        drug_text = str(record["Drug"])[:MAX_LEN]
        label = torch.tensor(record["Y"], dtype=torch.float32)
        return drug_text, label


# Wrap the loaded frame and iterate over it in shuffled mini-batches of 16.
dataset = BindingDB5000(frame=df)
loader = DataLoader(dataset=dataset, batch_size=16, shuffle=True)

# Mean-squared-error criterion used by the training loop.
mse = nn.MSELoss()


def scale_labels(y):
    """Shift labels by 6.0 and divide by 4.0.

    Maps the range 2..10 onto roughly -1..1. Works on plain numbers and on
    anything that supports subtraction and division by a float.
    """
    centred = y - 6.0
    return centred / 4.0

# Only the head's parameters are handed to the optimizer.
optimizer = torch.optim.AdamW(head.parameters(), lr=1e-3)
EPOCHS = 3

# The backbone is used purely as a feature extractor, so keep it in eval mode.
backbone.eval()
for epoch in range(EPOCHS):
    # Sample-weighted running sum of the loss and number of samples seen,
    # used to report the mean loss per sample for the epoch.
    total_loss = 0.0
    n = 0
    for smiles_batch, targets in loader:
        targets = targets.to(device)
        targets_scaled = scale_labels(targets)

        encoded = tokenizer(
            list(smiles_batch),
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        ).to(device)

        # No gradients through the backbone; the hidden state at sequence
        # position 0 is taken as the per-example feature vector.
        with torch.no_grad():
            hidden_states = backbone(**encoded).last_hidden_state
            features = hidden_states[:, 0, :]

        optimizer.zero_grad()
        batch_loss = mse(head(features).squeeze(-1), targets_scaled)
        batch_loss.backward()
        optimizer.step()

        batch_size = targets.size(0)
        total_loss += batch_loss.item() * batch_size
        n += batch_size

    print(f"Epoch {epoch+1}/{EPOCHS} loss={total_loss/n:.4f}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 1) Load the data.
# The only CSV the visible cells write is "bindingdb_5000.csv"; the name
# previously used here ("bindingdb_10000.csv") is never created, so the cell
# failed with a missing-file error on a fresh run.
df = pd.read_csv("bindingdb_5000.csv")

# If column Y already holds pKd (values between 2 and 10), use it directly;
# print the observed range to check.
print(df["Y"].min(), df["Y"].max())

# 2) Draw the histogram.
plt.figure(figsize=(6,4), dpi=150)
plt.hist(df["Y"], bins=30, color="#4C72B0", edgecolor="black", alpha=0.8)
plt.xlabel("pKd")
plt.ylabel("Frequency")
plt.title("Distribution of pKd values in the BindingDB-derived dataset")
plt.grid(axis="y", alpha=0.2)

plt.tight_layout()
plt.savefig("figure2_pKd_distribution.png")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 1) Load the data.
df = pd.read_csv("bindingdb_5000.csv")  # corrected filename

# If column Y already holds pKd (values between 2 and 10), use it directly;
# print the observed range to check.
y_values = df["Y"]
print(y_values.min(), y_values.max())

# 2) Draw the histogram on an explicit Figure/Axes pair.
fig, ax = plt.subplots(figsize=(6, 4), dpi=150)
ax.hist(y_values, bins=30, color="#4C72B0", edgecolor="black", alpha=0.8)
ax.set_xlabel("pKd")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of pKd values in the BindingDB-derived dataset")
ax.grid(axis="y", alpha=0.2)

# Save the figure to disk before showing it.
fig.tight_layout()
fig.savefig("figure2_pKd_distribution.png")
plt.show()

In [None]:
import torch
from pathlib import Path

# Output directory for everything needed to reuse the trained model.
save_dir = Path("chemberta_bindingdb_5k")
save_dir.mkdir(exist_ok=True)

# Save only the regression head's weights.
head_weights = head.state_dict()
torch.save(head_weights, save_dir / "head.pt")

# Save the tokenizer and the backbone next to the head so they can be
# reloaded from the same directory.
for component in (tokenizer, backbone):
    component.save_pretrained(save_dir)

print("Saved to", save_dir)


In [None]:
import torch
import pandas as pd

# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Reload the sampled subset and give it a fresh 0..N-1 index, which the
# label-based row lookups below rely on.
df = pd.read_csv("bindingdb_5000.csv")
df = df.reset_index(drop=True)

def scale_labels(y):
    """Shift labels by 6.0 and divide by 4.0 (same transform as in training)."""
    centred = y - 6.0
    return centred / 4.0


def inverse_scale(y_scaled):
    """Undo ``scale_labels``: multiply by 4.0 and add 6.0."""
    stretched = y_scaled * 4.0
    return stretched + 6.0

# Inference only: put both modules in eval mode.
backbone.eval()
head.eval()

# Print true vs. predicted label for the first five rows of the CSV.
# NOTE(review): these rows come from the same file the head was trained on,
# so this is a sanity check rather than a held-out evaluation.
for i in range(5):
    smiles = str(df.loc[i, "Drug"])[:256]
    y_true = df.loc[i, "Y"]

    with torch.no_grad():
        encoded = tokenizer(
            [smiles],
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt",
        ).to(device)

        # Hidden state at sequence position 0 is the feature fed to the head.
        first_tok = backbone(**encoded).last_hidden_state[:, 0, :]
        scaled_pred = head(first_tok).squeeze(-1)
        # Map the prediction back from the scaled range to the label range.
        y_pred = inverse_scale(scaled_pred).item()

    print(f"Example {i}: true={y_true:.3f}, pred={y_pred:.3f}")


In [None]:
import pandas as pd
import torch

# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Reload the sampled subset and give it a fresh 0..N-1 index, which the
# label-based row lookups below rely on.
df = pd.read_csv("bindingdb_5000.csv")
df = df.reset_index(drop=True)

def scale_labels(y):
    """Shift labels by 6.0 and divide by 4.0 (same transform as in training)."""
    centred = y - 6.0
    return centred / 4.0


def inverse_scale(y_scaled):
    """Undo ``scale_labels``: multiply by 4.0 and add 6.0."""
    stretched = y_scaled * 4.0
    return stretched + 6.0

# Inference only: put both modules in eval mode.
backbone.eval()
head.eval()

# Collect one record per example and build the table in a single step.
rows = []
for i in range(20):  # first 20 examples; raise the number to see more
    smiles = str(df.loc[i, "Drug"])[:256]
    # Only the first 60 characters of the target are kept, for display.
    target = str(df.loc[i, "Target"])[:60] + "..."
    y_true = df.loc[i, "Y"]

    with torch.no_grad():
        encoded = tokenizer(
            [smiles],
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt",
        ).to(device)

        # Hidden state at sequence position 0 is the feature fed to the head.
        first_tok = backbone(**encoded).last_hidden_state[:, 0, :]
        scaled_pred = head(first_tok).squeeze(-1)
        # Map the prediction back from the scaled range to the label range.
        y_pred = inverse_scale(scaled_pred).item()

    record = {
        "Drug_ID": df.loc[i, "Drug_ID"],
        "Target_ID": df.loc[i, "Target_ID"],
        "Target_seq_head": target,
        "Affinity_true": round(y_true, 3),
        "Affinity_pred": round(y_pred, 3),
    }
    rows.append(record)

# Cell output: true vs. predicted affinity side by side.
result_df = pd.DataFrame(rows)
result_df


In [None]:
# Push local commits to the repository's configured remote.
# NOTE(review): this cell comes before the cells that clone the repository
# and cd into it, so on a fresh top-to-bottom run the working directory is
# presumably not a git checkout yet — confirm the intended cell order.
!git push


In [None]:
# Print the current working directory of the shell.
!pwd


In [None]:
# Clone the project repository from GitHub into the current working directory.
!git clone https://github.com/Abdulazim2/Predicting-Drug-Target-Binding-Affinity.git


In [None]:
# Change the notebook's working directory to the cloned repository folder.
%cd Predicting-Drug-Target-Binding-Affinity


In [None]:
# List every entry in the current directory, including dot-files.
!ls -a


In [None]:
# Copy the notebook file trins.ipynb from /content into the current directory.
# NOTE(review): /content is a hard-coded absolute path (presumably the Colab
# workspace root) — confirm before running elsewhere.
!cp /content/trins.ipynb .


In [None]:
# List the current directory.
!ls


In [None]:
# Search /content recursively for files whose name ends in .ipynb.
!find /content -name "*.ipynb"
