In [1]:
import os, sys

# Resolve the parent of the current working directory and register it on
# sys.path (once) so that packages living there can be imported.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.append(PROJECT_ROOT)

print("PROJECT_ROOT:", PROJECT_ROOT)


PROJECT_ROOT: e:\antibody-seq-developability\antibody-seq-developability


In [2]:
from src.data_oas import load_oas_human_paired

# Load the table returned by the project loader; `df` is reused by later cells.
df = load_oas_human_paired()

# A bare expression is rendered only when it is the last statement of a cell,
# so this cell shows a single result. Use display(...) to show several.
df.shape


Resolving data files:   0%|          | 0/170 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/170 [00:00<?, ?it/s]

(1692921, 9)

In [3]:
# Summary statistics of the heavy-chain CDR3 string length.
# Only the final expression of a cell is rendered; wrap additional previews in
# display(...) if more than one table should be shown.
df["vh_cdr3"].str.len().describe()


count    1.692921e+06
mean     1.569112e+01
std      3.948171e+00
min      2.000000e+00
25%      1.300000e+01
50%      1.500000e+01
75%      1.800000e+01
max      5.800000e+01
Name: vh_cdr3, dtype: float64

In [4]:
def filter_reasonable_cdr3(df, min_len=5, max_len=30, column="vh_cdr3"):
    """Keep the rows whose sequence length in ``column`` is within bounds.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing a string column named ``column``.
        It is not modified.
    min_len, max_len : int
        Inclusive lower and upper bounds on the string length.
    column : str, default "vh_cdr3"
        Name of the string column whose length is tested.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the matching rows; the original index
        labels are preserved. Rows whose value is missing are dropped because
        a NaN length fails the range test.
    """
    lengths = df[column].str.len()
    keep = lengths.between(min_len, max_len)
    # Copy only the selected rows rather than deep-copying the whole input
    # up front; the result is still fully detached from the caller's frame.
    return df.loc[keep].copy()
from src.data_oas import load_oas_human_paired

# `df` is already in memory from the load cell above, so filter it directly
# instead of loading the whole dataset again. The filter is idempotent for
# fixed bounds, so re-running this cell gives the same frame.
df = filter_reasonable_cdr3(df)

# Length statistics after filtering (default bounds: 5 to 30 inclusive).
df["vh_cdr3"].str.len().describe()


Resolving data files:   0%|          | 0/170 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/170 [00:00<?, ?it/s]

count    1.691058e+06
mean     1.567423e+01
std      3.910265e+00
min      5.000000e+00
25%      1.300000e+01
50%      1.500000e+01
75%      1.800000e+01
max      3.000000e+01
Name: vh_cdr3, dtype: float64

In [5]:
from src.data_oas import load_oas_human_paired
from src.features_cdr import compute_cdr3_features

# `df` is already in memory from the cells above, so the dataset is not
# reloaded here. Filtering rows that are already within bounds is a no-op.
df = filter_reasonable_cdr3(df, min_len=5, max_len=30)

# Fixed-seed development subset for fast iteration. The size is capped at the
# table length so sampling cannot raise on a smaller dataset.
df_small = df.sample(min(200_000, len(df)), random_state=0).reset_index(drop=True)

df_feat = compute_cdr3_features(df_small)
df_feat[["vh_cdr3", "cdr3_len", "cdr3_hydro_mean", "cdr3_charge", "cdr3_aromatic_frac"]].head()


Resolving data files:   0%|          | 0/170 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/170 [00:00<?, ?it/s]

Unnamed: 0,vh_cdr3,cdr3_len,cdr3_hydro_mean,cdr3_charge,cdr3_aromatic_frac
0,AKLNYYDSSGSDY,13,-1.192308,-1.0,0.230769
1,ARELKEIIGATRGYYYYGMDV,21,-0.380952,0.0,0.190476
2,AREDYGDLYYFDY,13,-1.207692,-3.0,0.384615
3,ARPIAAAFDWYFDL,14,0.414286,-1.0,0.285714
4,AHQAYGGFDY,10,-0.72,-0.9,0.3


In [6]:
import numpy as np

# Standardize each descriptor (z-score over the current subset) so the terms
# of the composite score are on a comparable scale.
for col in ["cdr3_len", "cdr3_hydro_mean", "cdr3_charge"]:
    df_feat[col + "_z"] = (df_feat[col] - df_feat[col].mean()) / df_feat[col].std()

# Heuristic composite score: standardized length + standardized mean
# hydrophobicity + magnitude of the standardized charge.
df_feat["devscore"] = (
    df_feat["cdr3_len_z"]
    + df_feat["cdr3_hydro_mean_z"]
    + df_feat["cdr3_charge_z"].abs()
)

# Keep only the two tails of the score distribution: rows at or below the 30th
# percentile and rows at or above the 70th. The middle band is discarded.
q_low, q_high = df_feat["devscore"].quantile([0.3, 0.7])

df_bin = df_feat[
    (df_feat["devscore"] <= q_low) | (df_feat["devscore"] >= q_high)
].copy()

# Binary label: 1 for the upper tail, 0 for the lower tail.
df_bin["dev_label"] = (df_bin["devscore"] >= q_high).astype(int)

# Class balance check (expected to be close to 50/50 by construction).
df_bin["dev_label"].value_counts(normalize=True)


dev_label
0    0.500025
1    0.499975
Name: proportion, dtype: float64

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report

# NOTE(review): `dev_label` is a threshold on `devscore`, which the labeling
# cell builds from cdr3_len, cdr3_hydro_mean and cdr3_charge, i.e. three of the
# four inputs below. The classifier is therefore re-learning that rule, and a
# near-perfect ROC-AUC is expected rather than evidence of predictive power.
features = ["cdr3_len", "cdr3_hydro_mean", "cdr3_charge", "cdr3_aromatic_frac"]

X = df_bin[features].to_numpy()
y = df_bin["dev_label"].to_numpy()

# 80/20 hold-out split; stratifying on y keeps the 50/50 class balance in both
# the training and the test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the inputs, then fit a logistic-regression baseline.
clf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(max_iter=1000)),
    ]
)
clf.fit(X_train, y_train)

# Hard predictions for the report, positive-class probabilities for ROC-AUC.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))


ROC-AUC: 0.9984285040069357
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     12002
           1       1.00      0.98      0.99     12001

    accuracy                           0.99     24003
   macro avg       0.99      0.99      0.99     24003
weighted avg       0.99      0.99      0.99     24003

