# Modeling & Evvaluations for Final Filtered Mut Netphos

## Import Most Up-to-Date Dataset

In [13]:

import pandas as pd
df = pd.read_feather('../data/filtered/mut/final-filtered-mut-netphos.feather')

df.head()

Unnamed: 0,Gene,wt,pos,mut,context,score,kinase,answer
0,ABCB1,S,35,N,KPTVSNFSM,0.47,CaM-II,.
1,ABCB1,S,35,N,KPTVSNFSM,0.469,PKA,.
2,ABCB1,S,35,N,KPTVSNFSM,0.44,GSK3,.
3,ABCB1,S,35,N,KPTVSNFSM,0.413,PKG,.
4,ABCB1,S,35,N,KPTVSNFSM,0.365,CKI,.


In [14]:
# fix answer column
df['answer'] = df['answer'].apply(lambda x: 1 if str(x).strip().upper() == 'YES' else 0).astype(int)
print(df['answer'].value_counts())
df.dtypes

answer
0    40544
1     2880
Name: count, dtype: int64


Gene        object
wt          object
pos          int64
mut         object
context     object
score      float64
kinase      object
answer       int64
dtype: object

## Build Preliminary Model

### Test-Train Split 

In [27]:

# 80-20 split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, auc, classification_report
)

X = df[['wt', 'mut', 'kinase', 'pos', 'score']]
X_encoded = pd.get_dummies(X, columns=['wt', 'mut', 'kinase'], drop_first=True)
y = df['answer']

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, train_size=0.8, random_state=617, stratify=y
)

# scale our numeric variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


### Model Params + Predictions

In [28]:

log_model = LogisticRegression(max_iter=1000, class_weight='balanced')

# lets find the optimal value of C
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(log_model, param_grid, cv=5, scoring='f1')
grid.fit(X_train_scaled, y_train)

print('Best C:', grid.best_params_['C'])
print('Best f1 SCore:', grid.best_score_)

Best C: 100
Best f1 SCore: 0.9806245090248332


### Model Setup

In [29]:
final_model = LogisticRegression(
    C = 100,
    max_iter=1000,
    class_weight='balanced'
)

final_model.fit(X_train_scaled, y_train)
y_pred = final_model.predict(X_test_scaled)
y_prob = final_model.predict_proba(X_test_scaled)[:, 1]

### Model Evaluation

In [30]:

# Get basic evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

# Compute Confusion Matrix
print('Confusion Matrix:')
print(classification_report(y_test, y_pred))


# Retrieve Classification Report
print('Classification Report')
print(classification_report(y_test, y_pred))

Accuracy: 0.9973517559009787
Precision: 0.9647058823529412
Recall: 0.9965277777777778
F1 Score: 0.9803586678052946
ROC AUC: 0.999410056213261
Confusion Matrix:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8109
           1       0.96      1.00      0.98       576

    accuracy                           1.00      8685
   macro avg       0.98      1.00      0.99      8685
weighted avg       1.00      1.00      1.00      8685

Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8109
           1       0.96      1.00      0.98       576

    accuracy                           1.00      8685
   macro avg       0.98      1.00      0.99      8685
weighted avg       1.00      1.00      1.00      8685

