## Model Reproducibility Code

## WEEK 2: TASK 2

In [1]:
#import necessary libraries
import pandas as pd
from rdkit import Chem
from sklearn.metrics import matthews_corrcoef,accuracy_score,precision_score,confusion_matrix, f1_score,recall_score,balanced_accuracy_score


### Test I- Using the first external dataset

In [2]:
# Load the first external test set (downloaded from the publication's GitHub page)
# and preview its first rows.
test = pd.read_csv(filepath_or_buffer='external_test_set_pos.csv')
test.head(n=5)

Unnamed: 0,ACTIVITY,smiles
0,0,CCOC(=O)C1(CCN(C)CC1)c1ccccc1
1,0,CCN(CC)CC(=O)NC1=C(C)C=CC=C1C
2,0,CCCC(CCC)C(=O)O
3,0,CCC(COC(=O)c1cc(OC)c(OC)c(OC)c1)(c1ccccc1)N(C)C
4,0,COc1ccc(N(C(C)=O)c2cc3c(cc2[N+](=O)[O-])OC(C)(...


In [3]:
# Load the prediction output obtained by running the model on Ersilia
# and preview its first rows.
prediction = pd.read_csv(filepath_or_buffer='reproducibility_prediction_output.csv')
prediction.head(n=5)

Unnamed: 0,key,input,probability
0,XADCESSVHJOZHK-UHFFFAOYSA-N,CCOC(=O)C1(CCN(C)CC1)c1ccccc1,0.645569
1,NNJVILVZKWQKPM-UHFFFAOYSA-N,CCN(CC)CC(=O)NC1=C(C)C=CC=C1C,0.088602
2,NIJJYAXOARWZEE-UHFFFAOYSA-N,CCCC(CCC)C(=O)O,0.042538
3,LORDFXWUHHSAQU-UHFFFAOYSA-N,CCC(COC(=O)c1cc(OC)c(OC)c(OC)c1)(c1ccccc1)N(C)C,0.060436
4,XZEITPHZKJCCSQ-UHFFFAOYSA-N,COc1ccc(N(C(C)=O)c2cc3c(cc2[N+](=O)[O-])OC(C)(...,0.038881


In [4]:
# The predictions are paired with the labels purely by row position, so fail
# loudly if the two files do not describe the same molecules in the same order.
assert len(prediction) == len(test), "prediction and test set have different row counts"
assert list(prediction['input']) == list(test['smiles']), \
    "prediction rows are not in the same order as the test set rows"

# A missing probability would compare False against the threshold and be
# silently counted as a negative prediction, so reject it explicitly.
assert prediction['probability'].notna().all(), "prediction file contains missing probabilities"

# Threshold at 0.5: probabilities >= 0.5 become class 1, everything else class 0
predicted_output = (prediction['probability'] >= 0.5).astype(int)

# Ground-truth labels come from the ACTIVITY column
test_output = test['ACTIVITY']

The abbreviations below are used with the same meaning as in the publication:
- tn = true negatives
- tp = true positives
- fp = false positives
- fn = false negatives
- NPV = Negative Predictive Value
- PPV = Positive Predictive Value
- SPE = Specificity
- SEN = Sensitivity
- B-ACC = Balanced Accuracy



In [5]:
# Matthews correlation coefficient
mcc = matthews_corrcoef(test_output, predicted_output)

# Confusion matrix. labels=[0, 1] pins the layout to 2x2 in the order
# [[tn, fp], [fn, tp]]; without it, data containing a single class yields a
# 1x1 matrix and the four-way unpacking below raises ValueError.
conf_matrix = confusion_matrix(test_output, predicted_output, labels=[0, 1])
tn, fp, fn, tp = conf_matrix.ravel()

# NPV: share of negative predictions that are truly negative
npv = tn / (tn + fn)

# Accuracy
accuracy = accuracy_score(test_output, predicted_output)

# Precision (reported as PPV)
precision = precision_score(test_output, predicted_output)

# SPE (specificity): share of true negatives that were predicted negative
spe = tn / (tn + fp)

# Recall (reported as SEN, sensitivity)
recall = recall_score(test_output, predicted_output)

# F1 score
f1 = f1_score(test_output, predicted_output)

# Balanced accuracy (reported as B-ACC)
balanced_accuracy = balanced_accuracy_score(test_output, predicted_output)



In [6]:
print("Test set-I Result:")

# Report every metric with three decimal places, in the order used by the publication table
for metric_name, metric_value in [
    ("MCC", mcc),
    ("NPV", npv),
    ("ACC", accuracy),
    ("PPV", precision),
    ("SPE", spe),
    ("SEN", recall),
    ("B-ACC", balanced_accuracy),
]:
    print(f"{metric_name}: {metric_value:.3f}")


Test set-I Result:
MCC: 0.599
NPV: 0.688
ACC: 0.818
PPV: 0.893
SPE: 0.786
SEN: 0.833
B-ACC: 0.810


The model's published results are reproducible: using the authors' dataset **"external_test_set_pos.csv"** (Test set I), I obtained the same metrics as those reported in the publication's table linked below.
LINK TO THE TABLE: [here](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-021-00541-z/tables/4)

### Test II - Using the second dataset

In [7]:
# Load the second external test set (downloaded from the publication's GitHub page)
# and preview its first rows.
test = pd.read_csv(filepath_or_buffer='external_test_set_neg.csv')
test.head(n=5)

Unnamed: 0,ACTIVITY,smiles
0,1,O=C(CCC[N+]1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1
1,1,CCC[N+]1C[C@H](CSC)C[C@@H]2c3cccc4c3C(C=N4)C[C...
2,1,CC(C)(C)c1ccc(C(=O)CCC[N+]2CCC(OC(c3ccccc3)c3c...
3,1,CC(C)COC[C@@H](CN(Cc1ccccc1)c1ccccc1)[N+]1CCCC1
4,1,C[N+]1[C@H]2CC[C@@H]1C[C@H](OC(=O)C(CO)c1ccccc...


In [8]:
# Load the prediction output generated by the model eos2ta5 on Ersilia
# and preview its first rows.
prediction = pd.read_csv(filepath_or_buffer='test2_reproducibility_prediction_output.csv')
prediction.head(n=5)

Unnamed: 0,key,input,probability
0,ICMLRJJLJVVQGT-UHFFFAOYSA-N,O=C(CCC[N+]1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,0.702687
1,BLUAASNRIJJGKL-RILYDNKNSA-N,CCC[N+]1C[C@H](CSC)C[C@@H]2c3cccc4c3C(C=N4)C[C...,0.917343
2,YPYUSOLNFCETJV-UHFFFAOYSA-N,CC(C)(C)c1ccc(C(=O)CCC[N+]2CCC(OC(c3ccccc3)c3c...,0.926729
3,FLEJZHACHXCVCS-XMMPIXPASA-N,CC(C)COC[C@@H](CN(Cc1ccccc1)c1ccccc1)[N+]1CCCC1,0.631805
4,ZVPJDRRWMYGYKM-SPUOUPEWSA-N,C[N+]1[C@H]2CC[C@@H]1C[C@H](OC(=O)C(CO)c1ccccc...,0.858856


In [9]:
# The predictions are paired with the labels purely by row position, so fail
# loudly if the two files do not describe the same molecules in the same order.
assert len(prediction) == len(test), "prediction and test set have different row counts"
assert list(prediction['input']) == list(test['smiles']), \
    "prediction rows are not in the same order as the test set rows"

# A missing probability would compare False against the threshold and be
# silently counted as a negative prediction, so reject it explicitly.
assert prediction['probability'].notna().all(), "prediction file contains missing probabilities"

# Threshold at 0.5: probabilities >= 0.5 become class 1, everything else class 0
predicted_output = (prediction['probability'] >= 0.5).astype(int)

# Ground-truth labels come from the ACTIVITY column
test_output = test['ACTIVITY']

In [10]:
# Matthews correlation coefficient
mcc = matthews_corrcoef(test_output, predicted_output)

# Confusion matrix. labels=[0, 1] pins the layout to 2x2 in the order
# [[tn, fp], [fn, tp]]; without it, data containing a single class yields a
# 1x1 matrix and the four-way unpacking below raises ValueError.
conf_matrix = confusion_matrix(test_output, predicted_output, labels=[0, 1])
tn, fp, fn, tp = conf_matrix.ravel()

# NPV: share of negative predictions that are truly negative
npv = tn / (tn + fn)

# Accuracy
accuracy = accuracy_score(test_output, predicted_output)

# Precision (reported as PPV)
precision = precision_score(test_output, predicted_output)

# SPE (specificity): share of true negatives that were predicted negative
spe = tn / (tn + fp)

# Recall (reported as SEN, sensitivity)
recall = recall_score(test_output, predicted_output)

# F1 score
f1 = f1_score(test_output, predicted_output)

# Balanced accuracy (reported as B-ACC)
balanced_accuracy = balanced_accuracy_score(test_output, predicted_output)

In [11]:
print("Test set-II Result:")

# Report every metric with three decimal places, in the order used by the publication table
for metric_name, metric_value in [
    ("MCC", mcc),
    ("NPV", npv),
    ("ACC", accuracy),
    ("PPV", precision),
    ("SPE", spe),
    ("SEN", recall),
    ("B-ACC", balanced_accuracy),
]:
    print(f"{metric_name}: {metric_value:.3f}")

Test set-II Result:
MCC: 0.452
NPV: 0.947
ACC: 0.683
PPV: 0.455
SPE: 0.600
SEN: 0.909
B-ACC: 0.755


The model's published results are reproducible: using the authors' second dataset **"external_test_set_neg.csv"** (Test set II), I obtained the same metrics as those reported in the publication's table linked below.
LINK TO THE TABLE: [here](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-021-00541-z/tables/4)