In [47]:
import pandas as pd

In [49]:
df_train = pd.read_csv('datasets/train_new.csv')

In [50]:
df_train

Unnamed: 0.1,Unnamed: 0,sentences,entity_1_mention,entity_2_mention,type,kfold
0,0,and the pharmacokinetic interaction between <e...,a. officinarum,indomethacin,POSITIVE,0
1,1,. results: it was observed that there a signif...,commiphora myrrha,theophylline,POSITIVE,0
2,2,by taking advantage of the high protein bindin...,chan su,digoxin,POSITIVE,0
3,3,previous in vitro studies indicate that curcum...,piperine,ugt,POSITIVE,0
4,4,.63 ng/ml <e2> digoxin </e2>) increase in <e2>...,asian ginseng,digoxin,POSITIVE,0
...,...,...,...,...,...,...
333,333,. emblica) and terminalia bellerica (<e1> t. b...,t. bellerica,cisplatin,SPECULATIVE,4
334,334,previous in vitro studies indicate that <e1> c...,curcuminoids,cyp2c9,POSITIVE,4
335,335,also co-administration of <e1> gce </e1> (50 m...,gce,insulin,POSITIVE,4
336,336,<e1> ginkgo biloba </e1> potentiated the bleed...,ginkgo biloba,cilostazol,POSITIVE,4


In [51]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        338 non-null    int64 
 1   sentences         338 non-null    object
 2   entity_1_mention  338 non-null    object
 3   entity_2_mention  338 non-null    object
 4   type              338 non-null    object
 5   kfold             338 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 16.0+ KB


In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [53]:
start_pipe = [
    ("tfidf", TfidfVectorizer(analyzer="word", stop_words="english",)),
]

knn_pipe = Pipeline([*start_pipe, ('knn', KNeighborsClassifier())])
svc_pipe = Pipeline([*start_pipe, ('svc', SVC())])
rf_pipe = Pipeline([*start_pipe, ('rf', RandomForestClassifier())])

In [54]:
type2id = {t: i for i, t in enumerate(df_train.type.unique())}
type2id

{'POSITIVE': 0, 'SPECULATIVE': 1, 'NEGATIVE': 2}

In [55]:
id2type = {i: t for t, i in type2id.items()}
id2type

{0: 'POSITIVE', 1: 'SPECULATIVE', 2: 'NEGATIVE'}

In [56]:
x_train = df_train['sentences']
y_train = df_train['type'].map(type2id)
y_train[:10]

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    1
Name: type, dtype: int64

In [57]:
x_train[:10]

0    and the pharmacokinetic interaction between <e...
1    . results: it was observed that there a signif...
2    by taking advantage of the high protein bindin...
3    previous in vitro studies indicate that curcum...
4    .63 ng/ml <e2> digoxin </e2>) increase in <e2>...
5    aids: <e1> st. john's wort </e1>, an herbal tr...
6    <e1> dehydrofelodipine </e1> auc (74.7 +/- 28....
7    our results clarified that oral administration...
8    other herbal remedies with the potential to mo...
9    subjects received <e1> atenolol </e1> with eit...
Name: sentences, dtype: object

In [58]:
# Train the model
## KNN
knn_pipe.fit(x_train, y_train)
knn_pipe.score(x_train, y_train)

0.8905325443786982

In [59]:
## SVC
svc_pipe.fit(x_train, y_train)
svc_pipe.score(x_train, y_train)

0.9881656804733728

In [67]:
## Random Forest
rf_pipe.fit(x_train, y_train)
rf_pipe.score(x_train, y_train)

0.9911242603550295

# K-Fold

In [80]:
knn_pipe_kfold = Pipeline([*start_pipe, ('knn', KNeighborsClassifier())])
svc_pipe_kfold = Pipeline([*start_pipe, ('svc', SVC())])
rf_pipe_kfold = Pipeline([*start_pipe, ('rf', RandomForestClassifier())])

In [108]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

acc = []
prec = []
rec = []
f1 = []

def getScores(estimator, x, y):
    yPred = estimator.predict(x)
    return (accuracy_score(y, yPred), 
            precision_score(y, yPred, average="macro"), 
            recall_score(y, yPred, average="macro"),
            f1_score(y, yPred, average="macro"))

def my_scorer(estimator, x, y):
    a, p, r, f = getScores(estimator, x, y)
    acc.append(a)
    prec.append(p)
    rec.append(r)
    f1.append(f)
    return a+p+r+f

In [109]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [110]:
## KNN
scores = cross_val_score(knn_pipe_kfold, x_train, y_train, cv=cv, scoring=my_scorer)
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(acc), np.std(acc)))
print("Precision: %0.2f (+/- %0.2f)" % (np.mean(prec), np.std(prec)))
print("Recall: %0.2f (+/- %0.2f)" % (np.mean(rec), np.std(rec)))
print("F1: %0.2f (+/- %0.2f)" % (np.mean(f1), np.std(f1)))

Accuracy: 0.83 (+/- 0.02)
Precision: 0.79 (+/- 0.03)
Recall: 0.72 (+/- 0.02)
F1: 0.75 (+/- 0.02)


In [111]:
acc = []
prec = []
rec = []
f1 = []

## SVC
scores = cross_val_score(svc_pipe_kfold, x_train, y_train, cv=cv, scoring=my_scorer)
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(acc), np.std(acc)))
print("Precision: %0.2f (+/- %0.2f)" % (np.mean(prec), np.std(prec)))
print("Recall: %0.2f (+/- %0.2f)" % (np.mean(rec), np.std(rec)))
print("F1: %0.2f (+/- %0.2f)" % (np.mean(f1), np.std(f1)))

Accuracy: 0.91 (+/- 0.02)
Precision: 0.95 (+/- 0.03)
Recall: 0.78 (+/- 0.04)
F1: 0.82 (+/- 0.04)


In [112]:
acc = []
prec = []
rec = []
f1 = []

## Random Forest
scores = cross_val_score(rf_pipe_kfold, x_train, y_train, cv=cv, scoring=my_scorer)
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(acc), np.std(acc)))
print("Precision: %0.2f (+/- %0.2f)" % (np.mean(prec), np.std(prec)))
print("Recall: %0.2f (+/- %0.2f)" % (np.mean(rec), np.std(rec)))
print("F1: %0.2f (+/- %0.2f)" % (np.mean(f1), np.std(f1)))

Accuracy: 0.92 (+/- 0.03)
Precision: 0.92 (+/- 0.06)
Recall: 0.84 (+/- 0.08)
F1: 0.86 (+/- 0.07)


# Validation dataset

In [113]:
valid_df = pd.read_csv('datasets/valid_new.csv')
valid_df

Unnamed: 0.1,Unnamed: 0,sentences,entity_1_mention,entity_2_mention,type
0,0,the pharmacokinetics of tolbutamide in all vin...,tolbutamide,vinegar-baked radix bupleuri,NEGATIVE
1,1,".05) from that of controlled rats, however, tr...",radix bupleuri,tolbutamide,POSITIVE
2,2,". results: the auc and t1/2 of midazolam, dext...",midazolam,radix bupleuri,POSITIVE
3,3,". results: the auc and t1/2 of midazolam, dext...",midazolam,vinegar-baked radix bupleuri,POSITIVE
4,4,". results: the auc and t1/2 of midazolam, dext...",dextromethorphan,radix bupleuri,POSITIVE
5,5,". results: the auc and t1/2 of midazolam, dext...",dextromethorphan,vinegar-baked radix bupleuri,POSITIVE
6,6,". results: the auc and t1/2 of midazolam, dext...",chlorzoxazone,radix bupleuri,POSITIVE
7,7,". results: the auc and t1/2 of midazolam, dext...",chlorzoxazone,vinegar-baked radix bupleuri,POSITIVE
8,8,the reason of different therapeutic effects of...,radix bupleuri,cyp2c9,POSITIVE
9,9,the reason of different therapeutic effects of...,vinegar-baked radix bupleuri,cyp2c19,POSITIVE


In [114]:
x_test = valid_df['sentences']
y_test = valid_df['type'].map(type2id)

In [115]:
from sklearn.metrics import classification_report
# Predict
## KNN
print(classification_report(y_test, knn_pipe.predict(x_test)))

              precision    recall  f1-score   support

           0       0.74      0.50      0.60        28
           1       0.20      0.44      0.28         9
           2       1.00      0.50      0.67         4

    accuracy                           0.49        41
   macro avg       0.65      0.48      0.51        41
weighted avg       0.64      0.49      0.53        41



In [120]:
## SVC
print(classification_report(y_test, svc_pipe.predict(x_test)))

              precision    recall  f1-score   support

           0       0.68      1.00      0.81        28
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         4

    accuracy                           0.68        41
   macro avg       0.23      0.33      0.27        41
weighted avg       0.47      0.68      0.55        41



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [122]:
## Random Forest
print(classification_report(y_test, rf_pipe.predict(x_test)))

              precision    recall  f1-score   support

           0       0.72      1.00      0.84        28
           1       0.00      0.00      0.00         9
           2       1.00      0.50      0.67         4

    accuracy                           0.73        41
   macro avg       0.57      0.50      0.50        41
weighted avg       0.59      0.73      0.64        41



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Error analysis

In [124]:
rnd_predictions = rf_pipe.predict(x_test)

In [125]:
df_err_analysis = pd.DataFrame(columns=['sentence', 'type', 'prediction'])
df_err_analysis['sentence'] = valid_df['sentences']
df_err_analysis['type'] = valid_df['type']
df_err_analysis['prediction'] = rnd_predictions
df_err_analysis

Unnamed: 0,sentence,type,prediction
0,the pharmacokinetics of tolbutamide in all vin...,NEGATIVE,0
1,".05) from that of controlled rats, however, tr...",POSITIVE,0
2,". results: the auc and t1/2 of midazolam, dext...",POSITIVE,0
3,". results: the auc and t1/2 of midazolam, dext...",POSITIVE,0
4,". results: the auc and t1/2 of midazolam, dext...",POSITIVE,0
5,". results: the auc and t1/2 of midazolam, dext...",POSITIVE,0
6,". results: the auc and t1/2 of midazolam, dext...",POSITIVE,0
7,". results: the auc and t1/2 of midazolam, dext...",POSITIVE,0
8,the reason of different therapeutic effects of...,POSITIVE,0
9,the reason of different therapeutic effects of...,POSITIVE,0


In [127]:
df_err_analysis.prediction = df_err_analysis.prediction.map(id2type)
df_err_analysis

Unnamed: 0,sentence,type,prediction
0,the pharmacokinetics of tolbutamide in all vin...,NEGATIVE,POSITIVE
1,".05) from that of controlled rats, however, tr...",POSITIVE,POSITIVE
2,". results: the auc and t1/2 of midazolam, dext...",POSITIVE,POSITIVE
3,". results: the auc and t1/2 of midazolam, dext...",POSITIVE,POSITIVE
4,". results: the auc and t1/2 of midazolam, dext...",POSITIVE,POSITIVE
5,". results: the auc and t1/2 of midazolam, dext...",POSITIVE,POSITIVE
6,". results: the auc and t1/2 of midazolam, dext...",POSITIVE,POSITIVE
7,". results: the auc and t1/2 of midazolam, dext...",POSITIVE,POSITIVE
8,the reason of different therapeutic effects of...,POSITIVE,POSITIVE
9,the reason of different therapeutic effects of...,POSITIVE,POSITIVE


In [136]:
true_vals = {
    label: df_err_analysis[(df_err_analysis['type'] == df_err_analysis['prediction']) & (df_err_analysis['type'] == label)].shape[0] for label in type2id.keys()
}
true_vals

{'POSITIVE': 28, 'SPECULATIVE': 0, 'NEGATIVE': 2}

In [137]:
false_vals = {
    label: df_err_analysis[(df_err_analysis['type'] != df_err_analysis['prediction']) & (df_err_analysis['type'] == label)].shape[0] for label in type2id.keys()
}
false_vals

{'POSITIVE': 0, 'SPECULATIVE': 9, 'NEGATIVE': 2}

In [138]:
accuracy = {
    label: (true_vals[label] / (true_vals[label] + false_vals[label])) * 100 for label in type2id.keys()
}

accuracy

{'POSITIVE': 100.0, 'SPECULATIVE': 0.0, 'NEGATIVE': 50.0}

In [146]:
df_err_analysis[(df_err_analysis["type"] != df_err_analysis["prediction"]) & (df_err_analysis["type"] == "SPECULATIVE")]

Unnamed: 0,sentence,type,prediction
27,this paper is a review of possible adverse int...,SPECULATIVE,POSITIVE
28,this study was designed to explore the impact ...,SPECULATIVE,POSITIVE
29,this study was designed to explore the impact ...,SPECULATIVE,POSITIVE
31,to evaluate whether ape or andrographolide aff...,SPECULATIVE,POSITIVE
35,warfarin was the most common drug (18 cases) a...,SPECULATIVE,POSITIVE
36,we set out to study the interactions of g. bil...,SPECULATIVE,POSITIVE
37,we set out to study the interactions of g. bil...,SPECULATIVE,POSITIVE
38,we studied potential interference of ginseng i...,SPECULATIVE,POSITIVE
39,we studied potential interference of ginseng i...,SPECULATIVE,POSITIVE


In [147]:
df_err_analysis[(df_err_analysis["type"] != df_err_analysis["prediction"]) & (df_err_analysis["type"] == "NEGATIVE")]

Unnamed: 0,sentence,type,prediction
0,the pharmacokinetics of tolbutamide in all vin...,NEGATIVE,POSITIVE
33,treatment of radix bupleuri decreased t1/2 and...,NEGATIVE,POSITIVE


In [150]:
df_err_analysis[(df_err_analysis["type"] == df_err_analysis["prediction"]) & (df_err_analysis["type"] == "NEGATIVE")]

Unnamed: 0,sentence,type,prediction
17,. conclusions: coadministration of g. biloba e...,NEGATIVE,NEGATIVE
18,. conclusions: coadministration of g. biloba e...,NEGATIVE,NEGATIVE
