In [1]:
! pip install transformers



In [2]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import pipeline

# Smoke test: classify one known protein sequence and print the prediction
# next to the expected class.
classifier = pipeline("text-classification", model="as-cle-bert/resistBERT")
true_label = "ACT beta-lactamase"
text = "MMMTKSLCCALLLSTSCSVLATPMSEKQLAEVVERTVTPLMKAQAIPGMAVAVIYEGQPHYFTFGKADVAANKPVTPQTLFELGSISKTFTGVLGGDAIARGEISLGDPVTKYWPELTGKQWQGIRMLDLATYTAGGLPLQVPDEVTDNASLLRFYQNWQPQWKPGTTRLYANASIGLFGALAVKPSGMSYEQAITTRVFKPLKLDHTWINVPKAEEAHYAWGYRDGKAVHVSPGMLDAEAYGVKTNVQDMASWVMVNMKPDSLQDNSLRKGLTLAQSRYWRVGAMYQGLGWEMLNWPVDAKTVVEGSDNKVALAPLPAREVNPPAPPVNASWVHKTGSTGGFGSYVAFIPEKQLGIVMLANKSYPNPARVEAAYRILSAL"
# The sequence is handed to the pipeline with its residues separated by single spaces.
text = " ".join(text)
results = classifier(text)
print(results)
print(true_label)

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[{'label': 'ACT beta-lactamase', 'score': 0.9020795226097107}]
ACT beta-lactamase


In [3]:
import json
import os

def df_from_listofdicts(listofdifcs: list):
    """Turn a list of records into a column-oriented dict.

    Args:
        listofdifcs: List of dicts. The keys of the first dict define the
            columns; every record must contain all of those keys.

    Returns:
        A dict mapping each key of the first record to the list of values
        found under that key, in record order. An empty input yields ``{}``.

    Raises:
        KeyError: If a later record lacks one of the first record's keys.
    """
    # Without this guard an empty list fails on the [0] lookup below.
    if not listofdifcs:
        return {}
    keys = list(listofdifcs[0].keys())
    return {key: [record[key] for record in listofdifcs] for key in keys}

def parse_jsonl(jsonf):
    """Parse JSON Lines content into a list of decoded objects.

    Args:
        jsonf: An iterable yielding one line at a time, e.g. an open text
            file handle (which is what the loading code in this notebook
            passes) or a list of strings.

    Returns:
        A list with one decoded JSON value per non-blank line, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    jsonlist = []
    for line in jsonf:
        # Blank lines (such as a trailing newline at end of file) carry no
        # record and would make json.loads raise.
        if not line.strip():
            continue
        jsonlist.append(json.loads(line))
    return jsonlist

# Maps each class name (the label strings stored in the test files and
# returned by the classifier) to its integer id.
# NOTE: despite the name, the direction is label -> id, not id -> label.
id2label = {"PDC beta-lactamase": 0, "CTX-M beta-lactamase": 1, "SHV beta-lactamase": 2, "CMY beta-lactamase": 3, "resistance-nodulation-cell division (RND) antibiotic efflux pump": 4, "major facilitator superfamily (MFS) antibiotic efflux pump": 5, "quinolone resistance protein (qnr)": 6, "IMP beta-lactamase": 7, "KPC beta-lactamase": 8, "ACT beta-lactamase": 9, "MCR phosphoethanolamine transferase": 10, "VIM beta-lactamase": 11}

# Load every file in the test directory into a column-oriented dict with
# "text" and "label" columns, keyed by the file's full path. Keys are inserted
# in os.listdir() order.
test_dir = "/content/drive/MyDrive/test_resistBERT"
tests = {}
for fname in os.listdir(test_dir):
    path = os.path.join(test_dir, fname)
    # The context manager closes the handle even if parsing raises.
    with open(path, "r") as jsonf:
        records = parse_jsonl(jsonf)
    split = df_from_listofdicts(records)
    # Replace each label name with its integer id via id2label.
    split["label"] = [id2label[label] for label in split["label"]]
    tests[path] = split

# Preview the first ten sequences and label ids of the first loaded split.
first_path = list(tests.keys())[0]
print(tests[first_path]["text"][:10])
print(tests[first_path]["label"][:10])

['MTLALVGEKIDRNRFTGVKVENSTFFNCDFSGTDLSGTEFIGCQFYDRESQKGCNFSRAILKDAIFKSCDLSMADFRNASALGIEIRHCRAQGSDFRGASFMNMITTRTWFCSAYITNTNLSYANFSKVVLEKCELWENRWMGTQVLGATFSGSDLSGGEFSSFDWRAANFTHCDLTNSELGDLDVRGVDLQGVKLDSYQASLILERLGIAVIG', 'MTLALVGEKIDRNRFTGEKVENSTFFNCDFSGADLSGTEFIGCQFYDRESQKGCNFSRAILKDAIFKSCDLSMADFRNVSALGIEIRHCRAQGADFRGASFMNMITTRTWFCSAYITNTNLSYANFSKAVLEKCELWENRWMGTQVLGATLSGSDLSGGEFSSFDWRTANFTHCDLTNSELGDLDIRGVDLQGVKLDNYQAALLMERLGIAVIG', 'MALALIGEKIDRNRFTGEKVENSTFFNCDFSGADLSGTEFIGCQFYDRESQKGCNFSRAILKDAIFKSCDLSMADFRNVSALGIEIRHCRAQGADFRGASFMNMITTRTWFCSAYITNTNLSYANFSKAVLEKCELWENRWMGTQVLGATLSGSDLSGGEFSSFDWRTANFTHCDLTNSELGDLDIRGVDLQGVKLDSYQAVLLMERLGIAVIG', 'MTLALVGEKIDRNRFTGEKVENSTFFNCDFSGADLSGTEFIGCQFYDRESQKGCNFSRAMLKDAIFKSCDLSMADFRNVSALGIEIRHCRAQGADFRGASFMNMITTRTWFCSAYITNTNLSYANFSKVVLEKCELWENRWMGTQVMGATFSGSDLSGGEFSTFDWRAANFTHCDLTNSELGDLDIRGVDLQGVKLDNYQASLLMERLGIAVIG', 'METYNHTYRHHNFSHKDLSDLTFTACTFIRSDFRRANLRDTTFVNCKFIEQGDIEGCHFDVADLHDASFQQCQLAMANFSNANCYGIEFRACDLKGANFSRTNFAHQVSNRMYFCSAFISGCNLSY

In [4]:
def pred_class(classifier, text: str, id2label: dict):
  """Classify one sequence and return the id of the best-scoring label.

  Args:
    classifier: Callable that takes a string and returns a list of dicts,
      each with a "label" and a "score" entry.
    text: Sequence to classify; its characters are joined with single spaces
      before being passed to ``classifier``.
    id2label: Dict mapping the label strings returned by ``classifier`` to
      the value that should be returned for them.

  Returns:
    ``id2label`` entry for the label with the highest score. On ties the
    earliest result wins.

  Raises:
    ValueError: If ``classifier`` returns no results.
    KeyError: If the winning label is missing from ``id2label``.
  """
  spaced = " ".join(text)
  predictions = classifier(spaced)
  # max() keeps the first maximal element, so ties resolve to the earliest result.
  best = max(predictions, key=lambda prediction: prediction["score"])
  return id2label[best["label"]]

In [5]:
# Score the classifier on one test split and print weighted metrics.
# The split is addressed by its path instead of by position in `tests`:
# `tests` is filled in os.listdir() order, which is arbitrary, so a
# positional index can point at a different file on another run.
test_path = os.path.join("/content/drive/MyDrive/test_resistBERT", "test_0.jsonl")
y_true = tests[test_path]["label"]
y_pred = [pred_class(classifier, text, id2label) for text in tests[test_path]["text"]]
# Only the classes present in this split's ground truth enter the weighted averages.
present_labels = list(set(y_true))
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, labels=present_labels, average='weighted')
precision = precision_score(y_true, y_pred, labels=present_labels, average='weighted')
recall = recall_score(y_true, y_pred, labels=present_labels, average='weighted')
print(f"TEST ON {test_path}\n-Accuracy: {accuracy:.4f}\n-f1 score: {f1:.4f}\n-Precision: {precision:.4f}\n-Recall: {recall:.4f}")

TEST ON /content/drive/MyDrive/test_resistBERT/test_0.jsonl
-Accuracy: 1.0000
-f1 score: 1.0000
-Precision: 1.0000
-Recall: 1.0000


In [6]:
# Score the classifier on one test split and print weighted metrics.
# The split is addressed by its path instead of by position in `tests`:
# `tests` is filled in os.listdir() order, which is arbitrary, so a
# positional index can point at a different file on another run.
test_path = os.path.join("/content/drive/MyDrive/test_resistBERT", "test_1.jsonl")
y_true = tests[test_path]["label"]
y_pred = [pred_class(classifier, text, id2label) for text in tests[test_path]["text"]]
# Only the classes present in this split's ground truth enter the weighted averages.
present_labels = list(set(y_true))
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, labels=present_labels, average='weighted')
precision = precision_score(y_true, y_pred, labels=present_labels, average='weighted')
recall = recall_score(y_true, y_pred, labels=present_labels, average='weighted')
print(f"TEST ON {test_path}\n-Accuracy: {accuracy:.4f}\n-f1 score: {f1:.4f}\n-Precision: {precision:.4f}\n-Recall: {recall:.4f}")

TEST ON /content/drive/MyDrive/test_resistBERT/test_1.jsonl
-Accuracy: 0.5000
-f1 score: 0.3333
-Precision: 0.2500
-Recall: 0.5000


  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Score the classifier on one test split and print weighted metrics.
# The split is addressed by its path instead of by position in `tests`:
# `tests` is filled in os.listdir() order, which is arbitrary, so a
# positional index can point at a different file on another run.
test_path = os.path.join("/content/drive/MyDrive/test_resistBERT", "test_2.jsonl")
y_true = tests[test_path]["label"]
y_pred = [pred_class(classifier, text, id2label) for text in tests[test_path]["text"]]
# Only the classes present in this split's ground truth enter the weighted averages.
present_labels = list(set(y_true))
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, labels=present_labels, average='weighted')
precision = precision_score(y_true, y_pred, labels=present_labels, average='weighted')
recall = recall_score(y_true, y_pred, labels=present_labels, average='weighted')
print(f"TEST ON {test_path}\n-Accuracy: {accuracy:.4f}\n-f1 score: {f1:.4f}\n-Precision: {precision:.4f}\n-Recall: {recall:.4f}")

TEST ON /content/drive/MyDrive/test_resistBERT/test_2.jsonl
-Accuracy: 1.0000
-f1 score: 1.0000
-Precision: 1.0000
-Recall: 1.0000


In [8]:
# Score the classifier on one test split and print weighted metrics.
# The split is addressed by its path instead of by position in `tests`:
# `tests` is filled in os.listdir() order, which is arbitrary, so a
# positional index can point at a different file on another run.
test_path = os.path.join("/content/drive/MyDrive/test_resistBERT", "test_3.jsonl")
y_true = tests[test_path]["label"]
y_pred = [pred_class(classifier, text, id2label) for text in tests[test_path]["text"]]
# Only the classes present in this split's ground truth enter the weighted averages.
present_labels = list(set(y_true))
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, labels=present_labels, average='weighted')
precision = precision_score(y_true, y_pred, labels=present_labels, average='weighted')
recall = recall_score(y_true, y_pred, labels=present_labels, average='weighted')
print(f"TEST ON {test_path}\n-Accuracy: {accuracy:.4f}\n-f1 score: {f1:.4f}\n-Precision: {precision:.4f}\n-Recall: {recall:.4f}")

TEST ON /content/drive/MyDrive/test_resistBERT/test_3.jsonl
-Accuracy: 0.6667
-f1 score: 0.6667
-Precision: 0.6667
-Recall: 0.6667


  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Score the classifier on one test split and print weighted metrics.
# The split is addressed by its path instead of by position in `tests`:
# `tests` is filled in os.listdir() order, which is arbitrary, so a
# positional index can point at a different file on another run.
test_path = os.path.join("/content/drive/MyDrive/test_resistBERT", "test_4.jsonl")
y_true = tests[test_path]["label"]
y_pred = [pred_class(classifier, text, id2label) for text in tests[test_path]["text"]]
# Only the classes present in this split's ground truth enter the weighted averages.
present_labels = list(set(y_true))
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, labels=present_labels, average='weighted')
precision = precision_score(y_true, y_pred, labels=present_labels, average='weighted')
recall = recall_score(y_true, y_pred, labels=present_labels, average='weighted')
print(f"TEST ON {test_path}\n-Accuracy: {accuracy:.4f}\n-f1 score: {f1:.4f}\n-Precision: {precision:.4f}\n-Recall: {recall:.4f}")

TEST ON /content/drive/MyDrive/test_resistBERT/test_4.jsonl
-Accuracy: 1.0000
-f1 score: 1.0000
-Precision: 1.0000
-Recall: 1.0000


In [10]:
# Score the classifier on one test split and print weighted metrics.
# The split is addressed by its path instead of by position in `tests`:
# `tests` is filled in os.listdir() order, which is arbitrary, so a
# positional index can point at a different file on another run.
test_path = os.path.join("/content/drive/MyDrive/test_resistBERT", "test_5.jsonl")
y_true = tests[test_path]["label"]
y_pred = [pred_class(classifier, text, id2label) for text in tests[test_path]["text"]]
# Only the classes present in this split's ground truth enter the weighted averages.
present_labels = list(set(y_true))
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, labels=present_labels, average='weighted')
precision = precision_score(y_true, y_pred, labels=present_labels, average='weighted')
recall = recall_score(y_true, y_pred, labels=present_labels, average='weighted')
print(f"TEST ON {test_path}\n-Accuracy: {accuracy:.4f}\n-f1 score: {f1:.4f}\n-Precision: {precision:.4f}\n-Recall: {recall:.4f}")

TEST ON /content/drive/MyDrive/test_resistBERT/test_5.jsonl
-Accuracy: 0.5000
-f1 score: 0.5000
-Precision: 0.5000
-Recall: 0.5000


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Score the classifier on one test split and print weighted metrics.
# The split is addressed by its path instead of by position in `tests`:
# `tests` is filled in os.listdir() order, which is arbitrary, so a
# positional index can point at a different file on another run.
test_path = os.path.join("/content/drive/MyDrive/test_resistBERT", "test_6.jsonl")
y_true = tests[test_path]["label"]
y_pred = [pred_class(classifier, text, id2label) for text in tests[test_path]["text"]]
# Only the classes present in this split's ground truth enter the weighted averages.
present_labels = list(set(y_true))
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, labels=present_labels, average='weighted')
precision = precision_score(y_true, y_pred, labels=present_labels, average='weighted')
recall = recall_score(y_true, y_pred, labels=present_labels, average='weighted')
print(f"TEST ON {test_path}\n-Accuracy: {accuracy:.4f}\n-f1 score: {f1:.4f}\n-Precision: {precision:.4f}\n-Recall: {recall:.4f}")

TEST ON /content/drive/MyDrive/test_resistBERT/test_6.jsonl
-Accuracy: 1.0000
-f1 score: 1.0000
-Precision: 1.0000
-Recall: 1.0000


In [12]:
# Score the classifier on one test split and print weighted metrics.
# The split is addressed by its path instead of by position in `tests`:
# `tests` is filled in os.listdir() order, which is arbitrary, so a
# positional index can point at a different file on another run.
test_path = os.path.join("/content/drive/MyDrive/test_resistBERT", "test_7.jsonl")
y_true = tests[test_path]["label"]
y_pred = [pred_class(classifier, text, id2label) for text in tests[test_path]["text"]]
# Only the classes present in this split's ground truth enter the weighted averages.
present_labels = list(set(y_true))
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, labels=present_labels, average='weighted')
precision = precision_score(y_true, y_pred, labels=present_labels, average='weighted')
recall = recall_score(y_true, y_pred, labels=present_labels, average='weighted')
print(f"TEST ON {test_path}\n-Accuracy: {accuracy:.4f}\n-f1 score: {f1:.4f}\n-Precision: {precision:.4f}\n-Recall: {recall:.4f}")

TEST ON /content/drive/MyDrive/test_resistBERT/test_7.jsonl
-Accuracy: 1.0000
-f1 score: 1.0000
-Precision: 1.0000
-Recall: 1.0000


In [13]:
# Score the classifier on one test split and print weighted metrics.
# The split is addressed by its path instead of by position in `tests`:
# `tests` is filled in os.listdir() order, which is arbitrary, so a
# positional index can point at a different file on another run.
test_path = os.path.join("/content/drive/MyDrive/test_resistBERT", "test_8.jsonl")
y_true = tests[test_path]["label"]
y_pred = [pred_class(classifier, text, id2label) for text in tests[test_path]["text"]]
# Only the classes present in this split's ground truth enter the weighted averages.
present_labels = list(set(y_true))
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, labels=present_labels, average='weighted')
precision = precision_score(y_true, y_pred, labels=present_labels, average='weighted')
recall = recall_score(y_true, y_pred, labels=present_labels, average='weighted')
print(f"TEST ON {test_path}\n-Accuracy: {accuracy:.4f}\n-f1 score: {f1:.4f}\n-Precision: {precision:.4f}\n-Recall: {recall:.4f}")

TEST ON /content/drive/MyDrive/test_resistBERT/test_8.jsonl
-Accuracy: 1.0000
-f1 score: 1.0000
-Precision: 1.0000
-Recall: 1.0000


In [14]:
# Score the classifier on one test split and print weighted metrics.
# The split is addressed by its path instead of by position in `tests`:
# `tests` is filled in os.listdir() order, which is arbitrary, so a
# positional index can point at a different file on another run.
test_path = os.path.join("/content/drive/MyDrive/test_resistBERT", "test_9.jsonl")
y_true = tests[test_path]["label"]
y_pred = [pred_class(classifier, text, id2label) for text in tests[test_path]["text"]]
# Only the classes present in this split's ground truth enter the weighted averages.
present_labels = list(set(y_true))
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, labels=present_labels, average='weighted')
precision = precision_score(y_true, y_pred, labels=present_labels, average='weighted')
recall = recall_score(y_true, y_pred, labels=present_labels, average='weighted')
print(f"TEST ON {test_path}\n-Accuracy: {accuracy:.4f}\n-f1 score: {f1:.4f}\n-Precision: {precision:.4f}\n-Recall: {recall:.4f}")

TEST ON /content/drive/MyDrive/test_resistBERT/test_9.jsonl
-Accuracy: 1.0000
-f1 score: 1.0000
-Precision: 1.0000
-Recall: 1.0000
