In [1]:
from models.tf_idf import TFIDFClassifier
from data_loader import AnnotationDataLoader
from model_evaluation import ModelEvalWrapper, ModelEvaluator
from model_evaluation import ModelEvalWrapper, ModelEvaluator, ModelEvalWrapperInterface, ModelReport, ReportsComparison


In [2]:
# Build the annotation data loader and fetch the test set returned by
# get_last_test_set_no_overlap(); `final_test_set` is later handed to every
# ModelEvaluator in this notebook.
dataloader = AnnotationDataLoader()
final_test_set = dataloader.get_last_test_set_no_overlap()

In [3]:
# NOTE(review): `tfidf` is not referenced again in the visible cells; the model
# that gets evaluated is the one loaded from disk below — confirm whether this
# default-constructed instance is still needed.
tfidf = TFIDFClassifier()
# NOTE(review): the ".pkl" extension suggests a pickle file; unpickling can run
# arbitrary code, so only load files from a trusted source — confirm the format.
tfidf_model = TFIDFClassifier.load_from_file("models/tf_idf/ComplementNaiveBayes_TF-IDF.pkl")
# Wrap the loaded model for evaluation; the second argument is presumably the
# name shown in reports — TODO confirm against ModelEvalWrapper.
tfidf_model_wrapper = ModelEvalWrapper(tfidf_model, "ComplementNaiveBayes_TF-IDF")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
from models.fine_tuned_BERT_models.classifier import BERTBasedModel
from data_loader import AnnotationDataLoader

from train_llm.config import MODELS_DIR

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# NOTE(review): `dataloader` was already created in an earlier cell; this rebinds
# the name to a fresh AnnotationDataLoader instance.
dataloader = AnnotationDataLoader()

# Location of the fine-tuned classifier; both values are passed to BERTBasedModel
# in the next cell.
# presumably a Hugging Face Hub repo id and a subfolder inside it — TODO confirm
model_link = "Mila-MP/inPhormer-LLM-classifier"
subfolder = "final_llm"

In [6]:
# Instantiate the BERT-based classifier from `model_link` / `subfolder`.
llm = BERTBasedModel(model_dir=model_link, subfolder=subfolder)
# Wrap it for evaluation; the string is presumably the label shown in reports
# (it describes the training setup) — TODO confirm against ModelEvalWrapper.
llm_wrapper = ModelEvalWrapper(llm, "LLM, lr 2e-5, 5 epochs, no label smoothing")

In [7]:
from model_evaluation import ModelEvalWrapper, ModelEvaluator, ModelEvalWrapperInterface
from models.embedder_with_classification_head import EmbedderClassifier
from data_loader import AnnotationDataLoader
from models.embedder_with_classification_head.embedders import (
    SentenceTransformerEmbedder,
)
from models.embedder_with_classification_head.classification_heads import (
    GenericSklearnCalssifier,
)
from sklearn.neural_network import MLPClassifier


In [8]:
# Hyperparameters recorded for reporting only: the dict is stringified into
# `model_info` below and is NOT passed to `load_model()`.
# presumably these mirror the settings the saved MLP head was trained with — TODO confirm
config = {
    "solver": "adam",
    "random_state": 42,
    "max_iter": 500,
    "learning_rate_init": 0.0001,
    "hidden_layer_sizes": (64, 32),
    "early_stopping": False,
    "alpha": 0.01,
    "activation": "relu",
}

# Load the saved model using load_model()'s defaults, then attach the
# descriptive metadata (info text and title) to the instance.
embcls_model = EmbedderClassifier.load_model()
embcls_model.model_info = str(config)
embcls_model.model_title = "Pretrained embedder"

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [10]:
from models.regex import RegexModel

# Instantiate the regex baseline model (no data loader is created here).
regex_m = RegexModel()

# Wrap the model so it can be handed to ModelEvaluator in a later cell;
# no evaluation happens in this cell.
regex_m_wrapper = ModelEvalWrapper(regex_m, "Regex", "The baseline regex classifer")

In [11]:
# One evaluator per model, all bound to the same test set. Construction order
# matches the order of the names on the left-hand side.
# NOTE(review): `embcls_model` is passed directly rather than through a
# ModelEvalWrapper like the other three — presumably EmbedderClassifier already
# provides the interface ModelEvaluator expects; confirm.
model_eval_tfidf, model_eval_llm, model_eval_embcls, model_eval_regex = (
    ModelEvaluator(candidate, final_test_set)
    for candidate in (tfidf_model_wrapper, llm_wrapper, embcls_model, regex_m_wrapper)
)


In [12]:
# Generate one report per evaluator, in the same order as the evaluators were
# built: TF-IDF, LLM, embedder+classifier, regex.
tf_idf_report, llm_report, clsemb_report, regex_report = (
    evaluator.generate_report()
    for evaluator in (
        model_eval_tfidf,
        model_eval_llm,
        model_eval_embcls,
        model_eval_regex,
    )
)

In [13]:
# Collect the four per-model reports into a single comparison object.
cmp = ReportsComparison([tf_idf_report, llm_report, clsemb_report, regex_report])

In [14]:
# Render the comparison of the reports collected in `cmp`.
cmp.show_comparison()

In [15]:
import requests
# Shared HTTP session used by get_protein_name_string: every "https://" request
# made through it goes via an adapter configured with max_retries=10.
sess = requests.Session()
adapter = requests.adapters.HTTPAdapter(max_retries = 10)
sess.mount("https://", adapter)

def get_protein_name_string(uniprot_id, timeout=30):
    """Return the protein name(s) UniProtKB search reports for ``uniprot_id``.

    Queries the UniProtKB search endpoint for the ``protein_name`` field in TSV
    format through the shared ``sess`` session, drops the TSV header row and
    joins the remaining rows with ``";"``.

    Args:
        uniprot_id: Query string sent to UniProt (an accession in this notebook).
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The result rows joined by ``";"``; an empty string when the search
        returns no rows.

    Raises:
        requests.HTTPError: If UniProt answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # note: unsure if protein_name is a required field, if we get errors, look into this!
    # NOTE(review): an earlier comment mentioned falling back to the UniParc
    # accession for obsolete entries; no such fallback is implemented here.
    response = sess.get(
        "https://rest.uniprot.org/uniprotkb/search",
        # Let requests build the query string so the id is always URL-encoded.
        params={"query": uniprot_id, "fields": "protein_name", "format": "tsv"},
        timeout=timeout,
    )
    response.raise_for_status()
    # splitlines() copes with a missing trailing newline; the previous
    # split('\n')[1:-1] would silently drop the last row in that case.
    rows = response.text.splitlines()
    return ";".join(rows[1:])

# 30 UniProt accessions, ten per row; the rows line up with the `prop`, `low`
# and `un` slices taken below.
uniprotIDs = ["A0A1B0GTW7","A5D8V7","A2AJK6","A2AKQ0","A1A6M1","A2AVM0","A0A0G2JTY4","A0A0G2JZ79","A0JN40","A0JN61",
                "Q96LX7","Q8N8F7","Q93YR3","Q67XQ0","Q9D3L0","Q6NZQ0","Q55ED4","P11022","P32857","Q02981",
                "A0A5S9XQF2","Q9FGG0","B9FA11","B7EAC4","A0A0B4KH77","Q9VY08","Q54K29","Q54H65","B3DJF1","F1QHG6"]

# One lookup per accession, performed sequentially; each call issues an HTTP request.
annotations = [get_protein_name_string(uniprot_id) for uniprot_id in uniprotIDs]
# Split the results into three groups of ten, in input order.
# presumably "proper", "low-information" and "uninformative" annotation examples — TODO confirm
prop = annotations[:10]
low = annotations[10:20]
un = annotations[20:]

In [16]:
# Display the middle ten annotations (the same slice that is bound to `low`).
annotations[10:20]

['Coiled-coil domain-containing protein 17',
 'Leucine-rich single-pass membrane protein 1',
 'FAM10 family protein At4g22670',
 'DUF21 domain-containing protein At4g14240 (CBS domain-containing protein CBSDUF1)',
 'Membrane protein FAM174A (Transmembrane protein 157)',
 'Cilia- and flagella-associated protein 119 (Coiled-coil domain-containing protein 189)',
 'NKAP family protein',
 'Membrane protein P8A7',
 'Membrane protein PTM1',
 'ABC1 family protein YPL109C, mitochondrial']

In [17]:
# Display the last ten annotations (the same slice that is bound to `un`).
annotations[20:]

['(thale cress) hypothetical protein',
 'Similarity to unknown protein',
 'Uncharacterized protein',
 '(RAP Annotation release2) Hypothetical protein',
 'Uncharacterized protein, isoform D (EC 3.4.11.-)',
 'FI02019p (Uncharacterized protein, isoform A) (Uncharacterized protein, isoform B) (Uncharacterized protein, isoform C (EC 1.-.-.-, EC 1.1.-.-))',
 'Uncharacterized protein',
 'Uncharacterized protein',
 'Hypothetical LOC561073 (Uncharacterized protein LOC561073 precursor) (Zgc:194981)',
 'Im:7138535 (Uncharacterized protein LOC797998)']

In [18]:
# Print the classifier's predictions for each annotation group, one line per
# group, in the order: un, low, prop.
print(*map(llm.predict, (un, low, prop)), sep="\n")

[2 0 0 0 0 0 0 0 0 0]
[2 1 1 1 1 2 2 2 2 2]
[2 2 2 2 2 2 2 2 2 2]


In [19]:
from faker import Faker

# Seed Faker's shared random generator so the sentences are reproducible.
Faker.seed(42)
# Build the generator once; constructing Faker() on every iteration repeats
# loop-invariant setup work without affecting the seeded sequence.
fake = Faker()
# 30 nonsense sentences of roughly five words each, used as out-of-domain input.
random_sentences = [
    fake.sentence(nb_words=5, variable_nb_words=True) for _ in range(30)
]
cleaned_random = [s[:-1] for s in random_sentences]  # remove final period


In [20]:
# Display the generated sentences after the trailing period was removed.
cleaned_random

['Agent every development',
 'Quality throughout beautiful',
 'All behavior discussion own night',
 'Information last everything thank serve civil',
 'Future choice whatever from behavior',
 'Grow gas enough',
 'Role movie win',
 'Bad fall pick those gun',
 'Relate animal direction eye',
 'Talk term herself',
 'Player half have decide environment',
 'Participant commercial rock',
 'Left establish understand read',
 'Range successful simply director',
 'Source husband at',
 'Then fire pretty how trip learn',
 'Cause seat much section',
 'Today human despite young meeting',
 'Sense technology check',
 'Society instead as high',
 'More wife team activity result',
 'Seem shoulder future fall citizen about',
 'Will seven medical blood',
 'Fear police participant check several much single',
 'Truth out major born guy world',
 'Dream drive note bad',
 'Staff within mouth call process water',
 'Enter their institution deep',
 'Sense ready require human public health',
 'Later easy ask again ne

In [21]:
# Run the classifier on the random, non-protein sentences and display the
# predicted labels.
llm.predict(cleaned_random)

array([0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 2, 0])