In [1]:
from models.tf_idf import TFIDFClassifier
from data_loader import AnnotationDataLoader

In [2]:
# Build a TF-IDF classifier with its default estimator, plus the loader that
# supplies the annotation data used throughout the notebook.
tfidf = TFIDFClassifier()
dataloader = AnnotationDataLoader()

# Fit on whatever the loader exposes through get_train_all().
train_all = dataloader.get_train_all()
tfidf.fit(train_all)

In [3]:
# Persist the fitted default model to disk.
cnb_model_path = "models/tf_idf/ComplementNaiveBayes_TF-IDF.pkl"
tfidf.save_to_file(cnb_model_path)

In [4]:
# Round-trip check: restore the model from the file written by the previous cell.
# NOTE(review): the .pkl suffix suggests pickle — only load files you trust.
cnb_model_path = "models/tf_idf/ComplementNaiveBayes_TF-IDF.pkl"
new = TFIDFClassifier.load_from_file(cnb_model_path)

In [5]:
# Score the reloaded model on the split returned by get_test().
# NOTE(review): a fresh loader is created here instead of reusing the instance
# the model was trained with — confirm both instances yield the same split.
dataloader = AnnotationDataLoader()
df = dataloader.get_test()

# The stray `df["protein_annotation"]` expression that used to sit here was a
# no-op (only a cell's last expression is displayed), so it has been removed.
# probabilities=True asks predict() for scores rather than labels; the recorded
# output below shows three values per row.
print(new.predict(df["protein_annotation"], probabilities=True))

[[1.77598757e-06 1.99901866e-06 9.99996225e-01]
 [7.54897237e-04 1.04629703e-03 9.98198806e-01]
 [9.99527398e-01 4.53029986e-04 1.95718574e-05]
 ...
 [1.37234170e-04 1.48670808e-04 9.99714095e-01]
 [3.51192432e-03 5.59269694e-03 9.90895379e-01]
 [3.52429788e-04 3.98664986e-04 9.99248905e-01]]


In [6]:
# Display the hyperparameters carried by the reloaded model, to confirm they
# survived the save/load round trip.
new.classifier_parameters

{'alpha': 0.002783,
 'class_prior': None,
 'fit_prior': True,
 'force_alpha': True,
 'norm': False}

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [8]:
# Fixed seed so the stochastic estimators below produce the same fit (and the
# same pickled models) on every Restart & Run All.
SEED = 42

# NOTE(review): the hyperparameter values below look like the result of an
# earlier search; their provenance is not shown in this notebook.

# Multinomial naive Bayes has no random component, so it takes no seed.
mnb = MultinomialNB(alpha=0.001)

# Even with bootstrap=False the forest is random: each split samples a
# 'log2'-sized subset of features.
rf = RandomForestClassifier(
    bootstrap=False,
    class_weight='balanced',
    max_depth=20,
    max_features='log2',
    min_samples_leaf=2,
    min_samples_split=18,
    n_estimators=403,
    n_jobs=-1,
    random_state=SEED,
)

# probability=True makes SVC fit an internally cross-validated calibration,
# which shuffles data and therefore depends on random_state.
svc = SVC(
    C=2.1544,
    class_weight='balanced',
    gamma='scale',
    kernel='rbf',
    probability=True,
    random_state=SEED,
)

# The 'saga' solver shuffles samples, so it is seeded as well.
lr = LogisticRegression(
    solver="saga",
    max_iter=1000,
    penalty='l2',
    C=17.0798,
    class_weight=None,
    random_state=SEED,
)

In [9]:
# Wrap each raw estimator in the TF-IDF pipeline class.
# The names are rebound from "estimator" to "wrapper", so a second run of this
# cell would otherwise nest a TFIDFClassifier inside another TFIDFClassifier.
# The isinstance guard leaves already-wrapped models untouched, which makes the
# cell safe to re-run.
mnb, rf, svc, lr = (
    model if isinstance(model, TFIDFClassifier) else TFIDFClassifier(classifier=model)
    for model in (mnb, rf, svc, lr)
)

In [10]:
# Train every wrapped model on the output of get_train_all(), in the same
# order as before. The loader is queried once per model, exactly as the
# original four statements did.
for model in (mnb, rf, svc, lr):
    model.fit(dataloader.get_train_all())

In [12]:
# Persist each fitted model next to the default one under models/tf_idf/.
models_and_paths = (
    (mnb, "models/tf_idf/MultinomialNaiveBayes_TF-IDF.pkl"),
    (rf, "models/tf_idf/RandomForest_TF-IDF.pkl"),
    (svc, "models/tf_idf/SupportVectorClassification_TF-IDF.pkl"),
    (lr, "models/tf_idf/LogisticRegression_TF-IDF.pkl"),
)
for model, path in models_and_paths:
    model.save_to_file(path)


In [13]:
# Restore the four models from the files written above, rebinding the same
# names so the cells that follow work on the deserialised objects.
# NOTE(review): the .pkl suffix suggests pickle — only load files you trust.
mnb, rf, svc, lr = map(
    TFIDFClassifier.load_from_file,
    (
        "models/tf_idf/MultinomialNaiveBayes_TF-IDF.pkl",
        "models/tf_idf/RandomForest_TF-IDF.pkl",
        "models/tf_idf/SupportVectorClassification_TF-IDF.pkl",
        "models/tf_idf/LogisticRegression_TF-IDF.pkl",
    ),
)

In [14]:
# Predictions of the reloaded multinomial NB model for the test annotations.
test_annotations = df["protein_annotation"]
mnb.predict(test_annotations)

array([2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 0, 0, 2, 2, 2, 0, 2,
       2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1,
       2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2,
       2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2,