In [33]:
from datetime import datetime
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_validate

from common.logging import create_logger
from datasets import single_label_multiclass_annotated_study_design, enrich_annotations, annotations_with_specter_embeddings
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:

# Resolve the input locations and build the annotation frames used by the cells below.
# The data root defaults to the original local path but can be overridden by setting
# the CORD19_DATA_DIR environment variable, so the notebook runs on other machines
# without editing this cell.
data_dir = Path(os.environ.get('CORD19_DATA_DIR', '/media/wwymak/Storage/coronawhy/nlp_datasets'))
annotations_filepath = data_dir / 'cord19_study_design_labelled' / 'design.csv'
metadata_filepath = data_dir / 'metadata.csv.zip'
processed_article_folder = data_dir / 'v8' / 'processed_text'
specter_embeddings_filepath = data_dir / 'cord_19_specter_embeddings_4_17.csv.zip'

# annotations_raw is built from the annotation CSV plus the metadata archive; it is then
# passed, together with the SPECTER embeddings archive, to annotations_with_specter_embeddings.
annotations_raw = single_label_multiclass_annotated_study_design(annotations_filepath, metadata_filepath)
annotations_with_specter_df = annotations_with_specter_embeddings(annotations_raw, specter_embeddings_filepath)

In [9]:
# Show the column index of annotations_with_specter_df to see which columns are
# identifiers/text/labels and which are the numerically named feature columns.
annotations_with_specter_df.columns

Index(['sha', 'cord_uid', 'label', 'title', 'abstract', 'label_string', '0',
       '1', '2', '3',
       ...
       '758', '759', '760', '761', '762', '763', '764', '765', '766', '767'],
      dtype='object', length=774)

In [28]:
# Display the fully qualified class of a LogisticRegression instance.
type(LogisticRegression())

sklearn.linear_model._logistic.LogisticRegression

In [23]:
# Baseline: 5-fold cross-validation of logistic regression, scored on accuracy and macro-F1.
# Everything except the identifier/text/label columns is used as model input.
non_feature_columns = ['sha', 'cord_uid', 'label', 'title', 'abstract', 'label_string']
specter_features = annotations_with_specter_df.drop(columns=non_feature_columns)
study_design_labels = annotations_with_specter_df.label
cv_scores = cross_validate(
    LogisticRegression(max_iter=5000),
    specter_features,
    study_design_labels,
    cv=5,
    scoring=('accuracy', 'f1_macro'),
)



In [24]:
# Display the dict returned by cross_validate (per-fold timings and test scores).
cv_scores

{'fit_time': array([1.05824304, 4.13299084, 3.37469125, 1.27372766, 0.9546032 ]),
 'score_time': array([0.00161362, 0.00162554, 0.00158596, 0.00160217, 0.00158358]),
 'test_accuracy': array([0.75      , 0.75428571, 0.75428571, 0.73714286, 0.70285714]),
 'test_f1_macro': array([0.69081523, 0.72047301, 0.74266748, 0.69178386, 0.56877695])}

In [34]:
# Compare several classifiers with 5-fold cross-validation and print the mean
# macro-F1 and accuracy of each.
# The tree ensembles are seeded so that re-running the cell reproduces the same scores;
# NOTE(review): any previously recorded output for this cell was produced without a seed.
RANDOM_STATE = 42
non_feature_columns = ['sha', 'cord_uid', 'label', 'title', 'abstract', 'label_string']
# Build the feature matrix and label vector once instead of on every loop iteration.
specter_features = annotations_with_specter_df.drop(columns=non_feature_columns)
study_design_labels = annotations_with_specter_df.label
cv_scoring = ('accuracy', 'f1_macro')

models = [
    ExtraTreesClassifier(n_estimators=500, random_state=RANDOM_STATE),
    RandomForestClassifier(n_estimators=500, random_state=RANDOM_STATE),
    LogisticRegression(max_iter=5000),
    svm.SVC(kernel='linear'),
    # NOTE(review): gamma=0.7 is hand-picked; confirm it suits the scale of these features.
    svm.SVC(kernel='rbf', gamma=0.7),
]
for model in models:
    # cv_scores is reassigned on every iteration; after the loop it holds the last model's scores.
    cv_scores = cross_validate(model, specter_features, study_design_labels, cv=5, scoring=cv_scoring)
    print(model.__class__, f"f1 macro avg: {np.mean(cv_scores['test_f1_macro'])}", f"accuracy: {np.mean(cv_scores['test_accuracy'])}")
          



<class 'sklearn.ensemble._forest.ExtraTreesClassifier'> f1 macro avg: 0.518755261778402 accuracy: 0.6506363636363636




<class 'sklearn.ensemble._forest.RandomForestClassifier'> f1 macro avg: 0.4890067388382688 accuracy: 0.6483701298701299




<class 'sklearn.linear_model._logistic.LogisticRegression'> f1 macro avg: 0.6829033061010075 accuracy: 0.7397142857142858




<class 'sklearn.svm._classes.SVC'> f1 macro avg: 0.7032135565882054 accuracy: 0.7397012987012987




<class 'sklearn.svm._classes.SVC'> f1 macro avg: 0.04520401096163428 accuracy: 0.2762532467532467


In [35]:
# final model
# Fit logistic regression on all rows of annotations_with_specter_df and persist it with joblib.
# joblib is imported in the notebook's imports cell rather than mid-notebook.
non_feature_columns = ['sha', 'cord_uid', 'label', 'title', 'abstract', 'label_string']
specter_features = annotations_with_specter_df.drop(columns=non_feature_columns)
clf = LogisticRegression(max_iter=5000).fit(specter_features, annotations_with_specter_df.label)
# The file name (including its spelling) is kept unchanged so anything that loads this path still works.
# joblib.dump is the last expression, so the list of written paths is shown as the cell output.
joblib.dump(clf, data_dir/'logistic_specter_embedddings_2may.pkl', compress=3)

['/media/wwymak/Storage/coronawhy/nlp_datasets/logistic_specter_embedddings_2may.pkl']