In [1]:
# Enable the autoreload extension in mode 2 so edited modules (e.g. the
# project's `src` package) are re-imported before each cell is executed.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import os
import pandas as pd
import numpy as np

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from metric_learn import NCA
from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.svm import SVC

from gensim.models import FastText
from gensim.test.utils import common_texts
from tpot import TPOTClassifier

# Suppress every warning for the rest of the session (this also hides
# deprecation and convergence warnings from the libraries used below).
import warnings
warnings.filterwarnings("ignore")

# Put the directory two levels above the notebook on the import path;
# presumably this is the repository root that contains the `src` package
# imported on the next line. Must run before that import.
import sys
sys.path.append(f'../../')

from src.data.sentence_vectorizer import SentenceVectorizer

In [None]:
# Cross-corpus evaluation.
#
# For every vectorizer offered by SentenceVectorizer, train a polynomial-kernel
# SVC on the train split of each corpus and score it on the test split of every
# corpus. The outcome is one row per (vectorizer, training corpus) with one
# score column per test corpus.

# Corpora to evaluate, in a fixed order so rows and columns are reproducible
# (os.listdir order is arbitrary and must not drive the layout of the table).
DATASETS = ['smm4h21', 'smm4h17', 'psytar', 'cadec']
RESULT_COLUMNS = ['vectorizer', 'train_model'] + DATASETS

# Map every distinct MedDRA code to a dense integer class id.
labels = pd.read_csv('../../data/interim/meddra_codes_terms_synonims.csv')
labels = labels['CODE']
meddra_labels = {v: k for k, v in enumerate(labels.unique())}


def _to_features_and_targets(vectorized, label_index):
    """Split a vectorized corpus frame into a feature matrix and class ids.

    Parameters
    ----------
    vectorized : pandas.DataFrame
        Frame holding a ``term_vec`` column (one vector per row) and a
        ``code`` column.
    label_index : dict
        Mapping from ``code`` value to integer class id.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        One feature column per vector component, and the integer class ids.

    Raises
    ------
    KeyError
        If a ``code`` value is not present in ``label_index``.
    """
    features = pd.DataFrame([pd.Series(vec) for vec in vectorized['term_vec']])
    targets = vectorized['code'].apply(lambda code: int(label_index[code]))
    return features, targets


SIZE = 100  # not referenced in this cell; kept in case another cell reads it
path = '../../data/interim/'

# Only corpora whose folder actually exists take part, so a missing corpus
# shrinks the table instead of breaking its construction.
available_datasets = [
    name for name in DATASETS if os.path.isdir(os.path.join(path, name))
]

sv = SentenceVectorizer()

# One dict per (vectorizer, training corpus). Each dict is appended as soon as
# the model is fitted and filled in place, so after an interrupted run
# `pd.DataFrame(records, columns=RESULT_COLUMNS)` still yields a valid table
# (scores that were never computed show up as NaN).
records = []

for vectorizer_name in sv.get_availables_vectorizers():
    print(f"vectorizer: {vectorizer_name}")

    for name_train in available_datasets:
        print(f"work with {name_train}")

        train = pd.read_csv(os.path.join(path, name_train, 'train.csv'))
        train = sv.vectorize(train, vectorizer_name=vectorizer_name)
        X_train, y_train = _to_features_and_targets(train, meddra_labels)

        # Alternative tried earlier: make_pipeline(NCA(), SVC(gamma='scale')).
        clf = SVC(kernel='poly', gamma='scale')
        clf.fit(X_train, y_train)

        row = {'vectorizer': vectorizer_name, 'train_model': name_train}
        records.append(row)

        for name_test in available_datasets:
            test_folder = os.path.join(path, name_test)
            # vectorize() is called here with a one-row train frame and the
            # test frame; only the vectorized test frame is used.
            # NOTE(review): the test vectors do not depend on `name_train`, so
            # they could be cached per (vectorizer, test corpus) if
            # sv.vectorize is stateless - confirm before hoisting.
            train_stub = pd.read_csv(os.path.join(test_folder, 'train.csv'))[:1]
            test = pd.read_csv(os.path.join(test_folder, 'test.csv'))
            _, test = sv.vectorize(train_stub, test, vectorizer_name=vectorizer_name)
            X_test, y_test = _to_features_and_targets(test, meddra_labels)

            score = clf.score(X_test, y_test)
            print(f'\ttest with {name_test} score:', score)
            row[name_test] = score
        print()

results = pd.DataFrame(records, columns=RESULT_COLUMNS)
results

vectorizer: sent2vec
work with smm4h21
	test with smm4h21 score: 0.03498542274052478
	test with smm4h17 score: 0.0012004801920768306
	test with psytar score: 0.0
	test with cadec score: 0.00340522133938706

work with smm4h17
	test with smm4h21 score: 0.011661807580174927
	test with smm4h17 score: 0.22488995598239295
	test with psytar score: 0.08395989974937343
	test with cadec score: 0.015891032917139614

work with psytar
	test with smm4h21 score: 0.008746355685131196
	test with smm4h17 score: 0.08323329331732693
	test with psytar score: 0.14661654135338345
	test with cadec score: 0.0022701475595913734

work with cadec
	test with smm4h21 score: 0.0
	test with smm4h17 score: 0.023609443777511004
	test with psytar score: 0.0012531328320802004
	test with cadec score: 0.13847900113507378

vectorizer: fasttext
work with smm4h21
	test with smm4h21 score: 0.27988338192419826
	test with smm4h17 score: 0.125250100040016
	test with psytar score: 0.09147869674185463
	test with cadec score: 0.1066

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
100%|██████████| 1369/1369 [03:32<00:00,  6.43it/s]
100%|██████████| 343/343 [00:53<00:00,  6.40it/s]
Some layers from the model checkpoint at bert-base-uncased we

	test with smm4h21 score: 0.19825072886297376


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
100%|██████████| 1/1 [00:00<00:00,  8.58it/s]
 21%|██        | 514/2499 [01:00<03:50,  8.60it/s]

In [5]:
# Show the `results` table: one row per (vectorizer, train_model) with the
# clf.score value obtained on each test corpus.
results

Unnamed: 0,vectorizer,train_model,smm4h21,smm4h17,psytar,cadec
0,sent2vec,smm4h21,0.034985,0.0012,0.0,0.003405
1,sent2vec,smm4h17,0.011662,0.22489,0.08396,0.015891
2,sent2vec,psytar,0.008746,0.083233,0.146617,0.00227
3,sent2vec,cadec,0.0,0.023609,0.001253,0.138479
4,fasttext,smm4h21,0.279883,0.12525,0.091479,0.106697
5,fasttext,smm4h17,0.16035,0.694678,0.20802,0.203178
6,fasttext,psytar,0.125364,0.295718,0.4599,0.177072
7,fasttext,cadec,0.110787,0.138856,0.130326,0.515323
8,bert,smm4h21,0.198251,0.046018,0.039861,0.041045
9,bert,smm4h17,0.023324,0.738295,0.060659,0.021144


In [None]:
# generations = 2
# population_size = 50
# max_eval_time_mins = 2
# n_jobs = 10
# max_iter = 10

# tpot = TPOTClassifier(generations=generations, 
#                       population_size=population_size,
#                       verbosity=2, 
#                       random_state=42, 
#                       max_eval_time_mins=max_eval_time_mins, 
#                       n_jobs=n_jobs)

# tpot.fit(X_train, y_train)
# score = tpot.score(X_test, y_test)
# score