In [1]:
import pickle
import pandas as pd
import numpy as np
from copy import copy
import tensorflow as tf
from sklearn.model_selection import train_test_split

import pymedtermino
import seaborn as sns

from metric_learn import NCA
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

from tpot import TPOTClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
import logging
logging.basicConfig(level = logging.DEBUG, filename = "TPOT_examples.log")
logging.debug("debug")

In [3]:
generations = 2
population_size = 50
max_eval_time_mins = 2
n_jobs = 10
max_iter = 10

In [1]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=0)
pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
# The pipeline can be used as any other estimator
# and avoids leaking the test set into the train set
pipe.fit(X_train, y_train)

pipe.score(X_test, y_test)

0.88

In [3]:
pipe['svc'].get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [4]:
df = pd.read_csv('../data/interim/meddra_data.csv')

In [5]:
df_huge = pd.read_csv('../data/interim/terms_and_codes.csv')
df_huge = df_huge[['term', 'code']].rename(columns={'term': 'text', 'code': 'meddra'})
df_huge['meddra'] = df_huge['meddra'].apply(lambda x: int(x) if x.isdigit() else None)
df_huge = df_huge[df_huge['meddra'].isin(df['meddra'])]

In [6]:
# df.to_csv('../data/interim/simple_data.csv', index=False)
# df_huge.to_csv('../data/interim/rich_data.csv', index=False)

In [7]:
df

Unnamed: 0,text,meddra
0,can't go out in the sun,10034972
1,waking up after the longest dream,10041349
2,eventful movie night in your dreams,10000125
3,may NOT switch your brain off,10064805
4,sleep for the next 2 days,10020765
...,...,...
2687,flaring,10010264
2688,flares,10010264
2689,flare,10010264
2690,fistulas,10016717


# Pure meddra

In [8]:
df = pd.read_csv('../data/interim/meddra_data.csv')
meddra_labels = {v:k for k, v in enumerate(df['meddra'].unique())}
df['meddra_label'] = df['meddra'].apply(lambda x: int(meddra_labels[x]))
train, test = train_test_split(df, test_size=0.20)

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(train['text'])

train['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(train['text']), maxlen=7).tolist()
test['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(test['text']), maxlen=7).tolist()
train['counts'] = train['meddra_label'].apply(lambda x: train[train['meddra_label']==x].shape[0])
#train = train[train['counts']>50]

X_train = np.array(train['text_tokenized'].to_list())
y_train = np.array(train['meddra_label'].to_list())

X_test = np.array(test['text_tokenized'].to_list())
y_test = np.array(test['meddra_label'].to_list())

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2153, 7), (2153,), (539, 7), (539,))

In [9]:
y_test

array([ 51,  54,   5, 306,  17,  57,  62, 143, 163, 181,   5, 102, 372,
       144,  20, 145,  20,  34, 120,  57, 205,  39,  24,  40,  10,   2,
       102, 276, 124, 245,   5, 379, 371,   2,   5, 189,  16, 175,  40,
        69, 230,   5,  37, 154,   5, 183, 165,   4,  88, 379,  52, 129,
         4,  68,   5, 162,  18, 122,  25, 456, 299, 358, 141,  24, 368,
       264,   2,   5,  80, 202,  67,   4, 152,   5,  67,  95, 143,   2,
        91,   5, 159,  40,   5,  64, 132,  40,   4,  24, 106,  68, 411,
       121,  33,  40,  41,  57,  73, 107, 163,  65,   4,   5, 251,  24,
         4,  61, 164, 300,   2,   5,  24,  35,  20,  16,   5,   4,   4,
       124,  78,  95,  20,  16, 450,   1, 358,  40,  40,  97,  27,   5,
        71,  82, 104,  24, 282,  40,   5,   1,   1, 172,  12,  40,   4,
        28,   1,  68,   5,  24, 185, 221,  33,   5, 439, 184,  92,  54,
        68, 233, 297,  82, 282, 233,  33,   0,   5,   4, 132,  36,   1,
         5,  40,   5,   5,   4, 161,  40,  11,   5,  68,  22, 44

In [16]:
pure_data = (X_train, X_test, y_train, y_test)
with open('../data/processed/pure_data.pkl', 'wb') as data_file:
    pickle.dump(pure_data, data_file, pickle.HIGHEST_PROTOCOL)

In [9]:
tpot = TPOTClassifier(generations=generations, 
                      population_size=population_size,
                      verbosity=2, 
                      random_state=42, 
                      max_eval_time_mins=max_eval_time_mins, 
                      n_jobs=n_jobs)

tpot.fit(X_train, y_train)
score = tpot.score(X_test, y_test)
logging.debug(f'PURE MEDDRA: {score}')

Optimization Progress:   0%|          | 0/150 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.23549020665839313

Generation 2 - Current best internal CV score: 0.23549020665839313

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.9000000000000001, min_samples_leaf=1, min_samples_split=17, n_estimators=100)


# Pure data with NCA (metric learning)

In [10]:
tpot = TPOTClassifier(generations=generations, 
                      population_size=population_size,
                      verbosity=2, 
                      random_state=42, 
                      max_eval_time_mins=max_eval_time_mins, 
                      n_jobs=n_jobs)
clf = make_pipeline(NCA(max_iter=max_iter), tpot)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)
score = tpot.score(X_test, y_test)
logging.debug(f'PURE MEDDRA with NCA: {score}')

Optimization Progress:   0%|          | 0/150 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.13793773269303405

Generation 2 - Current best internal CV score: 0.1388690444072735

Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=73, p=1, weights=distance)


# Pure vectorized data



In [7]:
df = pd.read_csv('../data/interim/meddra_data_simple_vec.csv')
meddra_labels = {v:k for k, v in enumerate(df['meddra'].unique())}
df['meddra_label'] = df['meddra'].apply(lambda x: int(meddra_labels[x]))
df = df.drop(columns=['text', 'meddra'])
train, test = train_test_split(df, test_size=0.20)


X_train = np.array(train[[col for col in df.columns if col != 'meddra_label']])
y_train = np.array(train['meddra_label'].to_list())

X_test = np.array(test[[col for col in df.columns if col != 'meddra_label']])
y_test = np.array(test['meddra_label'].to_list())

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2153, 768), (2153,), (539, 768), (539,))

In [17]:
rich_data = (X_train, X_test, y_train, y_test)
with open('../data/processed/pure_data_vectorized.pkl', 'wb') as data_file:
    pickle.dump(rich_data, data_file, pickle.HIGHEST_PROTOCOL)

In [12]:
tpot = TPOTClassifier(generations=generations, 
                      population_size=population_size,
                      verbosity=2, 
                      random_state=42, 
                      max_eval_time_mins=max_eval_time_mins, 
                      n_jobs=n_jobs)

tpot.fit(X_train, y_train)
score = tpot.score(X_test, y_test)
logging.debug(f'PURE MEDDRA vectorized: {score}')

Optimization Progress:   0%|          | 0/150 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.2972557060378784

Generation 2 - Current best internal CV score: 0.2972557060378784

Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=3, p=1, weights=distance)


# Pure vec data with NCA

In [13]:
tpot = TPOTClassifier(generations=generations, 
                      population_size=population_size,
                      verbosity=2, 
                      random_state=42, 
                      max_eval_time_mins=max_eval_time_mins, 
                      n_jobs=n_jobs)

clf = make_pipeline(NCA(max_iter=max_iter), tpot)

clf.fit(X_train, y_train)
score = tpot.score(X_test, y_test)
logging.debug(f'PURE MEDDRA vectorized with NCA: {score}')

Optimization Progress:   0%|          | 0/150 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.38736200291372147

Generation 2 - Current best internal CV score: 0.3901505422759402

Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=2, p=2, weights=distance)


# Enriched train

In [14]:
df = pd.read_csv('../data/interim/rich_data.csv')
meddra_labels = {v:k for k, v in enumerate(df['meddra'].unique())}
df['meddra_label'] = df['meddra'].apply(lambda x: int(meddra_labels[x]))
train, test = train_test_split(df, test_size=0.20)

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(train['text'])

train['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(train['text']), maxlen=7).tolist()
test['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(test['text']), maxlen=7).tolist()
train['counts'] = train['meddra_label'].apply(lambda x: train[train['meddra_label']==x].shape[0])
#train = train[train['counts']>50]

X_train = np.array(train['text_tokenized'].to_list())
y_train = np.array(train['meddra_label'].to_list())

X_test = np.array(test['text_tokenized'].to_list())
y_test = np.array(test['meddra_label'].to_list())

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((5166, 7), (5166,), (1292, 7), (1292,))

In [15]:
tpot = TPOTClassifier(generations=generations,
                      population_size=population_size,
                      verbosity=2, 
                      random_state=42, 
                      max_eval_time_mins=max_eval_time_mins, 
                      n_jobs=n_jobs)
tpot.fit(X_train, y_train)
score = tpot.score(X_test, y_test)
logging.debug(f'ENRICHED MEDDRA: {score}')

Optimization Progress:   0%|          | 0/150 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.3186225918013111

Generation 2 - Current best internal CV score: 0.3778544023997259

Best pipeline: KNeighborsClassifier(SelectFromModel(input_matrix, criterion=entropy, max_features=0.2, n_estimators=100, threshold=0.30000000000000004), n_neighbors=87, p=2, weights=distance)


# Enrich data with NCA (metric learning)

In [16]:
tpot = TPOTClassifier(generations=generations, 
                      population_size=population_size,
                      verbosity=2, 
                      random_state=42, 
                      max_eval_time_mins=max_eval_time_mins, 
                      n_jobs=n_jobs)
clf = make_pipeline(NCA(max_iter=max_iter), tpot)

clf.fit(X_train, y_train)
score = tpot.score(X_test, y_test)
logging.debug(f'ENRICHED MEDDRA with NCA: {score}')

Optimization Progress:   0%|          | 0/150 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.19067147760274575

Generation 2 - Current best internal CV score: 0.19493054164224685

Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=47, p=2, weights=distance)


# Enrich data vec

In [17]:
df = pd.read_csv('../data/interim/meddra_data_rich_vec.csv')
meddra_labels = {v:k for k, v in enumerate(df['meddra'].unique())}
df['meddra_label'] = df['meddra'].apply(lambda x: int(meddra_labels[x]))
train, test = train_test_split(df, test_size=0.20)

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(train['text'])

train['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(train['text']), maxlen=7).tolist()
test['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(test['text']), maxlen=7).tolist()
train['counts'] = train['meddra_label'].apply(lambda x: train[train['meddra_label']==x].shape[0])
#train = train[train['counts']>50]

X_train = np.array(train['text_tokenized'].to_list())
y_train = np.array(train['meddra_label'].to_list())

X_test = np.array(test['text_tokenized'].to_list())
y_test = np.array(test['meddra_label'].to_list())

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((7320, 7), (7320,), (1830, 7), (1830,))

In [None]:
tpot = TPOTClassifier(generations=generations, 
                      population_size=population_size,
                      verbosity=2, 
                      random_state=42, 
                      max_eval_time_mins=max_eval_time_mins, 
                      n_jobs=n_jobs)
tpot.fit(X_train, y_train)
score = tpot.score(X_test, y_test)
logging.debug(f'ENRICHED MEDDRA vectorized: {score}')

Optimization Progress:   0%|          | 0/150 [00:00<?, ?pipeline/s]

# Enrich data vec with NCA

In [None]:
tpot = TPOTClassifier(generations=generations, 
                      population_size=population_size,
                      verbosity=2, 
                      random_state=42, 
                      max_eval_time_mins=max_eval_time_mins, 
                      n_jobs=n_jobs)
clf = make_pipeline(NCA(max_iter=max_iter), tpot)

clf.fit(X_train, y_train)
score = tpot.score(X_test, y_test)
logging.debug(f'ENRICHED MEDDRA vectorized with NCA: {score}')