In [3]:
import pandas as pd
import numpy as np
from copy import copy
import tensorflow as tf
from sklearn.model_selection import train_test_split

from metric_learn import NCA
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

from tpot import TPOTClassifier

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Shared settings for the experiments below.
# The first four are forwarded unchanged to every TPOTClassifier(...) call.
generations = 2
population_size = 100
max_eval_time_mins = 2
n_jobs = 10
# Passed as NCA(max_iter=...) in the metric-learning cells.
max_iter = 500

In [5]:
import pymedtermino

In [6]:
# Base dataset; its 'meddra' column is used below to filter the larger term table.
df = pd.read_csv('../data/interim/meddra_data.csv')

In [7]:
def _parse_meddra_code(raw):
    """Return ``raw`` as an ``int`` code, or ``None`` when it is not one.

    Strings are accepted only when they consist solely of digits. Values that
    pandas already parsed as numbers are accepted when they are whole numbers.
    Anything else (including NaN from empty cells) yields ``None`` instead of
    raising ``AttributeError`` on a missing ``.isdigit`` method.
    """
    if isinstance(raw, str):
        return int(raw) if raw.isdigit() else None
    if isinstance(raw, (int, np.integer)):
        return int(raw)
    if isinstance(raw, float) and raw.is_integer():
        return int(raw)
    return None


# Load the large term/code table and give it the same column names as `df`.
df_huge = pd.read_csv('../data/interim/terms_and_codes.csv')
df_huge = df_huge[['term', 'code']].rename(columns={'term': 'text', 'code': 'meddra'})
df_huge['meddra'] = df_huge['meddra'].apply(_parse_meddra_code)

# Keep only terms whose code also occurs in the base dataset. Unparseable
# codes are dropped explicitly, after which the column can be stored as
# integers again (a missing-value marker may have upcast it to float).
known_code = df_huge['meddra'].notna() & df_huge['meddra'].isin(df['meddra'])
df_huge = df_huge[known_code].astype({'meddra': 'int64'})

In [8]:
# Persist both frames without the index column.
# rich_data.csv is read back further down in this notebook.
df.to_csv('../data/interim/simple_data.csv', index=False)
df_huge.to_csv('../data/interim/rich_data.csv', index=False)

# Pure MedDRA data

In [9]:
# Load the base dataset and map every distinct code to a dense integer class id.
df = pd.read_csv('../data/interim/meddra_data.csv')
meddra_labels = {v:k for k, v in enumerate(df['meddra'].unique())}
df['meddra_label'] = df['meddra'].apply(lambda x: int(meddra_labels[x]))

# Seed the split so the scores below are reproducible (the models already use
# random_state=42). The copies make `train`/`test` independent frames, so the
# column assignments below never write into a slice of `df`.
train, test = (part.copy() for part in train_test_split(df, test_size=0.20, random_state=42))

# The vocabulary is fitted on the training texts only; unseen test words map to "<OOV>".
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(train['text'])

# Every text becomes a fixed-length (7) sequence of word indices.
train['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(train['text']), maxlen=7).tolist()
test['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(test['text']), maxlen=7).tolist()

# Number of training rows per class, computed once with value_counts instead
# of re-filtering the whole frame for every row.
# Only needed for the optional rare-class filter: train = train[train['counts'] > 50]
train['counts'] = train['meddra_label'].map(train['meddra_label'].value_counts())

X_train = np.array(train['text_tokenized'].to_list())
y_train = np.array(train['meddra_label'].to_list())

X_test = np.array(test['text_tokenized'].to_list())
y_test = np.array(test['meddra_label'].to_list())

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2153, 7), (2153,), (539, 7), (539,))

In [None]:
# TPOT search on the X_train / y_train arrays produced by the preceding data
# cell; the search budget comes from the notebook-level settings.
tpot = TPOTClassifier(
    generations=generations,
    population_size=population_size,
    max_eval_time_mins=max_eval_time_mins,
    n_jobs=n_jobs,
    random_state=42,
    verbosity=2,
)

tpot.fit(X_train, y_train)
# Score on the held-out split; as the last expression it becomes the cell output.
tpot.score(X_test, y_test)

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]

# Pure data with NCA (metric learning)

In [None]:
# Fresh TPOT search with the same budget as the plain run.
tpot = TPOTClassifier(
    generations=generations,
    population_size=population_size,
    max_eval_time_mins=max_eval_time_mins,
    n_jobs=n_jobs,
    random_state=42,
    verbosity=2,
)
# NCA is the first pipeline step, so TPOT is fitted on NCA's transformed output.
clf = make_pipeline(
    NCA(max_iter=max_iter),
    tpot,
)

clf.fit(X_train, y_train)
# Held-out score of the whole NCA + TPOT pipeline (cell output).
clf.score(X_test, y_test)

# Pure vectorized data



In [None]:
# Load the pre-vectorized base dataset and map every distinct code to a dense class id.
df = pd.read_csv('../data/interim/meddra_data_simple_vec.csv')
meddra_labels = {v:k for k, v in enumerate(df['meddra'].unique())}
df['meddra_label'] = df['meddra'].apply(lambda x: int(meddra_labels[x]))
# The raw text and the original code are not model inputs; every remaining
# column except the label is treated as a feature.
df = df.drop(columns=['text', 'meddra'])
# Seed the split so the scores below are reproducible (the models already use random_state=42).
train, test = train_test_split(df, test_size=0.20, random_state=42)

# Build the feature-column list once and reuse it for both splits.
feature_cols = [col for col in df.columns if col != 'meddra_label']

X_train = np.array(train[feature_cols])
y_train = np.array(train['meddra_label'].to_list())

X_test = np.array(test[feature_cols])
y_test = np.array(test['meddra_label'].to_list())

X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# TPOT search on the arrays left in X_train / y_train by the preceding data cell.
tpot = TPOTClassifier(
    n_jobs=n_jobs,
    max_eval_time_mins=max_eval_time_mins,
    population_size=population_size,
    generations=generations,
    verbosity=2,
    random_state=42,
)

tpot.fit(X_train, y_train)
# The held-out score is the cell's displayed result.
tpot.score(X_test, y_test)

# Pure vectorized data with NCA

In [None]:
# New TPOT instance, configured from the notebook-level settings.
tpot = TPOTClassifier(
    n_jobs=n_jobs,
    max_eval_time_mins=max_eval_time_mins,
    population_size=population_size,
    generations=generations,
    verbosity=2,
    random_state=42,
)

# Two-step pipeline: NCA transform first, TPOT as the final estimator.
clf = make_pipeline(
    NCA(max_iter=max_iter),
    tpot,
)

clf.fit(X_train, y_train)
# Held-out score of the combined pipeline (cell output).
clf.score(X_test, y_test)

# Enriched data

In [None]:
# Load the enriched dataset written earlier in this notebook and map every
# distinct code to a dense integer class id.
df = pd.read_csv('../data/interim/rich_data.csv')
meddra_labels = {v:k for k, v in enumerate(df['meddra'].unique())}
df['meddra_label'] = df['meddra'].apply(lambda x: int(meddra_labels[x]))

# Seed the split so the scores below are reproducible (the models already use
# random_state=42). The copies make `train`/`test` independent frames, so the
# column assignments below never write into a slice of `df`.
train, test = (part.copy() for part in train_test_split(df, test_size=0.20, random_state=42))

# The vocabulary is fitted on the training texts only; unseen test words map to "<OOV>".
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(train['text'])

# Every text becomes a fixed-length (7) sequence of word indices.
train['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(train['text']), maxlen=7).tolist()
test['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(test['text']), maxlen=7).tolist()

# Number of training rows per class, computed once with value_counts instead
# of re-filtering the whole frame for every row.
# Only needed for the optional rare-class filter: train = train[train['counts'] > 50]
train['counts'] = train['meddra_label'].map(train['meddra_label'].value_counts())

X_train = np.array(train['text_tokenized'].to_list())
y_train = np.array(train['meddra_label'].to_list())

X_test = np.array(test['text_tokenized'].to_list())
y_test = np.array(test['meddra_label'].to_list())

X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# TPOT search on the current X_train / y_train arrays, with the shared search budget.
tpot = TPOTClassifier(
    random_state=42,
    verbosity=2,
    generations=generations,
    population_size=population_size,
    max_eval_time_mins=max_eval_time_mins,
    n_jobs=n_jobs,
)

tpot.fit(X_train, y_train)
# Held-out score, shown as the cell output.
tpot.score(X_test, y_test)

# Enriched data with NCA (metric learning)

In [None]:
# Same TPOT configuration as the plain run, rebuilt so this cell starts from
# an unfitted estimator.
tpot = TPOTClassifier(
    random_state=42,
    verbosity=2,
    generations=generations,
    population_size=population_size,
    max_eval_time_mins=max_eval_time_mins,
    n_jobs=n_jobs,
)
# NCA feeds its transformed features into the TPOT search.
clf = make_pipeline(
    NCA(max_iter=max_iter),
    tpot,
)

clf.fit(X_train, y_train)
# Held-out score of the NCA + TPOT pipeline (cell output).
clf.score(X_test, y_test)

# Enriched vectorized data

In [None]:
# Load the pre-vectorized enriched dataset and map every distinct code to a
# dense integer class id.
df = pd.read_csv('../data/interim/meddra_data_rich_vec.csv')
meddra_labels = {v:k for k, v in enumerate(df['meddra'].unique())}
df['meddra_label'] = df['meddra'].apply(lambda x: int(meddra_labels[x]))

# Use the vector columns as features, exactly like the cell that prepares
# meddra_data_simple_vec.csv. Re-tokenizing 'text' here would ignore the
# vectors and merely repeat the enriched-text experiment.
# NOTE(review): assumes this file has the same layout as the simple vec file
# ('text', 'meddra' plus numeric feature columns) - confirm.
df = df.drop(columns=['text', 'meddra'])
feature_cols = [col for col in df.columns if col != 'meddra_label']
if not feature_cols:
    raise ValueError("meddra_data_rich_vec.csv has no feature columns besides 'text' and 'meddra'")

# Seed the split so the scores below are reproducible (the models already use random_state=42).
train, test = train_test_split(df, test_size=0.20, random_state=42)

X_train = np.array(train[feature_cols])
y_train = np.array(train['meddra_label'].to_list())

X_test = np.array(test[feature_cols])
y_test = np.array(test['meddra_label'].to_list())

X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# TPOT search on the arrays prepared by the preceding data cell.
tpot = TPOTClassifier(
    verbosity=2,
    random_state=42,
    n_jobs=n_jobs,
    max_eval_time_mins=max_eval_time_mins,
    generations=generations,
    population_size=population_size,
)

tpot.fit(X_train, y_train)
# Held-out score (cell output).
tpot.score(X_test, y_test)

# Enriched vectorized data with NCA

In [None]:
# Unfitted TPOT instance with the shared search budget.
tpot = TPOTClassifier(
    verbosity=2,
    random_state=42,
    n_jobs=n_jobs,
    max_eval_time_mins=max_eval_time_mins,
    generations=generations,
    population_size=population_size,
)
# Pipeline order matters: NCA transforms the features before TPOT sees them.
clf = make_pipeline(
    NCA(max_iter=max_iter),
    tpot,
)

clf.fit(X_train, y_train)
# Held-out score of the full pipeline (cell output).
clf.score(X_test, y_test)