In [46]:
import pandas as pd
import numpy as np
from copy import copy
import tensorflow as tf
from sklearn.model_selection import train_test_split

from metric_learn import NCA
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings("ignore")

In [47]:
import pymedtermino

In [48]:
df = pd.read_csv('../data/interim/meddra_data.csv')
df

Unnamed: 0,text,meddra
0,can't go out in the sun,10034972
1,waking up after the longest dream,10041349
2,eventful movie night in your dreams,10000125
3,may NOT switch your brain off,10064805
4,sleep for the next 2 days,10020765
...,...,...
2687,flaring,10010264
2688,flares,10010264
2689,flare,10010264
2690,fistulas,10016717


In [49]:
df_huge = pd.read_csv('../data/interim/terms_and_codes.csv')
df_huge = df_huge[['term', 'code']].rename(columns={'term': 'text', 'code': 'meddra'})
df_huge['meddra'] = df_huge['meddra'].apply(lambda x: int(x) if x.isdigit() else None)
df_huge = df_huge[df_huge['meddra'].isin(df['meddra'])]
df_huge

Unnamed: 0,text,meddra
10,Abdominal distension,10000060.0
22,Abdominal pain,10000081.0
81,Abscess,10000269.0
219,Adhesion,10059837.0
290,Aggression,10001488.0
...,...,...
94785,extreme pain in back,10033371.0
94786,"excruciating, unusual abdominal pain",10033371.0
94787,severe pain close to my the crotch area,10033371.0
94791,Severe leg muscle pain,10033371.0


In [50]:
meddra_labels = {v:k for k, v in enumerate(df['meddra'].unique())}
df['meddra_label'] = df['meddra'].apply(lambda x: int(meddra_labels[x]))

df = pd.concat([df]*1, axis=0)
df

Unnamed: 0,text,meddra,meddra_label
0,can't go out in the sun,10034972,0
1,waking up after the longest dream,10041349,1
2,eventful movie night in your dreams,10000125,2
3,may NOT switch your brain off,10064805,3
4,sleep for the next 2 days,10020765,4
...,...,...,...
2687,flaring,10010264,307
2688,flares,10010264,307
2689,flare,10010264,307
2690,fistulas,10016717,470


In [51]:
train, test = train_test_split(df, test_size=0.20)
train.shape, test.shape

((2153, 3), (539, 3))

# Pure meddra

In [52]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(train['text'])

train['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(train['text']), maxlen=7).tolist()
test['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(test['text']), maxlen=7).tolist()

In [53]:
train['counts'] = train['meddra_label'].apply(lambda x: train[train['meddra_label']==x].shape[0])
train = train[train['counts']>50]
train

Unnamed: 0,text,meddra,meddra_label,text_tokenized,counts
1989,intense dreams,10000125,2,"[0, 0, 0, 0, 0, 235, 16]",59
634,didnt really sleep last,10022437,5,"[0, 0, 0, 410, 83, 2, 194]",256
977,contribute the sleep,10041349,1,"[0, 0, 0, 0, 671, 13, 2]",71
857,lose their appetites,10061428,37,"[0, 0, 0, 0, 84, 411, 306]",51
694,ca n't fucking sleep,10022437,5,"[0, 0, 0, 94, 17, 42, 2]",256
...,...,...,...,...,...
205,sleep for 12 + hours,10020765,4,"[0, 0, 0, 2, 15, 74, 18]",111
597,knocked on my ass,10020765,4,"[0, 0, 0, 178, 30, 3, 324]",111
1266,sleep for dayyssss,10020765,4,"[0, 0, 0, 0, 2, 15, 1734]",111
259,dreams so sick and twisted,10000125,2,"[0, 0, 16, 26, 143, 64, 1736]",59


In [54]:
X_train = np.array(train['text_tokenized'].to_list())
y_train = np.array(train['meddra_label'].to_list())

X_test = np.array(test['text_tokenized'].to_list())
y_test = np.array(test['meddra_label'].to_list())

In [55]:
X_train.shape, y_train.shape

((687, 7), (687,))

In [12]:
from tpot import TPOTClassifier

tpot = TPOTClassifier(generations=2, verbosity=2, random_state=42, max_eval_time_mins=2)

tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]

Generation 1 - Current best internal CV score: 0.617910447761194
Generation 2 - Current best internal CV score: 0.617910447761194

Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.1, max_depth=3, max_features=1.0, min_samples_leaf=10, min_samples_split=4, n_estimators=100, subsample=0.7500000000000001)


TPOTClassifier(generations=2, max_eval_time_mins=2, random_state=42,
               verbosity=2)

In [13]:
tpot.score(X_test, y_test)

0.19851576994434136

# Pure data with NCA (metric learning)

In [80]:
clf = make_pipeline(NCA(max_iter=100), tpot)
#cross_val_score(clf, X_train, y_train).mean()

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]



TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: RandomForestClassifier(PCA(input_matrix, iterated_power=8, svd_solver=randomized), bootstrap=True, criterion=entropy, max_features=0.55, min_samples_leaf=7, min_samples_split=11, n_estimators=100)


0.10575139146567718

# Enriched train

In [81]:
enriched = pd.concat([train, df_huge], axis=0)
enriched['meddra_label'] = enriched['meddra'].apply(lambda x: int(meddra_labels[x]))
enriched

Unnamed: 0,counts,meddra,meddra_label,text,text_tokenized
1989,59.0,10000125.0,2,intense dreams,"[0, 0, 0, 0, 0, 235, 16]"
634,256.0,10022437.0,5,didnt really sleep last,"[0, 0, 0, 410, 83, 2, 194]"
977,71.0,10041349.0,1,contribute the sleep,"[0, 0, 0, 0, 671, 13, 2]"
857,51.0,10061428.0,37,lose their appetites,"[0, 0, 0, 0, 84, 411, 306]"
694,256.0,10022437.0,5,ca n't fucking sleep,"[0, 0, 0, 94, 17, 42, 2]"
...,...,...,...,...,...
94785,,10033371.0,68,extreme pain in back,
94786,,10033371.0,68,"excruciating, unusual abdominal pain",
94787,,10033371.0,68,severe pain close to my the crotch area,
94791,,10033371.0,68,Severe leg muscle pain,


In [82]:
train = copy(enriched)
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(train['text'])

train['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(train['text']), maxlen=7).tolist()
test['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(test['text']), maxlen=7).tolist()

In [83]:
train['counts'] = train['meddra_label'].apply(lambda x: train[train['meddra_label']==x].shape[0])
train = train[train['counts']>50]
train

Unnamed: 0,counts,meddra,meddra_label,text,text_tokenized
1989,190,10000125.0,2,intense dreams,"[0, 0, 0, 0, 0, 102, 20]"
634,816,10022437.0,5,didnt really sleep last,"[0, 0, 0, 630, 114, 2, 214]"
977,210,10041349.0,1,contribute the sleep,"[0, 0, 0, 0, 1153, 9, 2]"
857,173,10061428.0,37,lose their appetites,"[0, 0, 0, 0, 125, 553, 554]"
694,816,10022437.0,5,ca n't fucking sleep,"[0, 0, 0, 132, 17, 90, 2]"
...,...,...,...,...,...
94784,251,10033371.0,68,extreme pain in legs,"[0, 0, 0, 41, 7, 5, 86]"
94785,251,10033371.0,68,extreme pain in back,"[0, 0, 0, 41, 7, 5, 67]"
94786,251,10033371.0,68,"excruciating, unusual abdominal pain","[0, 0, 0, 490, 3418, 199, 7]"
94787,251,10033371.0,68,severe pain close to my the crotch area,"[7, 621, 3, 4, 9, 1611, 743]"


In [84]:
X_train = np.array(train['text_tokenized'].to_list())
y_train = np.array(train['meddra_label'].to_list())

X_test = np.array(test['text_tokenized'].to_list())
y_test = np.array(test['meddra_label'].to_list())

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3496, 7), (3496,), (539, 7), (539,))

In [19]:
from tpot import TPOTClassifier

tpot = TPOTClassifier(generations=2, verbosity=2, random_state=42, max_eval_time_mins=2)

tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]

Generation 1 - Current best internal CV score: 0.7067993880757462


TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: RandomForestClassifier(XGBClassifier(input_matrix, learning_rate=0.5, max_depth=4, min_child_weight=1, n_estimators=100, nthread=1, subsample=0.4), bootstrap=False, criterion=entropy, max_features=0.8, min_samples_leaf=9, min_samples_split=10, n_estimators=100)


TPOTClassifier(generations=2, max_eval_time_mins=2, random_state=42,
               verbosity=2)

In [20]:
tpot.score(X_test, y_test)

0.43413729128014844

# Enrich data with NCA (metric learning)

In [85]:
clf = make_pipeline(NCA(max_iter=100), tpot)
#cross_val_score(clf, X_train, y_train).mean()

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]



TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=7, p=2, weights=distance)


0.4267161410018553