In [14]:
# Enable the autoreload extension so edits to imported project modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import warnings
# NOTE(review): this suppresses *every* warning for the rest of the session,
# including pandas' SettingWithCopyWarning raised by the column assignments on
# the train/test frames further down. Consider narrowing the filter to the
# specific noisy categories instead.
warnings.filterwarnings('ignore')

In [15]:
import pickle
import pandas as pd
import numpy as np
from copy import copy
import tensorflow as tf
from sklearn.model_selection import train_test_split

from sent2vec.vectorizer import Vectorizer

# Vectorization

In [17]:
# Load the two interim corpora in one pass: the first path becomes
# `simple_df`, the second `rich_df`.
simple_df, rich_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/interim/simple_data.csv',
        '../data/interim/rich_data.csv',
    )
)

# Cell output: the (rows, columns) of each frame.
(simple_df.shape, rich_df.shape)

((2692, 2), (6458, 2))

In [18]:
# One sent2vec Vectorizer instance, reused by both embedding cells below.
# NOTE(review): the recorded output shapes suggest each .bert() call replaces
# `vectorizer.vectors`; confirm this for the installed sent2vec version, since
# an accumulating implementation would require a fresh instance per corpus.
vectorizer = Vectorizer()

In [20]:
# Embed every entry of the 'text' column; the embeddings are then read back
# from the vectorizer's `vectors` attribute.
vectorizer.bert(simple_df['text'])
vectors = vectorizer.vectors

# Place the embedding columns to the right of the original columns.
df_simple = pd.concat((simple_df, pd.DataFrame(vectors)), axis='columns')
df_simple.shape

(2692, 770)

In [21]:
# Embed every entry of the rich corpus' 'text' column; the embeddings are then
# read back from the vectorizer's `vectors` attribute.
vectorizer.bert(rich_df['text'])
vectors = vectorizer.vectors

# Place the embedding columns to the right of the original columns.
df_rich = pd.concat((rich_df, pd.DataFrame(vectors)), axis='columns')
df_rich.shape

(6458, 770)

In [27]:
# Persist both embedded frames as CSV; the row index is not written.
df_simple.to_csv(
    path_or_buf='../data/interim/meddra_data_simple_vec.csv',
    index=False,
)
df_rich.to_csv(
    path_or_buf='../data/interim/meddra_data_rich_vec.csv',
    index=False,
)

# Pure

In [28]:
# Build the "pure" dataset: fixed-length integer token sequences (X) and
# integer MedDRA class ids (y), split 80/20 into train and test.
RANDOM_STATE = 42  # fixed seed so re-running the cell reproduces the same split
VOCAB_SIZE = 5000  # `num_words` cap passed to the Keras Tokenizer
MAX_TOKENS = 7     # `maxlen` passed to pad_sequences (width of X)

df = pd.read_csv('../data/interim/meddra_data.csv')

# Give each distinct MedDRA term an integer id, in order of first appearance.
meddra_labels = {term: idx for idx, term in enumerate(df['meddra'].unique())}
# Vectorised dictionary lookup; the cast fails loudly if any term is unmapped.
df['meddra_label'] = df['meddra'].map(meddra_labels).astype('int64')

# Copy each split so the column assignments below operate on independent
# frames instead of slices of `df` (those assignments otherwise raise
# SettingWithCopyWarning, which the global warning filter was hiding).
train, test = (
    part.copy()
    for part in train_test_split(df, test_size=0.20, random_state=RANDOM_STATE)
)

# Fit the vocabulary on the training texts only, so nothing leaks from test.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train['text'])

train['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(train['text']), maxlen=MAX_TOKENS
).tolist()
test['text_tokenized'] = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(test['text']), maxlen=MAX_TOKENS
).tolist()

# Number of training rows sharing each row's label, computed with a single
# value_counts() pass instead of re-filtering the frame once per row.
train['counts'] = train['meddra_label'].map(train['meddra_label'].value_counts())

# Rebuild the arrays from the stored Python lists, exactly as before, so the
# dtypes of the pickled arrays do not change.
X_train = np.array(train['text_tokenized'].to_list())
y_train = np.array(train['meddra_label'].to_list())

X_test = np.array(test['text_tokenized'].to_list())
y_test = np.array(test['meddra_label'].to_list())

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2153, 7), (2153,), (539, 7), (539,))

In [None]:
# Bundle the four arrays in (X_train, X_test, y_train, y_test) order and
# serialise them with the highest pickle protocol available.
pure_data = X_train, X_test, y_train, y_test
with open('../data/processed/pure_data.pkl', mode='wb') as data_file:
    pickle.dump(pure_data, data_file, protocol=pickle.HIGHEST_PROTOCOL)

# Pure vectorized

In [8]:
# Build the "pure vectorized" dataset: the embedding columns of the simple
# corpus (X) and integer MedDRA class ids (y), split 80/20 into train and test.
RANDOM_STATE = 42  # fixed seed so re-running the cell reproduces the same split

df = pd.read_csv('../data/interim/meddra_data_simple_vec.csv')

# Give each distinct MedDRA term an integer id, in order of first appearance.
meddra_labels = {term: idx for idx, term in enumerate(df['meddra'].unique())}
# Vectorised dictionary lookup; the cast fails loudly if any term is unmapped.
df['meddra_label'] = df['meddra'].map(meddra_labels).astype('int64')

# Only the embedding columns and the label remain after this drop.
df = df.drop(columns=['text', 'meddra'])
train, test = train_test_split(df, test_size=0.20, random_state=RANDOM_STATE)

# Every remaining column except the label is a feature; compute the list once.
feature_cols = [col for col in df.columns if col != 'meddra_label']

X_train = train[feature_cols].to_numpy()
y_train = train['meddra_label'].to_numpy()

X_test = test[feature_cols].to_numpy()
y_test = test['meddra_label'].to_numpy()

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2153, 768), (2153,), (539, 768), (539,))

In [9]:
# Bundle the four arrays in (X_train, X_test, y_train, y_test) order and
# serialise them with the highest pickle protocol available.
# NOTE(review): despite its name, `rich_data` here holds the split built from
# meddra_data_simple_vec.csv and is written to pure_data_vectorized.pkl; the
# name looks like a copy-paste leftover. Consider renaming it once any later
# uses have been checked.
rich_data = X_train, X_test, y_train, y_test
with open('../data/processed/pure_data_vectorized.pkl', mode='wb') as data_file:
    pickle.dump(rich_data, data_file, protocol=pickle.HIGHEST_PROTOCOL)