In [1]:
import pandas as pd
import numpy as np
import time

from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [15]:
import fasttext
fname = "/media/data/datasets/biomed/EMBEDDINGS/Otzovik_200_fromScratch.bin"
w2v_model = fasttext.load_model(fname)
w2v_model.get_word_vector(fname)
embed_dim=200




In [63]:
path = '/home/tlenusik/9515_tweets_w_labels.csv'
df = pd.read_csv(path)
df.dropna(subset=["final"], inplace=True)

df_0 = df[df['final']==0.0]
df_1 = df[df['final']==1.0]

df = pd.concat([df_0[:4000], df_1], axis=0, sort=False)

df = df[['text','final']]
X = df['text'].fillna('').tolist()
X = [str(i) for i in X]
y = df['final'].tolist()
print(set(y))
print(df.shape)
print(df.groupby(['final']).size())
y = [float(i) for i in y]

{0.0, 1.0}
(4832, 2)
final
0.0    4000
1.0     832
dtype: int64


In [64]:
RANDOM_STATE = 42

# Split train & test
text_train, text_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Tokenize and transform to integer index
MAX_NB_WORDS = 100000
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(text_train)

X_train = tokenizer.texts_to_sequences(text_train)
X_test = tokenizer.texts_to_sequences(text_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
maxlen = max(len(x) for x in X_train) # longest text in train set

# Add pading to ensure all vectors have same dimensionality
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

word_index = tokenizer.word_index

In [65]:
print('preparing embedding matrix...')
words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = w2v_model.get_word_vector(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

preparing embedding matrix...
number of null word embeddings: 1


In [66]:
# Define CNN architecture

model = Sequential()
model.add(layers.Embedding(nb_words, embed_dim,
          weights=[embedding_matrix], input_length=maxlen, trainable=True))
#model.add(layers.Embedding(vocab_size, embed_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
print(model.summary())

# Fit model
history = model.fit(X_train, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=32)
loss, accuracy = model.evaluate(X_train, y_train, verbose=True)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 65, 200)           3744800   
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 61, 128)           128128    
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 11        
Total params: 3,874,229
Trainable params: 3,874,229
Non-trainable params: 0
_________________________________________________________________
None
Train on 3865 samples, validate on 967 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epo

In [67]:
y_sample = model.predict_classes(X_test).flatten().tolist()
#print('Prediction: ', y_sample)

In [68]:
from sklearn import metrics
confusion_matrix = metrics.confusion_matrix(y_test, y_sample).tolist()
p1w, r1w, f1w, _ = metrics.precision_recall_fscore_support(y_test, y_sample, average='weighted')
p1m, r1m, f1m, _ = metrics.precision_recall_fscore_support(y_test, y_sample, average='macro')

In [69]:
metrics.confusion_matrix(y_test, y_sample)

array([[755,  36],
       [124,  52]])

In [70]:
print(metrics.classification_report(y_test, y_sample))

              precision    recall  f1-score   support

         0.0       0.86      0.95      0.90       791
         1.0       0.59      0.30      0.39       176

    accuracy                           0.83       967
   macro avg       0.72      0.62      0.65       967
weighted avg       0.81      0.83      0.81       967



In [62]:
print(metrics.classification_report(y_test, y_sample))

              precision    recall  f1-score   support

         0.0       0.84      0.79      0.81       402
         1.0       0.55      0.64      0.60       165

    accuracy                           0.75       567
   macro avg       0.70      0.72      0.71       567
weighted avg       0.76      0.75      0.75       567



In [None]:
#train fasttext 

In [1]:
import json
def load_docs(fname):
    texts = []
    with open(fname, "r", encoding='utf-8') as fin:
        lines = [line for line in fin]
    for line in lines:
        try:
            doc = json.loads(line)
            texts.append(doc['description'])
        except:
            print("error")
            pass
    return texts
FILE = "/media/data/datasets/biomed/otzovik/corpora/otzovik/all_reviews_texts.txt"
texts = load_docs(FILE)
with open("/media/data/datasets/biomed/otzovik/corpora/otzovik/data_temp.txt", "w+", encoding='utf-8') as fout:
    for line in texts:
        fout.write("{}\n".format(line.strip()))

In [2]:
import fasttext
d = 200
model = fasttext.train_unsupervised('/media/data/datasets/biomed/otzovik/corpora/otzovik/data_temp.txt', model='cbow', dim=d)

In [3]:
model.save_model("/media/data/datasets/biomed/EMBEDDINGS/Otzovik_{}_fromScratch.bin".format(d))

In [4]:
!ls /media/data/datasets/biomed/EMBEDDINGS/

BERT
MedAll_300_fromScratch.bin
Otzovik_200_fromScratch.bin
PubMedVec.bin
idf_genes_phc.txt
medline_abstract_word2vec_2012.bin
medline_abstract_word2vec_2012.txt
medline_abstract_word2vec_2012_mapped.txt
medline_abstract_word2vec_2013
medline_abstract_word2vec_2013.bin
medline_abstract_word2vec_2013.syn0.npy
medline_abstract_word2vec_2013.syn1neg.npy
medline_abstract_word2vec_2013.txt
medline_abstract_word2vec_2013_mapped.txt
medline_abstract_word2vec_2013_modified.bin
medline_abstract_word2vec_2014
medline_abstract_word2vec_2014.bin
medline_abstract_word2vec_2014.syn0.npy
medline_abstract_word2vec_2014.syn1neg.npy
medline_abstract_word2vec_2014.txt
medline_abstract_word2vec_2014_mapped.txt
medline_abstract_word2vec_2014_modified.bin
mesh_single_terms.txt
