<a href="https://colab.research.google.com/github/CYaiche/Machine_Learning/blob/master/projet5/embedding_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Supervised topic modeling : NN approach



In [105]:
import numpy as np
import gensim.corpora as corpora
from  gensim.models import KeyedVectors
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings('ignore')

from keras.utils import pad_sequences
from sklearn.metrics import jaccard_score, average_precision_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Embedding, Dense, Input, Embedding


In [89]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [90]:

X_title_train = np.load("/content/drive/MyDrive/OpenClassroom/X_title_train.npy"     , allow_pickle=True)
X_corpus_train = np.load("/content/drive/MyDrive/OpenClassroom/X_corpus_train.npy",   allow_pickle=True)
X_title_test = np.load("/content/drive/MyDrive/OpenClassroom/X_title_test.npy"     , allow_pickle=True)
X_corpus_test = np.load("/content/drive/MyDrive/OpenClassroom/X_corpus_test.npy",   allow_pickle=True)

y_train = np.load("/content/drive/MyDrive/OpenClassroom/y_train.npy"     , allow_pickle=True)
y_test = np.load("/content/drive/MyDrive/OpenClassroom/y_test.npy",   allow_pickle=True)

label_list = np.load("/content/drive/MyDrive/OpenClassroom/label_list.npy"     , allow_pickle=True)

X_train = [ np.append(X_title_train[i], X_corpus_train[i]) for i in range (len( X_title_train)) ]
X_test = [ np.append(X_title_test[i], X_corpus_test[i]) for i in range (len( X_title_test)) ]


In [91]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_train_b = mlb.fit_transform(y_train)
y_test_b = mlb.transform(y_test)

# Neural Networks

Multilayers perceptron (MLP) for multi-label classification

loss funstion : binary cross-entropy loss function

activation function : ReLU in the hidden layers

adam version of stochastic gradient descent

# Embedding preprocessing

In [92]:
X_train_list = [ x.tolist() for x in X_train]

In [93]:
# Création et entraînement du modèle Word2Vec
w2v_size=300
w2v_window=5
w2v_min_count=1
w2v_epochs=100
maxlen = 24 # adapt to length of sentences

sentences = X_train_list

print("Build & train Word2Vec model ...")
w2v_model = Word2Vec(min_count=w2v_min_count, window=w2v_window,
                                                vector_size=w2v_size,
                                                seed=42,
                                                workers=1)

Build & train Word2Vec model ...


The reason for separating the trained vectors into KeyedVectors is that if you don’t need the full model state any more (don’t need to continue training), the state can discarded, resulting in a much smaller and faster object that can be mmapped for lightning fast loading and sharing the vectors in RAM between processes:

Gensim can also load word vectors in the “word2vec C format”, as a KeyedVectors instance:

In [94]:
try :
  model_vectors = KeyedVectors.load("/content/drive/MyDrive/OpenClassroom/model_vectors.wv", mmap='r')
except :
  w2v_model.build_vocab(sentences)
  w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_epochs)
  model_vectors = w2v_model.wv
  w2v_words = model_vectors.index_to_key
  print("Vocabulary size: %i" % len(w2v_words))
  print("Word2Vec trained")

  model_vectors.save("/content/drive/MyDrive/OpenClassroom/model_vectors.wv")

In [95]:
print(w2v_model)

Word2Vec<vocab=0, vector_size=300, alpha=0.025>


In [96]:
id2word = corpora.Dictionary(X_train)
d = dict(zip(label_list, range(0,len(label_list))))

In [110]:
id2word = corpora.Dictionary(X_train)
x_train_ids = [ id2word.doc2idx(tokens) for tokens in X_train]
x_test_ids = [ id2word.doc2idx(tokens) for tokens in X_test]

In [98]:
max_length = np.max([ len(x) for x in x_train_ids])
print(f"max_length : {max_length}")

max_length : 2282


# Embedding matrix

In [103]:
len(model_vectors[0])

300

In [104]:
word2vec_vocab_size = len(model_vectors.key_to_index)
w2vec_dim = len(model_vectors[0])
word2vec_embedding_matrix = np.zeros((word2vec_vocab_size, w2vec_dim))

for word in model_vectors.key_to_index :

  embedding_vector = model_vectors[word]
  if embedding_vector is not None :
    idx =  model_vectors.key_to_index[word]
    word2vec_embedding_matrix[idx] = embedding_vector

print("Embedding matrix: %s" % str(word2vec_embedding_matrix.shape))

Embedding matrix: (41340, 300)


# Apply padding

In [109]:
x_train_pad = pad_sequences(x_train_ids, maxlen=max_length, padding='pre')
x_test_pad = pad_sequences(x_test_ids, maxlen=max_length, padding='pre')

In [111]:
x_train_pad[0]

array([ 0,  0,  0, ..., 71, 74, 79], dtype=int32)

In [116]:
model = Sequential()
# model.add(Input(shape=(1,max_length),dtype='float64'))
model.add(Embedding( word2vec_vocab_size, w2vec_dim,weights=[word2vec_embedding_matrix],  input_length=max_length))
model.add(Flatten())
model.add(Dense(30,activation='sigmoid'))
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 2282, 300)         12402000  
                                                                 
 flatten_2 (Flatten)         (None, 684600)            0         
                                                                 
 dense_5 (Dense)             (None, 30)                20538030  
                                                                 
Total params: 32,940,030
Trainable params: 32,940,030
Non-trainable params: 0
_________________________________________________________________
None


In [117]:
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])

In [114]:
y_train

array([list([8]), list([27]), list([2, 7, 14]), ..., list([3, 9]),
       list([17]), list([0])], dtype=object)

In [119]:
model.fit(x_train_pad, y_train_b, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb0e5b3b730>

In [120]:
y_pred = model.predict(x_test_pad)

InvalidArgumentError: ignored

In [None]:

y_pred_nn = (y_pred > 0.1).astype(np.float32)

In [None]:

precision_em      = average_precision_score(y_test_b, y_pred, average='micro')
jaccard_score_em = jaccard_score(y_test_b, y_pred, average='micro')

The vocab is the number of unique words in my train data.
The size is the dimension in output of my embedding.

In [None]:
print(model)

In [None]:
model = Word2Vec.load("word2vec.model")
model.wv.similarity('splint','tableview')

In [None]:
X_train_embedding = model.wv[X_train]
X_test_embedding = model.wv[X_test]

In [None]:
# Embedding preprocessing

In [None]:
word_model = Sequential()
word_model.add(Embedding(vocab_size, embed_size,
                         embeddings_initializer="glorot_uniform",
                         input_length=1))
word_model.add(Reshape((embed_size, )))

context_model = Sequential()
context_model.add(Embedding(vocab_size, embed_size,
                  embeddings_initializer="glorot_uniform",
                  input_length=1))
context_model.add(Reshape((embed_size,)))

model = Sequential()
model.add(Merge([word_model, context_model], mode="dot"))
model.add(Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid"))
model.compile(loss="mean_squared_error", optimizer="rmsprop")