<a href="https://colab.research.google.com/github/CYaiche/Machine_Learning/blob/master/projet5/embedding_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Supervised topic modeling : NN approach



In [130]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    output_dir = "/content/drive/MyDrive/OpenClassroom/"
    # !pip install bertopic
    IN_COLAB = True
except:
    IN_COLAB = False
    output_dir = "./output/"

In [133]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import gensim.corpora as corpora
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

from sklearn.preprocessing import MultiLabelBinarizer
from keras.utils import pad_sequences
from sklearn.metrics import jaccard_score, average_precision_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Flatten,
    Embedding,
    Dense,
    Input,
    Embedding,
    GlobalAveragePooling1D,
)

# from sentence_transformers import SentenceTransformer
# from bertopic import BERTopic
# from bertopic.vectorizers import ClassTfidfTransformer
# from bertopic.dimensionality import BaseDimensionalityReduction
# from sklearn.linear_model import LogisticRegression

In [135]:
print("Python version used : ", sys.version[:7])
print("Tensorflow  : ", tf.version.VERSION)

Python version used :  3.8.10 
Tensorflow  :  2.12.0


In [161]:
X_corpus_train = np.load(f"{output_dir}X_corpus_train.npy", allow_pickle=True)
X_title_train = np.load(f"{output_dir}X_title_train.npy", allow_pickle=True)
X_body_train = np.load(f"{output_dir}X_body_train.npy", allow_pickle=True)

X_corpus_test = np.load(f"{output_dir}X_corpus_test.npy", allow_pickle=True)
X_title_test = np.load(f"{output_dir}X_title_test.npy", allow_pickle=True)
X_body_test = np.load(f"{output_dir}X_body_test.npy", allow_pickle=True)

y_train = np.load(f"{output_dir}y_train.npy", allow_pickle=True)
y_test = np.load(f"{output_dir}y_test.npy", allow_pickle=True)

label_list = np.load(f"{output_dir}/label_list.npy", allow_pickle=True)

X_train = [
    np.append(X_title_train[i], X_body_train[i]) for i in range(len(X_title_train))
]
X_test = [
    np.append(X_title_test[i], X_body_test[i]) for i in range(len(X_title_test))
]

In [4]:

mlb = MultiLabelBinarizer()
y_train_b = mlb.fit_transform(y_train)
y_test_b = mlb.transform(y_test)

# Neural Networks

Multilayers perceptron (MLP) for multi-label classification

loss funstion : binary cross-entropy loss function

activation function : ReLU in the hidden layers

adam version of stochastic gradient descent

# Embedding preprocessing

In [5]:
X_train_list = [ x.tolist() for x in X_train]

In [6]:
#  Word2Vec model creation
w2v_size=300
w2v_window=5
w2v_min_count=1
w2v_epochs=100
maxlen = 24 # adapt to length of sentences

sentences = X_train_list

print("Build & train Word2Vec model ...")
w2v_model = Word2Vec(min_count=w2v_min_count, window=w2v_window,
                                                vector_size=w2v_size,
                                                seed=42,
                                                workers=1)

Build & train Word2Vec model ...


The reason for separating the trained vectors into KeyedVectors is that if you don’t need the full model state any more (don’t need to continue training), the state can discarded, resulting in a much smaller and faster object that can be mmapped for lightning fast loading and sharing the vectors in RAM between processes:

Gensim can also load word vectors in the “word2vec C format”, as a KeyedVectors instance:

In [7]:
try:
    model_vectors = KeyedVectors.load(f"{output_dir}model_vectors.wv", mmap="r")
except:
    w2v_model.build_vocab(sentences)
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_epochs)
    model_vectors = w2v_model.wv
    w2v_words = model_vectors.index_to_key
    print("Vocabulary size: %i" % len(w2v_words))
    print("Word2Vec trained")

    model_vectors.save(f"{output_dir}model_vectors.wv")

Vocabulary size: 39350
Word2Vec trained


In [8]:
print(w2v_model)

Word2Vec<vocab=39350, vector_size=300, alpha=0.025>


In [9]:
id2word = corpora.Dictionary(X_train)
d = dict(zip(label_list, range(0,len(label_list))))

In [81]:
id2word = corpora.Dictionary(X_train)
x_train_ids = [ id2word.doc2idx(tokens) for tokens in X_train]

x_test_ids = [ id2word.doc2idx(tokens) for tokens in X_test  ]
# Filter out -1 from x_test_ids
filtered_x_test_ids = [[word_id for word_id in sentence if word_id != -1] for sentence in x_test_ids]



In [11]:
max_length = np.max([ len(x) for x in x_train_ids])
print(f"max_length : {max_length}")

max_length : 1271


# Embedding matrix

In [12]:
len(model_vectors[0])

300

In [13]:
word2vec_vocab_size = len(model_vectors.key_to_index)
w2vec_dim = len(model_vectors[0])
word2vec_embedding_matrix = np.zeros((word2vec_vocab_size, w2vec_dim))

for word in model_vectors.key_to_index:
    embedding_vector = model_vectors[word]
    if embedding_vector is not None:
        idx = model_vectors.key_to_index[word]
        word2vec_embedding_matrix[idx] = embedding_vector

print("Embedding matrix: %s" % str(word2vec_embedding_matrix.shape))

Embedding matrix: (39350, 300)


# Apply padding

In [221]:
x_train_pad = pad_sequences(x_train_ids, maxlen=max_length, padding='pre',value=0)
x_test_pad = pad_sequences(filtered_x_test_ids, maxlen=max_length, padding='pre',value=0)


In [222]:
model = Sequential()
# model.add(Input(shape=(1,max_length),dtype='float64'))
model.add(Embedding( word2vec_vocab_size, w2vec_dim,weights=[word2vec_embedding_matrix],  input_length=max_length))
model.add(GlobalAveragePooling1D())
model.add(Dense(30,activation='sigmoid'))
print(model.summary())

Model: "sequential_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 1271, 300)         11805000  
                                                                 
 global_average_pooling1d_15  (None, 300)              0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_28 (Dense)            (None, 30)                9030      
                                                                 
Total params: 11,814,030
Trainable params: 11,814,030
Non-trainable params: 0
_________________________________________________________________
None


In [223]:
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])

In [224]:
model.fit(x_train_pad, y_train_b, epochs=5)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a526d0deb0>

In [228]:
y_pred = model.predict(x_test_pad)
y_pred_nn = (y_pred > 0.01).astype(np.float32)



In [229]:

precision_w2v      = average_precision_score(y_test_b, y_pred_nn, average='micro')
jaccard_score_w2v = jaccard_score(y_test_b, y_pred_nn, average='micro')

The vocab is the number of unique words in my train data.
The size is the dimension in output of my embedding.

# USE : Universal Sentence Encoder 
encodes into high-dimensional vectors, trained on variety of data and output 512 dimensional vector

# apply padding to words 

In [122]:
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [201]:
X_train_use_embedding = embed(X_corpus_train)
X_test_use_embedding = embed(X_corpus_test)

In [213]:
use_model = Sequential()
use_model.add(Dense(256, activation='relu', input_dim=512))
use_model.add(Dense(128, activation='relu'))
use_model.add(Dense(30,activation='sigmoid'))
# Compile the model
use_model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])
use_model.summary()

Model: "sequential_27"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_25 (Dense)            (None, 256)               131328    
                                                                 
 dense_26 (Dense)            (None, 128)               32896     
                                                                 
 dense_27 (Dense)            (None, 30)                3870      
                                                                 
Total params: 168,094
Trainable params: 168,094
Non-trainable params: 0
_________________________________________________________________


In [214]:
use_model.fit(X_train_use_embedding, y_train_b, epochs=5)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a5255af760>

In [219]:
y_pred_use = use_model.predict(X_test_use_embedding)
y_pred_use = (y_pred_use > 0.01).astype(np.float32)

 1/13 [=>............................] - ETA: 0s



In [220]:

precision_use      = average_precision_score(y_test_b, y_pred_use, average='micro')
jaccard_score_use = jaccard_score(y_test_b, y_pred_use, average='micro')

# BERT 

In [235]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.linear_model import LogisticRegression

# Get labeled data
data = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))
docs = data['data']
y = data['target']

# Skip over dimensionality reduction, replace cluster model with classifier,
# and reduce frequent words while we are at it.
empty_dimensionality_model = BaseDimensionalityReduction()
clf = LogisticRegression()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Create a fully supervised BERTopic instance
topic_model= BERTopic(
        umap_model=empty_dimensionality_model,
        hdbscan_model=clf,
        ctfidf_model=ctfidf_model
)
topics, probs = topic_model.fit_transform(docs, y=y)

ModuleNotFoundError: No module named 'bertopic'

In [98]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings_corpus = model.encode(X_body_train, show_progress_bar=True)
embeddings_titles = model.encode(X_title_train, show_progress_bar=True)




# Skip over dimensionality reduction, replace cluster model with classifier,
# and reduce frequent words while we are at it.
empty_dimensionality_model = BaseDimensionalityReduction()
clf = LogisticRegression()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Create a fully supervised BERTopic instance
topic_model= BERTopic(
        umap_model=empty_dimensionality_model,
        hdbscan_model=clf,
        ctfidf_model=ctfidf_model
)
topics, probs = topic_model.fit_transform(docs, y=y)

NameError: name 'SentenceTransformer' is not defined

In [None]:
print(model)

In [None]:
model = Word2Vec.load("word2vec.model")
model.wv.similarity('splint','tableview')

In [None]:
X_train_embedding = model.wv[X_train]
X_test_embedding = model.wv[X_test]

In [None]:
# Embedding preprocessing

In [None]:
word_model = Sequential()
word_model.add(Embedding(vocab_size, embed_size,
                         embeddings_initializer="glorot_uniform",
                         input_length=1))
word_model.add(Reshape((embed_size, )))

context_model = Sequential()
context_model.add(Embedding(vocab_size, embed_size,
                  embeddings_initializer="glorot_uniform",
                  input_length=1))
context_model.add(Reshape((embed_size,)))

model = Sequential()
model.add(Merge([word_model, context_model], mode="dot"))
model.add(Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid"))
model.compile(loss="mean_squared_error", optimizer="rmsprop")

# Evaluation 

In [233]:
avg_precision_scores = [precision_w2v, precision_use,]
jaccard_scores = [jaccard_score_w2v, jaccard_score_use]
result = pd.DataFrame(
    {
        'average_precision_scores' : avg_precision_scores,
        'jaccard_scores' : jaccard_scores
    }, index = ['Word2Vec','USE']
)
result.to_csv(f"{output_dir}result_supervised2.csv")

In [234]:
result

Unnamed: 0,average_precision_scores,jaccard_scores
Word2Vec,0.0485,0.0485
USE,0.09077,0.091183
