<a href="https://colab.research.google.com/github/AliAkbarBadri/topics-for-types/blob/main/topics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Environment setup: install the LASER sentence-embedding package, fetch its
# pretrained models, then install Hugging Face transformers.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve newer releases.
! pip -q install laserembeddings
! python -m laserembeddings download-models
! pip -q install transformers

In [None]:
import pandas as pd
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer, AutoModel
from laserembeddings import Laser
from __future__ import unicode_literals

In [None]:
# Mount Google Drive at /content/drive; the CSV files read later in this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# Word2Vec + LSTM

In [None]:
# Read the topics table from the mounted Drive folder and preview its first rows.
df_topics_lstm = pd.read_csv(
    filepath_or_buffer="/content/drive/My Drive/topics-for-types/df_topics.csv"
)
df_topics_lstm.head(5)

In [None]:
# Column 0 is the model input (text fed to the tokenizer later on);
# every remaining column is used as a target.
X = df_topics_lstm.iloc[:, 0].values
Y = df_topics_lstm.iloc[:, 1:].values

In [None]:
# Hold out 10% of the rows for testing. The fixed `random_state` makes the
# split identical on every "Restart & Run All", so results are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.10, random_state=42
)

In [None]:
# Every sequence is padded / cut to this many token ids.
maxlen = 200

# Build the word index from the training texts only.
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(X_train)
# +1 because index 0 is never assigned to a word (it is the padding value).
vocab_size = 1 + len(tokenizer.word_index)

# Texts -> integer id sequences -> fixed-length arrays padded at the end.
X_train, X_test = [
    pad_sequences(tokenizer.texts_to_sequences(texts), padding='post', maxlen=maxlen)
    for texts in (X_train, X_test)
]

In [None]:
# Quick sanity check: vocabulary size plus the shapes of all four splits.
(
    vocab_size,
    X_train.shape,
    y_train.shape,
    X_test.shape,
    y_test.shape,
)

(8612, (2181, 200), (2181, 12), (243, 200), (243, 12))

In [None]:
# Width of each pretrained vector; must match `output_dim` of the Embedding layer.
embedding_dim = 100

# word -> float32 vector lookup built from the pretrained .vec text file.
embeddings_dictionary = dict()

# NOTE: this path is relative, so it resolves against the current working
# directory (the CSV files elsewhere in the notebook use absolute paths).
# `with` guarantees the handle is closed even if a line fails to parse.
with open('drive/My Drive/nlp/similarity/embeddings/twitt_wiki_ham_blog.fa.text.100.vec',
          encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        # Keep only well-formed "<word> <100 numbers>" lines. A blank line, or a
        # "<count> <dim>" header if the file has one, would otherwise raise or
        # store a wrong-length vector that numpy would broadcast across a whole
        # row of the embedding matrix below.
        if len(records) != embedding_dim + 1:
            continue
        embeddings_dictionary[records[0]] = np.asarray(records[1:], dtype='float32')

# Row i holds the pretrained vector of the word with tokenizer index i;
# words without a pretrained vector (and row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
# Frozen pretrained embeddings -> single LSTM -> 12 independent sigmoid outputs.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(maxlen,)),
    # The embedding table is seeded through `embeddings_initializer` rather than
    # the `weights=` constructor kwarg, which some Keras 3 releases reject.
    # `trainable=False` keeps the pretrained vectors fixed during training.
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=100,
                            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                            trainable=False, name='Embedding_Layer'),
    # NOTE(review): inputs are post-padded and no mask is applied, so the LSTM
    # also steps over the trailing padding ids — confirm this is intended.
    tf.keras.layers.LSTM(200,
                        # return_sequences=True,
                        # stateful=True,
                        recurrent_initializer='glorot_uniform',
                        name='LSTM_Layer'),
    tf.keras.layers.Dense(12, activation='sigmoid', name='Dense_Layer')
  ], name='LSTM_Model')
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['acc'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()
print()
# Show the weights of the first layer to confirm the pretrained matrix was loaded.
print(model.layers[0].weights)

In [None]:
# Train the LSTM classifier; 20% of the training rows are set aside for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=15,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Evaluate on the held-out split: score[0] is the loss, score[1] the 'acc' metric.
score = model.evaluate(X_test, y_test, verbose=1)

for label, value in zip(("Test Score:", "Test Accuracy:"), score):
    print(label, value)

# LASER

In [None]:
# Create the LASER sentence encoder with default arguments
# (presumably it picks up the models fetched by the download-models step — verify).
laser = Laser()

In [None]:
# Fresh copy of the topics table for the LASER experiment, with a preview.
df_topics_laser = pd.read_csv(
    filepath_or_buffer="/content/drive/My Drive/topics-for-types/df_topics.csv"
)
df_topics_laser.head(5)

In [None]:
# Embed all sentences in one batched call instead of one encoder call per row;
# `embed_sentences` accepts a list and returns one embedding row per sentence.
sentence_embeddings = laser.embed_sentences(df_topics_laser['sentence'].tolist(), lang='fa')

# Store one 1-D vector per row so the column can be stacked into a matrix later.
# NOTE: this overwrites the raw text, so the cell cannot be re-run without
# reloading `df_topics_laser` first.
df_topics_laser['sentence'] = list(sentence_embeddings)
df_topics_laser.head()

In [None]:
# Stack the per-row embedding vectors from column 0 into one 2-D feature matrix.
X = np.vstack(df_topics_laser.iloc[:, 0].to_numpy())

# All remaining columns are the targets.
Y = df_topics_laser.iloc[:, 1:].values

(X.shape, Y.shape)

In [None]:
# Hold out 10% of the LASER features for testing. The fixed `random_state`
# keeps the split stable across re-runs of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.10, random_state=42
)

In [None]:
# Row counts of the four splits, in the order X_train, X_test, y_train, y_test.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

In [None]:
# Small feed-forward classifier on top of the LASER sentence vectors:
# one hidden sigmoid layer, then 12 independent sigmoid outputs.
model = tf.keras.Sequential([
    # Input width is taken from the feature matrix (everything but the row axis).
    tf.keras.layers.Input(shape=X.shape[1:], name='input_layer'),
    tf.keras.layers.Dense(100, activation='sigmoid', name='Dense_Layer'),
    tf.keras.layers.Dense(12, activation='sigmoid', name='Dense_Layer2'),
  ], name='LASER_Model')

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Train the LASER classifier; 20% of the training rows are set aside for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=30,
    batch_size=8,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Evaluate on the held-out split; the first entry is the loss, the second the 'acc' metric.
score = model.evaluate(x=X_test, y=y_test, verbose=1)
test_loss, test_accuracy = score[0], score[1]

print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

In [None]:
def plot_training_history(history):
    """Draw train-vs-validation curves for accuracy and for loss.

    Parameters
    ----------
    history
        Object returned by ``model.fit``; its ``history`` dict must contain the
        keys ``acc``, ``val_acc``, ``loss`` and ``val_loss``.

    Returns
    -------
    None
        Two figures are shown, one per metric.
    """
    for metric, label in (('acc', 'accuracy'), ('loss', 'loss')):
        plt.plot(history.history[metric])
        plt.plot(history.history['val_' + metric])

        plt.title('model ' + label)
        plt.ylabel(label)
        plt.xlabel('epoch')
        # The second curve comes from `validation_split` during fit, not from
        # the held-out test set, so it is labelled accordingly.
        plt.legend(['train', 'validation'], loc='upper left')
        plt.show()


plot_training_history(history)

# BERT (ALBERT-fa-base-v2)

In [None]:
# One checkpoint id shared by the config, the tokenizer and the model weights.
checkpoint = "m3hrdadfi/albert-fa-base-v2"

config = AutoConfig.from_pretrained(checkpoint)
# NOTE: this rebinds `tokenizer`, which earlier referred to the Keras Tokenizer.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert_model = AutoModel.from_pretrained(checkpoint)

In [None]:
def get_embedding(sent):
    """Return the first-token embedding of `sent` from the model's first output.

    Parameters
    ----------
    sent : str
        Sentence to encode with the module-level `tokenizer`.

    Returns
    -------
    numpy.ndarray
        1-D vector taken at position 0 of the first (only) sequence in
        ``bert_model(...)[0]``.
    """
    # Make the 500-token cap explicit: recent transformers releases warn when
    # `max_length` is passed without a truncation strategy.
    token_ids = tokenizer.encode(sent, max_length=500, truncation=True)
    # Add a leading batch axis: shape (1, number_of_tokens).
    input_sentence = torch.tensor(token_ids).unsqueeze(0)

    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        out = bert_model(input_sentence)

    embeddings_of_last_layer = out[0]
    # [0] selects the only sequence in the batch, the second [0] its first token.
    cls_embeddings = embeddings_of_last_layer[0][0]
    return cls_embeddings.detach().numpy()

In [None]:
# df_topics_bert = pd.read_csv("/content/drive/My Drive/topics-for-types/df_topics.csv")

In [None]:
# df_topics_bert['sentence'] = df_topics_bert['sentence'].apply(lambda sent: get_embedding(sent))
# df_topics_bert.to_csv("/content/drive/My Drive/topics-for-types/df_topics_bert.csv", index=False)
# df_topics_bert.head()

In [None]:
def _parse_embedding(text):
    """Convert a stringified vector from the CSV back into a float array."""
    # Remove newlines and square brackets in one pass, then collapse double
    # spaces, leaving a space-separated list of numbers.
    cleaned = text.translate(str.maketrans('', '', '\n[]')).replace('  ', ' ')
    return np.fromstring(cleaned, sep=' ')


# The CSV stores each embedding as text, so the column is parsed after loading.
df_topics_bert = pd.read_csv("/content/drive/My Drive/topics-for-types/df_topics_bert.csv")
df_topics_bert['sentence'] = df_topics_bert['sentence'].apply(_parse_embedding)
df_topics_bert.head()

In [None]:
# Stack the parsed embedding vectors from column 0 into one 2-D feature matrix.
X = np.vstack(df_topics_bert.iloc[:, 0].to_numpy())

# All remaining columns are the targets.
Y = df_topics_bert.iloc[:, 1:].values

(X.shape, Y.shape)

In [None]:
# Hold out 20% of the embedded rows for testing. The fixed `random_state`
# keeps the split stable across re-runs of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

In [None]:
# Row counts of the four splits, in the order X_train, X_test, y_train, y_test.
tuple(map(len, (X_train, X_test, y_train, y_test)))

In [None]:
# Feed-forward classifier on top of the precomputed transformer embeddings:
# two ReLU layers followed by 12 independent sigmoid outputs.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=X[0].shape, name='input_layer'),
    # NOTE(review): `activity_regularizer` penalises the layer outputs, not the
    # weights — confirm `kernel_regularizer` was not the intent.
    tf.keras.layers.Dense(200, activation='relu', name='Dense_Layer',
                          activity_regularizer=tf.keras.regularizers.l2()),
    # tf.keras.layers.Dropout(rate=0.2, name="Dropout"),
    tf.keras.layers.Dense(100, activation='relu', name='Dense_Layer2',
                          activity_regularizer=tf.keras.regularizers.l2()),
    # tf.keras.layers.Dropout(rate=0.2, name="Dropout2"),
    tf.keras.layers.Dense(12, activation='sigmoid', name='Dense_Layer3'),

  # The name previously read 'LASER_Model', copied from the LASER section.
  ], name='BERT_Model')

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Train the classifier; 20% of the training rows are set aside for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    batch_size=8,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Evaluate on the held-out split; entry 0 is the loss, entry 1 the 'acc' metric.
score = model.evaluate(X_test, y_test, verbose=1)
loss_value, accuracy_value = score[0], score[1]

print("Test Score:", loss_value)
print("Test Accuracy:", accuracy_value)

In [None]:
# Plot train-vs-validation curves, one figure for accuracy and one for loss.
# `plt` comes from the notebook's import cell.
for metric, label in (('acc', 'accuracy'), ('loss', 'loss')):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])

    plt.title('model ' + label)
    plt.ylabel(label)
    plt.xlabel('epoch')
    # The second curve comes from `validation_split` during fit, not from the
    # held-out test set, so it is labelled accordingly.
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()