# Imports

In [1]:
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras import models,layers
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

2022-03-21 11:15:48.672537: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-21 11:15:48.673122: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# General

In [2]:
# Load the encoded tweet dataset: the raw tweet text is the model input and
# the sentiment code is one-hot encoded to serve as the training target.
df = pd.read_csv('../thermofeeler/data/encoded_df.csv')
X, y = df['tweet_text'], pd.get_dummies(df['encoded_sentiment'])

In [3]:
# Stopwords to discard: articles, prepositions, conjunctions, pronouns and a
# few chat abbreviations. Built once as a frozenset so that each membership
# test is O(1) instead of re-creating and scanning a list for every tweet.
_STOPWORDS = frozenset([
    'a','te','tu','tua','tuas','tém','um','uma','você','vocês','vos','à','às','ao','aos',
    'aquela','aquelas','aquele','aqueles','aquilo','as','até','com','como','da','das','de',
    'dela','delas','dele','deles','depois','do','dos','e','ela','elas','ele','eles','em',
    'entre','essa','essas','esse','esses','esta','eu','foi','fomos','for','fora','foram',
    'forem','formos','fosse','fossem','fui','fôramos','fôssemos', 'isso','isto','já','lhe',
    'lhes','me','mesmo','meu','meus','minha','minhas','muito','na','nas','no','nos','nossa',
    'nossas','nosso','nossos','num','numa','nós','o','os','para','pela','pelas','pelo','pelos',
    'por','qual','quando','que','quem','se','seja','sejam','sejamos','sem','serei','seremos',
    'seria','seriam','será','serão','seríamos','seu','seus','somos','sou','sua','suas','são',
    'só','também','ah','q','g','oh','eh','vc','tbm','também','tambem','voceh','você','voce',
])

# Patterns are compiled once at definition time and reused for every tweet.
_URL_RE = re.compile(r'https?://\S+')        # links starting with http:// or https://
_MENTION_RE = re.compile(r'@[A-Za-z0-9_]+')  # @mentions
_PUNCT_RE = re.compile(r'[^\w\s]')           # anything that is not a word char or whitespace
_DIGIT_RE = re.compile(r'[0-9]')             # ASCII digits


def preproc_func(tweet):
    '''Clean a raw tweet and return its tokens with stopwords removed.

    Steps, in order: lowercase; strip links, @mentions and '#' signs; drop
    punctuation and ASCII digits; tokenize with ``word_tokenize``; discard
    every token found in the stopword set.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    '''
    tweet = tweet.lower()

    # A single pattern handles both http:// and https://. The link is matched
    # up to the next whitespace so that characters such as '-', '_', '?' and
    # '=' do not cut the match short and leave URL fragments behind as tokens.
    tweet = _URL_RE.sub('', tweet)

    tweet = _MENTION_RE.sub('', tweet)
    tweet = tweet.replace('#', '')  # keep the hashtag word, drop only the '#'

    tweet = _PUNCT_RE.sub('', tweet)
    tweet = _DIGIT_RE.sub('', tweet)

    return [token for token in word_tokenize(tweet) if token not in _STOPWORDS]

In [5]:
# Preprocess every tweet. The list is always rebuilt from the untouched
# ``df['tweet_text']`` column rather than from ``X`` itself, so this cell can
# be re-run safely: feeding the already-tokenized ``X`` back into
# ``preproc_func`` would fail because a list has no ``.lower()``.
tweet_list = [preproc_func(text) for text in df['tweet_text']]
X = tweet_list

In [6]:
# Hold out 20% of the tweets for the final evaluation. The fixed
# ``random_state`` makes the split (and every score computed from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Testing with Embedding

In [7]:
# Fit the Keras tokenizer on the training tweets only, so the vocabulary is
# built without looking at the test split.
tk = Tokenizer()
tk.fit_on_texts(X_train)

# Number of distinct tokens the tokenizer indexed.
vocab_size = len(tk.word_index)
vocab_size

77338

In [8]:
# Map each training tweet to its sequence of vocabulary indices, then pad
# every sequence at the end (padding='post') to a fixed length of 45.
X_train_token = tk.texts_to_sequences(X_train)
X_train_embedding = pad_sequences(
    X_train_token,
    maxlen=45,
    padding='post',
    dtype='float32',
)

In [9]:
# Padded sequence length, read back from the second axis of the padded
# training matrix.
max_len = X_train_embedding.shape[1]
max_len

45

In [13]:
def init_model(learning_rate=0.001):
    '''Build and compile the Embedding + LSTM sentiment classifier.

    The vocabulary size and padded sequence length are taken from the
    notebook-level ``vocab_size`` and ``max_len`` variables at call time.

    Parameters
    ----------
    learning_rate : float
        Learning rate handed to the RMSprop optimizer.

    Returns
    -------
    A compiled ``Sequential`` model ending in a 3-unit softmax layer.
    '''
    network = models.Sequential([
        # input_dim is vocab_size + 1 so that index 0 (the padding value) has
        # its own row; mask_zero=True marks those padded steps as masked.
        layers.Embedding(
            input_dim=vocab_size + 1,
            input_length=max_len,
            output_dim=100,
            mask_zero=True,
        ),
        layers.LSTM(50),
        layers.Dense(20, activation='relu'),
        layers.Dense(3, activation='softmax'),
    ])

    network.compile(
        loss='categorical_crossentropy',
        optimizer=RMSprop(learning_rate=learning_rate),
        metrics=['accuracy'],
    )

    return network

In [14]:
# Stop once the monitored validation loss has failed to improve for 3 epochs
# in a row and roll the model back to its best weights; ``epochs`` is
# therefore only an upper bound.
es = EarlyStopping(patience=3, restore_best_weights=True, verbose=1)

model = init_model(learning_rate=0.01)

# ``use_multiprocessing`` is intentionally not passed: Keras documents it as
# applying to generator / ``Sequence`` input only, so it has no effect on
# in-memory arrays, and recent Keras releases reject it in ``fit()``.
history = model.fit(
    x=X_train_embedding,
    y=y_train,
    validation_split=0.2,
    batch_size=256,
    epochs=1000,
    callbacks=[es],
    verbose=1,
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000


KeyboardInterrupt



In [None]:
# Encode the held-out tweets with the tokenizer fitted on the training data,
# pad them to the same length as the training matrix, and score the model.
X_test_token = tk.texts_to_sequences(X_test)
X_test_embedding = pad_sequences(
    X_test_token,
    maxlen=max_len,
    padding='post',
    dtype='float32',
)
model.evaluate(x=X_test_embedding, y=y_test)

In [None]:
# Persist the fitted tokenizer so the exact same word-to-index mapping can be
# reloaded later (``pickle`` is imported in the imports cell at the top).
# NOTE: only unpickle this file from a trusted location -- loading a pickle
# can execute arbitrary code.
with open('../tokenizer.pickle', 'wb') as handle:
    pickle.dump(tk, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# Persist the trained network through Keras' own ``model.save``; the
# joblib.dump alternative tried earlier is not used.
model.save('../model')

# Testing with Word2Vec

In [None]:
# Train Word2Vec on the tokenized training tweets; min_count=10 leaves words
# seen fewer than 10 times out of the vocabulary.
w2v = Word2Vec(
    sentences=X_train,
    min_count=10,
)

# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    '''Turn a tokenized sentence into an array of word vectors.

    Parameters
    ----------
    word2vec : object exposing a ``wv`` mapping
        Supplies the vector for each known word via ``word2vec.wv[word]``.
    sentence : iterable of str
        The tokens of one sentence.

    Returns
    -------
    numpy.ndarray
        One row per token found in ``word2vec.wv``, in sentence order; tokens
        missing from the vocabulary are skipped. The array is empty when no
        token is known.
    '''
    known_vectors = [
        word2vec.wv[token]
        for token in sentence
        if token in word2vec.wv
    ]
    return np.array(known_vectors)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    '''Embed every sentence of a collection with ``embed_sentence``.

    Parameters
    ----------
    word2vec : object
        Passed unchanged to ``embed_sentence`` for each sentence.
    sentences : iterable
        Tokenized sentences to embed.

    Returns
    -------
    list
        One ``embed_sentence`` result per input sentence, in the same order.
    '''
    return [embed_sentence(word2vec, tokens) for tokens in sentences]

# Embed the training and test sentences
X_train_embed = embedding(w2v, X_train)
X_test_embed = embedding(w2v, X_test)


# Pad the embedded sentences to a fixed number of word vectors.
# The length deliberately gets its own name: ``max_len`` already holds the
# padded length of the token-index model (read by ``init_model`` and by that
# model's test-set padding), so overwriting it here would silently change
# those cells whenever they are re-run after this one.
max_len_wv = 50
X_train_wv = pad_sequences(X_train_embed, dtype='float32', padding='post', maxlen=max_len_wv)
X_test_wv = pad_sequences(X_test_embed, dtype='float32', padding='post', maxlen=max_len_wv)

In [None]:
# Per-sample shape of the padded Word2Vec data (every axis except the first).
input_shape = np.shape(X_test_wv)[1:]

In [None]:
def init_model2(learning_rate=0.001):
    '''Build and compile the stacked-LSTM classifier fed with word vectors.

    Parameters
    ----------
    learning_rate : float
        Learning rate handed to the RMSprop optimizer.

    Returns
    -------
    A compiled ``Sequential`` model ending in a 3-unit softmax layer.
    '''
    network = models.Sequential([
        # Masking with its default settings, placed first so the recurrent
        # layers receive a mask for the padded timesteps.
        layers.Masking(),
        # The first LSTM returns the full sequence so the second can consume it.
        layers.LSTM(100, return_sequences=True),
        layers.LSTM(80),
        layers.Dense(50, activation='relu'),
        layers.Dense(3, activation='softmax'),
    ])

    network.compile(
        loss='categorical_crossentropy',
        optimizer=RMSprop(learning_rate=learning_rate),
        metrics=['accuracy'],
    )

    return network

In [None]:
# Stop once the monitored validation loss has failed to improve for 3 epochs
# in a row and roll the model back to its best weights; ``epochs`` is
# therefore only an upper bound.
es = EarlyStopping(patience=3, restore_best_weights=True, verbose=1)

model2 = init_model2()

# ``use_multiprocessing`` is intentionally not passed: Keras documents it as
# applying to generator / ``Sequence`` input only, so it has no effect on
# in-memory arrays, and recent Keras releases reject it in ``fit()``.
history = model2.fit(
    x=X_train_wv,
    y=y_train,
    validation_split=0.2,
    batch_size=128,
    epochs=1000,
    callbacks=[es],
    verbose=1,
)

In [None]:
# Score the Word2Vec-based model on the held-out tweets.
model2.evaluate(x=X_test_wv, y=y_test)