In [1]:
import os 
import re
import string
import random 
import time 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

%matplotlib inline 
import warnings
warnings.filterwarnings('ignore')


import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from wordcloud import WordCloud

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.layers import Flatten, Embedding, Dropout
from tensorflow.keras.layers import Conv1D, SpatialDropout1D
from tensorflow.keras.layers import Dense, Input 
from tensorflow.keras.layers import GlobalMaxPooling2D, GlobalAveragePooling1D
from tensorflow.keras.layers import LSTM, Dropout, GRU, Bidirectional

  import pandas.util.testing as tm


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
np.random.seed(42)
tf.random.set_seed(42)

In [3]:
path = '/content/drive/My Drive/Deep Learning - Projetos/Classificação de Texto - Twitter /training.1600000.processed.noemoticon.csv'
data = pd.read_csv(path, encoding='latin', header=None)

In [4]:
data.columns = ['sentiment', 'id', 'date', 'query', 'user_id', 'text']
data.head()

Unnamed: 0,sentiment,id,date,query,user_id,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
data.drop(['id', 'date', 'query', 'user_id'], axis=1, inplace=True)

In [6]:
# Regex sub

text_cleaning_re = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
texto = 'Felipe@ foi a escola@#4 ontem*['

limpeza = re.sub(text_cleaning_re ,' ', texto)

print(texto,'\n',limpeza)

Felipe@ foi a escola@#4 ontem*[ 
 Felipe foi a escola  ontem 


In [7]:
# Stemmer and Stopwords
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')

In [8]:
text_cleaning_re = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"


def preprocess(text, stem=False):
  text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
  tokens = []
  for token in text.split():
    if token not in stop_words:
      if stem:
        tokens.append(stemmer.stem(token))
      else: 
        tokens.append(token)
  return " ".join(tokens)

In [9]:
data['text'] = data['text'].apply(lambda x: preprocess(x, stem=False))

In [10]:
# train and test 

X = data['text']
y = data['sentiment']


X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=42)


encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

print('Train: {}'.format(X_train.shape))
print('Teste: {}'.format(X_test.shape))

Train: (1120000,)
Teste: (480000,)


In [11]:
# Tokenizador 
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Índice de palavras 
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
print('Vocabulary size: {}'.format(vocab_size))

Vocabulary size: 266578


In [12]:
max_sequence_length = 50 

sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

X_train = pad_sequences(sequences_train, maxlen=max_sequence_length, padding='post')
X_test = pad_sequences(sequences_test, maxlen=max_sequence_length, padding='post')


print('Maior Sequência: {}'.format(len(max(data.text))))
print('Sequência definida: {}'.format(max_sequence_length))

Maior Sequência: 58
Sequência definida: 50


In [13]:
# identifiando sentimentos [Positivo = 1 | Negativo = 0]
for x,y in zip(y_train[0:3], X_train[0:3]):
  print('Sentiment {} ----- {}'.format(x,y))

Sentiment 1 ----- [96797  1485  5501   300  1485   201  1407   386   520  6258    25   134
  1014  2632    20   748   201   599     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]
Sentiment 0 ----- [  136 65783 39168  1390   254  3908   496  2087  1276     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]
Sentiment 0 ----- [   23   194   114 28091  6007   488    28   200   146 28091     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


In [14]:
# índice de palavras
word_index

{'good': 1,
 'day': 2,
 'get': 3,
 'like': 4,
 'go': 5,
 'quot': 6,
 'today': 7,
 'work': 8,
 'going': 9,
 'love': 10,
 'got': 11,
 'lol': 12,
 'time': 13,
 'back': 14,
 'u': 15,
 'one': 16,
 'know': 17,
 'im': 18,
 'really': 19,
 'amp': 20,
 'see': 21,
 'night': 22,
 'still': 23,
 '2': 24,
 'well': 25,
 'new': 26,
 'want': 27,
 'think': 28,
 'home': 29,
 'thanks': 30,
 'oh': 31,
 'much': 32,
 'miss': 33,
 'need': 34,
 'last': 35,
 'morning': 36,
 'tomorrow': 37,
 'hope': 38,
 'great': 39,
 'twitter': 40,
 '3': 41,
 'haha': 42,
 'feel': 43,
 'sad': 44,
 'fun': 45,
 'wish': 46,
 'sleep': 47,
 'right': 48,
 'would': 49,
 'bad': 50,
 'happy': 51,
 'sorry': 52,
 'tonight': 53,
 'come': 54,
 'make': 55,
 'way': 56,
 'getting': 57,
 'gonna': 58,
 'though': 59,
 'nice': 60,
 'better': 61,
 'watching': 62,
 'yeah': 63,
 'bed': 64,
 'wait': 65,
 'could': 66,
 'week': 67,
 'people': 68,
 'school': 69,
 'hate': 70,
 'hey': 71,
 'days': 72,
 'even': 73,
 '4': 74,
 'next': 75,
 'yes': 76,
 'weekend

In [15]:
# De texto para token
for x,y in zip(data.text[20:22], sequences_test):
  print('{}. --> {}.'.format(x,y))

day get much done. --> [694, 38, 96].
one friend called asked meet mid valley today time sigh. --> [110, 138, 1706, 25597, 24].


<br>
<hr>
<br>


### Embedding - Glove 


<br>

In [16]:
# Word Embedding at Stanford
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2020-08-31 01:11:15--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-08-31 01:11:16--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-08-31 01:11:16--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2020

In [17]:
# Glove Embedding
GLOVE_EMB = '/content/glove.6B.300d.txt'
EMBEDDING_DIM = 300 

calculamos um índice de mapeamento de palavras para embeddings conhecidos, analisando o despejo de dados de embeddings pré-treinados

In [18]:
# preparing Embedding 

embeddings_index = {}

f = open(GLOVE_EMB)

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Encontrado {} vetores de palavras.'.format(len(embeddings_index)))

Encontrado 400000 vetores de palavras.


podemos aproveitar nosso embedding_index dicionário e nosso word_index para calcular nossa matriz de incorporação.


* Criando Matrix de Embedding (Glove) 

<br>

In [19]:
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

for word, i in word_index.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

Carregamos essa matriz de incorporação (Embedding Matrix) em uma Embedding camada. Observe que definimos <b>trainable=False</b> para evitar que os pesos sejam atualizados durante o treinamento.


* trainable definido como true, os pesos seriam atualizados do modelo Glove

<br>

In [31]:
# Embedding layer (Glove)

embedding_layer = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=False)



# Embedding layer (Not pre-trained)
"""
embedding_layer = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            input_length=max_sequence_length)"""

In [32]:
# LSTM 

model = Sequential()
model.add(Input(shape=max_sequence_length))
model.add(embedding_layer)
model.add(SpatialDropout1D(0.20))
model.add(LSTM(units=64, recurrent_dropout=0.20))
model.add(Dense(units=512, activation='relu'))
model.add(Dropout(0.20))
model.add(Dense(units=512, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 300)           79973400  
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 50, 300)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dense_14 (Dense)             (None, 512)               33280     
_________________________________________________________________
dropout_5 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_16 (Dense)             (None, 1)                

In [36]:
%%time


model.compile(optimizer=Adam(0.001),
              loss=BinaryCrossentropy(),
              metrics=['accuracy'])


history = model.fit(X_train, y_train,
                    batch_size=1024,
                    epochs=5,
                    validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 2h 42min 55s, sys: 8min, total: 2h 50min 55s
Wall time: 1h 29min 48s


<br>
<hr>
<hr>
<br>