In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!ln --symbolic /content/drive/MyDrive/UnB/PLN/projeto_final/ /dir
%cd /dir/.
!ls .

/content/drive/.shortcut-targets-by-id/1yd_wnDhuc_GPjRYs6OAA3jtLMaLl8SXF/UnB/PLN/projeto_final
LSTM_Classifier		       Mental-Health-Twitter-pre-processed.csv
Mental-Health-Twitter.csv      projeto_final
Mental-Health-Twitter.csv.zip


# Imports

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Embedding, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D, BatchNormalization
from keras.utils.np_utils import to_categorical
from tensorflow.keras.optimizers import Adam

from keras.utils.np_utils import to_categorical
from sklearn.metrics import classification_report

In [4]:
df = pd.read_csv('Mental-Health-Twitter-pre-processed.csv')
df = df.drop('Unnamed: 0', axis=1)

In [5]:
df.columns

Index(['post_id', 'post_created', 'post_text', 'user_id', 'followers',
       'friends', 'favourites', 'statuses', 'retweets', 'label', 'is_retweet',
       'tweet_num_words'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19929 entries, 0 to 19928
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   post_id          19929 non-null  int64 
 1   post_created     19929 non-null  object
 2   post_text        19929 non-null  object
 3   user_id          19929 non-null  int64 
 4   followers        19929 non-null  int64 
 5   friends          19929 non-null  int64 
 6   favourites       19929 non-null  int64 
 7   statuses         19929 non-null  int64 
 8   retweets         19929 non-null  int64 
 9   label            19929 non-null  int64 
 10  is_retweet       19929 non-null  int64 
 11  tweet_num_words  19929 non-null  int64 
dtypes: int64(10), object(2)
memory usage: 1.8+ MB


In [7]:
random_seed = 168
x_train, x_rest, y_train, y_rest = train_test_split(df['post_text'], df['label'], random_state = random_seed, test_size=0.2, shuffle=True)

In [8]:
x_valid, x_test, y_valid, y_test = train_test_split(x_rest, y_rest, random_state = random_seed, test_size=0.5, shuffle=True)

In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

In [10]:
train_sequences = tokenizer.texts_to_sequences(x_train)

Em um notebook anterior, o tamanho estimado para os vetores na camada de embedding já tinha sido estimado (em 27), logo, o mesmo tamanho será usado aqui 

In [11]:
max_len = 27
trunc_type = 'post'
padding_type = 'post'
train_padded = pad_sequences(train_sequences, maxlen= max_len, padding=padding_type, truncating=trunc_type)

In [12]:
valid_sequences = tokenizer.texts_to_sequences(x_valid)
test_sequences = tokenizer.texts_to_sequences(x_test)

valid_padded = pad_sequences(valid_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

## Tratando a saida

In [13]:
y_train = pd.Categorical(y_train)
y_train_int = y_train.codes

y_valid = pd.Categorical(y_valid)
y_valid_int = y_valid.codes

y_test = pd.Categorical(y_test)
y_test_int = y_test.codes

In [14]:
y_train = to_categorical(y_train_int)
y_valid = to_categorical(y_valid_int)
y_test = to_categorical(y_test_int)

# Construindo e Treinando a Rede

In [30]:
model = Sequential(layers = [
    Input(shape=[max_len]),
    Embedding(input_dim = len(tokenizer.word_counts), output_dim = 512),
    Conv1D(kernel_size = 3, filters = 128, activation='relu', padding='same', strides=4),
    BatchNormalization(),
    MaxPooling1D(pool_size=3, strides=2),

    Conv1D(kernel_size = 6, filters = 128, activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling1D(pool_size=3, strides=2),

    Conv1D(kernel_size = 12, filters = 128, activation='relu', padding='same'),
    # MaxPooling1D(pool_size=3, strides=2),
    GlobalMaxPooling1D(),

    Dense(units=512, activation='relu'),
    Dense(units=256, activation='relu'),
    Dense(units=16,  activation='relu'),
    Dense(units=2,   activation='softmax'),
], name = "RNN_Classifier")

model.summary()

Model: "RNN_Classifier"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 27, 512)           8287744   
                                                                 
 conv1d_27 (Conv1D)          (None, 7, 128)            196736    
                                                                 
 batch_normalization_18 (Bat  (None, 7, 128)           512       
 chNormalization)                                                
                                                                 
 max_pooling1d_18 (MaxPoolin  (None, 3, 128)           0         
 g1D)                                                            
                                                                 
 conv1d_28 (Conv1D)          (None, 3, 128)            98432     
                                                                 
 batch_normalization_19 (Bat  (None, 3, 128)        

In [31]:
adam_optimizer = Adam(learning_rate=0.000001)
model.compile(loss="categorical_crossentropy",optimizer=adam_optimizer,metrics=['accuracy'])
model.fit(train_padded, y_train, validation_data=(valid_padded, y_valid), epochs=250)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

<keras.callbacks.History at 0x7f99162b9dd0>

In [32]:
y_prob = model.predict(test_padded)
# y_pred = np.argmax(y_prob,axis=1)
y_pred = np.where(y_prob >= 0.5, 1, 0)

In [33]:
print(classification_report(y_test,y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.66      0.65      0.65      1017
           1       0.64      0.65      0.64       976

   micro avg       0.65      0.65      0.65      1993
   macro avg       0.65      0.65      0.65      1993
weighted avg       0.65      0.65      0.65      1993
 samples avg       0.65      0.65      0.65      1993

