# Validate API Data

Validate the API data and create an emotion-labeled dataset.

In [55]:
from pathlib import Path
import pandas as pd

In [56]:
# Add project path to the PYTHONPATH

import os
import sys
from pathlib import Path

# Parent of the current working directory, resolved to an absolute POSIX-style
# string, so that imports can find modules that live one level up.
project_root = os.path.join(os.path.abspath(''), '../')
sys.path.append(Path(project_root).resolve().as_posix())

In [57]:
import json
from pathlib import Path

# Load Relations

Load the relations between queries and emotions

In [58]:
# Absolute location of the query -> emotion mapping, one level above the notebook
relations_path = (Path('..') / 'query_relations.json').resolve()

In [59]:
# Read the raw bytes and let json detect the encoding, as json.load does for a
# file opened in binary mode.
relations = json.loads(relations_path.read_bytes())

# Load Tokenizer

Load the tokenizer that was created during model training

In [60]:
import pickle

In [61]:
tokenizer_path = (Path('..') / 'datasets' / 'sentiment140' / 'tokenizer.pickle').resolve()

# NOTE(security): unpickling executes arbitrary code embedded in the file;
# only load a tokenizer pickle that comes from a trusted source.
tokenizer = pickle.loads(tokenizer_path.read_bytes())

# Load Model

Rebuild the model architecture and load the saved weights into it

In [62]:
from tensorflow.keras.layers import Input, Embedding, GRU
from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D
from tensorflow.keras.layers import Bidirectional, Dense
from tensorflow.keras.models import Sequential

In [63]:
# Vocabulary size fed to the Embedding layer: the tokenizer's word cap, but
# never more than the words it actually knows (+1 for index 0, which a Keras
# Tokenizer never assigns to a word -- assumes `tokenizer` is one; confirm).
# num_words is None when the tokenizer was built without a cap, and
# min(None, int) raises TypeError, so fall back to the full vocabulary.
vocab_size = len(tokenizer.word_index) + 1
input_dim = vocab_size if tokenizer.num_words is None else min(tokenizer.num_words, vocab_size)

# Architecture hyperparameters. The weights loaded later only fit a model
# built with the same values that were used when they were saved.
embedding_dim = 200
input_length = 100
gru_units = 128
gru_dropout = 0.1
recurrent_dropout = 0.1
dropout = 0.1

In [64]:
# Recurrent layer that gets wrapped by Bidirectional below; it returns the
# full sequence so the pooling layer can reduce over it.
recurrent_layer = GRU(
    gru_units,
    return_sequences=True,
    dropout=gru_dropout,
    recurrent_dropout=recurrent_dropout,
)

# Layers listed in forward order, ending in a single sigmoid output unit.
model = Sequential([
    Embedding(
        input_dim=input_dim,
        output_dim=embedding_dim,
        input_shape=(input_length,),
    ),
    Bidirectional(recurrent_layer),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dropout(dropout),
    Dense(1, activation='sigmoid'),
])

In [65]:
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line after the table.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 200)          2000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 256)          252672    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 2,260,929
Trainable params: 2,260,929
Non-trainable params: 0
____________________________________________

In [66]:
# TODO: change name to model_weights
weights_path = (Path('..') / 'models' / 'sentiment_analysis' / 'gru_model.h5').resolve()

# load_weights is given the path as a plain string
model.load_weights(weights_path.as_posix())

# Group data by emotion

Use the emotions to group the data

In [67]:
import os
import re
import pandas as pd
from tqdm import tqdm

In [68]:
# Directory holding one CSV file per collected query
files_dir = (Path('..') / 'datasets' / 'tweepy').resolve()

In [70]:
# Collect every file's frame under its emotion first and concatenate once per
# emotion afterwards. Re-concatenating the growing frame for each file copies
# all previously loaded rows again, and the first round relied on pd.concat
# silently dropping a None entry.
frames_by_emotion = {}

filenames = os.listdir(files_dir)
with tqdm(total=len(filenames)) as t:
    for filename in filenames:
        # The query is embedded in the file name: either "#..." up to the
        # first dot, or "@...@" in which '@' stands in for ':'.
        # A file name that matches neither form raises IndexError here.
        query = re.findall(r'(#[^.]+|@.+@)', filename)[0].replace('@', ':')
        emotion = relations[query]

        file_data = pd.read_csv(os.path.join(files_dir, filename))
        frames_by_emotion.setdefault(emotion, []).append(file_data)
        t.update()

# One DataFrame per emotion, rows in the order the files were listed
emotion_data_dict = {
    emotion: pd.concat(frames)
    for emotion, frames in frames_by_emotion.items()
}

100%|██████████| 17/17 [00:00<00:00, 18.21it/s]


## Predict emotion and filter data

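For each emotion group created in the step above, predict a sentiment score for every row and keep only the rows whose score lies between the group's mean score and the nearer extreme (0 when the mean is below 0.5, otherwise 1)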

In [71]:
import re
import numpy as np
from emoji import demojize
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nlp import preprocess

In [72]:
def get_score_range(mean):
    """Return the (low, high) score interval on the same side of 0.5 as ``mean``.

    A mean below 0.5 gives ``(0.0, mean)``; a mean of 0.5 or more gives
    ``(mean, 1.0)``.
    """
    return (0.0, mean) if mean < 0.5 else (mean, 1.0)

In [73]:
# Per-emotion filtered frames, concatenated later when the dataset is saved
result_data = []

# Score-range messages are collected and printed after the loop so they do
# not interleave with the progress bar.
messages = []
with tqdm(total=len(emotion_data_dict)) as t:
    for emotion, dataset in emotion_data_dict.items():
        t.set_description('Processing "' + emotion + '" data')

        # Turn the texts into fixed-length token-id sequences for the model.
        # preprocess() comes from the project's nlp module; presumably it
        # returns one cleaned string per input text -- confirm.
        cleaned_texts = preprocess(dataset.text, quiet=True)
        predict_sequences = [text.split() for text in cleaned_texts]
        list_tokenized_predict = tokenizer.texts_to_sequences(predict_sequences)
        x_predict = pad_sequences(list_tokenized_predict, maxlen=100)

        # The model ends in a single output unit, so predict() yields one
        # column; flatten it to one score per row so the comparison below is
        # a plain 1-D boolean mask aligned with the rows of `dataset`.
        scores = model.predict(x_predict).ravel()
        low, high = get_score_range(np.mean(scores))
        # NOTE: '{:4f}' sets a minimum field width of 4 (six decimals are
        # printed), not four decimal places.
        messages.append(emotion.capitalize() + ": Score Range: {:4f} - {:4f}".format(low, high))

        # Keep only the rows scored inside [low, high] and tag them
        selected = dataset[(scores >= low) & (scores <= high)]
        selected.insert(0, 'label', emotion)

        result_data.append(selected)
        t.update()

for message in messages:
    print(message)

Processing "joy" data: 100%|██████████| 4/4 [06:56<00:00, 107.64s/it]    


Anger: Score Range: 0.000000 - 0.475587
Fear: Score Range: 0.501809 - 1.000000
Sadness: Score Range: 0.000000 - 0.420382
Joy: Score Range: 0.676242 - 1.000000


# Save dataset

Save the resulting data

In [74]:
if len(result_data) > 0:
    # result_data starts out as a list of per-emotion frames. Once this cell
    # has run it is already a single DataFrame, which pd.concat rejects with a
    # TypeError, so only concatenate the first time to keep re-runs working.
    if not isinstance(result_data, pd.DataFrame):
        result_data = pd.concat(result_data)

    path = Path('../datasets/dataset_results.csv').resolve()
    # index=False: do not write the row index as a column
    result_data.to_csv(path, index=False)

    print('Files saved under "' + path.as_posix() + '"')

Files saved under "C:/Users/Araceli/Documents/Git/SentimentAnalysisTwitter/datasets/dataset_results.csv"


In [75]:
# Row count of the combined dataset (result_data is a DataFrame here only if
# the save cell ran with at least one per-emotion frame; otherwise it is still a list)
len(result_data)

80973