In [177]:
# Imports Train
from tweet_data_label import train_data

In [178]:
# Imports Dependencies
import spacy
import re
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [179]:
# Quick Validation on Tweet Data
def tweet_data_validate(data):
    """Print the number of records in `data` as a quick sanity check."""
    summary = f'Length: {len(data)}'
    print(summary)

In [180]:
# Sanity check: report how many records were imported into train_data.
tweet_data_validate(train_data)

Length: 101


In [181]:
# Load the spaCy English pipeline.
# The 'en' shortcut link was removed in spaCy 3; the full package name works
# on both spaCy 2.x and 3.x (install: python -m spacy download en_core_web_sm).
nlp = spacy.load('en_core_web_sm')

In [182]:
# Organize Tweet Data
def tweet_clean(text):
    """Return the lower-cased tokens of a tweet.

    The HTML entity ``&amp;`` is rewritten to the word ``and`` before the
    text is run through the module-level ``nlp`` pipeline; every token's
    text is then lower-cased.
    """
    decoded = text.replace('&amp;', 'and')
    return [tok.text.lower() for tok in nlp(decoded)]

def organize_tweet_data(train_data):
    """Split labelled records into token lists and integer class ids.

    Each record is indexed as ``record[0]`` (the tweet text) and
    ``record[1]`` (a mapping with an ``'affirmation'`` entry). Only the
    first affirmation of a record is used as its label.

    Returns:
        A two-item list: [cleaned token lists, integer labels], both in the
        order of `train_data`.
    """
    label_ids = {'POSITIVE': 0, 'NEGATIVE': 1, 'NULL': 2}
    cleaned_tweets, labels = [], []

    for record in train_data:
        cleaned_tweets.append(tweet_clean(record[0]))
        # First element of the first affirmation entry is the label name.
        first_label = record[1]['affirmation'][0][0]
        labels.append(label_ids[first_label])

    return [cleaned_tweets, labels]

In [183]:
# Tokenize & Pad Tweet Data
# Split the training records into token lists and integer affirmation labels.
tweet_sequences, target_affirm = organize_tweet_data(train_data)

# Keras tokenizer shared by the cells below; it is fitted inside tokenize_tweets().
tokenizer = Tokenizer()

def tokenize_tweets(tokenizer, data):
    """Fit `tokenizer` on `data`, then return `data` encoded by it.

    Note that the tokenizer passed in is fitted in place.
    """
    tokenizer.fit_on_texts(data)
    encoded = tokenizer.texts_to_sequences(data)
    return encoded
    
def create_padded_array(length):
    """Return a list containing `length` zeros.

    `length` is expected to be an integer; zero or a negative value gives
    an empty list.
    """
    return [0] * length

def pad_tokenized_tweets(data):
    """Right-pad every token sequence with zeros to the longest length.

    Args:
        data: iterable of integer token sequences (one per tweet).

    Returns:
        A new list of lists, one per input sequence and in the same order,
        each exactly as long as the longest input sequence. An empty input
        yields an empty list.
    """
    sequences = [list(sequence) for sequence in data]
    # default=0 keeps max() from raising on an empty corpus.
    max_len = max((len(sequence) for sequence in sequences), default=0)

    # Padding is per sequence: longest length minus this sequence's own
    # length (not the number of sequences in the corpus).
    return [sequence + [0] * (max_len - len(sequence)) for sequence in sequences]

tokenized_data = tokenize_tweets(tokenizer, tweet_sequences)

# Zero-pad at the end of each tweet so every row has the length of the longest one.
# `sequences_x` is kept as a name because a later cell converts it to a NumPy array.
sequences_x = pad_sequences(tokenized_data, padding='post')
X = sequences_x

In [184]:
# Check size of tokenized vocab
# index_word maps each integer id assigned by the fitted tokenizer back to its
# token, so its length is the number of distinct tokens seen during fitting.
vocab_size = len(tokenizer.index_word)
print(vocab_size)

640


In [185]:
# Convert tokenized data to Numpy Array
# NOTE(review): `sequences_x` must already hold equal-length (padded) sequences
# for this to yield a 2-D array — confirm it is defined by the padding cell.
X = np.array(sequences_x)
# One-hot encode the integer affirmation labels.
y = to_categorical(target_affirm)

In [186]:
# Validate Data
# The feature matrix is bound to `X` (upper case); a lower-case `x` is never defined.
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

X shape: (101, 34)
y shape: (101, 3)


In [187]:
# Setup Train/Test Split
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=42)

In [188]:
# Validate Data
# Features and labels should report the same number of training rows.
print('X_train')
print(f'X_train len: {len(X_train)}')
print('')
print('y_train')
print(f'y_train len: {len(y_train)}')


X_train
X_train len: 67

y_train
y_train len: 67


In [189]:
# Scale Train Data
def scale_data():
    """Min-max scale the module-level train and test feature matrices.

    The scaler is fitted on ``X_train`` only and then applied to both
    ``X_train`` and ``X_test``.

    Returns:
        A two-item list: [scaled train features, scaled test features].
    """
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    return [scaler.transform(split) for split in (X_train, X_test)]

# Bind the scaled matrices to their own names for training and evaluation.
scaled_X_train, scaled_X_test = scale_data()

In [190]:
# Setup Training Model
def create_model(input_dim=34, hidden_units=8, num_classes=3):
    """Build and compile a small feed-forward classifier.

    Args:
        input_dim: number of features per sample. Defaults to 34, the
            value that was previously hard-coded.
        hidden_units: width of each of the two hidden ReLU layers.
        num_classes: size of the softmax output layer; it has to match the
            width of the one-hot label matrix used for training.

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    model = Sequential()
    # Only the first layer declares the input size; later layers infer
    # theirs from the previous layer's output.
    model.add(Dense(hidden_units, input_dim=input_dim, activation='relu'))
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

# Keep a reference to the compiled network: the summary, training and
# prediction cells below all read the name `model`.
model = create_model()

<keras.engine.sequential.Sequential at 0x7f98699d4150>

In [191]:
# Review Model
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 8)                 280       
_________________________________________________________________
dense_5 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 27        
Total params: 379
Trainable params: 379
Non-trainable params: 0
_________________________________________________________________
None


In [192]:
# Train Data
# Fit on the scaled training split only; verbose=2 logs one line per epoch.
# NOTE(review): calling fit() again on an existing model continues from its
# current weights — re-create the model first for a from-scratch run.
model.fit(scaled_X_train,y_train,epochs=50,verbose=2)

Epoch 1/50
 - 0s - loss: 0.3614 - accuracy: 0.8657
Epoch 2/50
 - 0s - loss: 0.3603 - accuracy: 0.8657
Epoch 3/50
 - 0s - loss: 0.3590 - accuracy: 0.8657
Epoch 4/50
 - 0s - loss: 0.3577 - accuracy: 0.8657
Epoch 5/50
 - 0s - loss: 0.3566 - accuracy: 0.8657
Epoch 6/50
 - 0s - loss: 0.3557 - accuracy: 0.8657
Epoch 7/50
 - 0s - loss: 0.3542 - accuracy: 0.8657
Epoch 8/50
 - 0s - loss: 0.3529 - accuracy: 0.8657
Epoch 9/50
 - 0s - loss: 0.3518 - accuracy: 0.8657
Epoch 10/50
 - 0s - loss: 0.3502 - accuracy: 0.8657
Epoch 11/50
 - 0s - loss: 0.3491 - accuracy: 0.8657
Epoch 12/50
 - 0s - loss: 0.3481 - accuracy: 0.8657
Epoch 13/50
 - 0s - loss: 0.3467 - accuracy: 0.8657
Epoch 14/50
 - 0s - loss: 0.3454 - accuracy: 0.8657
Epoch 15/50
 - 0s - loss: 0.3441 - accuracy: 0.8657
Epoch 16/50
 - 0s - loss: 0.3431 - accuracy: 0.8657
Epoch 17/50
 - 0s - loss: 0.3421 - accuracy: 0.8657
Epoch 18/50
 - 0s - loss: 0.3412 - accuracy: 0.8657
Epoch 19/50
 - 0s - loss: 0.3405 - accuracy: 0.8657
Epoch 20/50
 - 0s - l

<keras.callbacks.callbacks.History at 0x7f98697de990>

In [194]:
# Predictions
# predict_classes() is gone from recent Keras/TensorFlow releases; the argmax of
# the softmax output yields the same class ids on old and new versions.
predictions = np.argmax(model.predict(scaled_X_test), axis=1)
# Decode the one-hot test labels once and reuse them for every metric.
y_true = y_test.argmax(axis=1)

print('Results')
print(f'X : {predictions} {len(predictions)}')
print(f'Y : {y_true}\n')

print('Confusion Matrix')
print(f'{confusion_matrix(y_true,predictions)}\n')

print('Classification Report')
print(classification_report(y_true,predictions))

print('Accuracy Score')
print(accuracy_score(y_true,predictions))

Results
X : [0 0 2 2 2 2 0 2 0 2 0 0 2 0 2 2 2 2 0 0 2 2 0 2 0 0 2 0 0 2 0 2 2 0] 34
Y : [0 2 2 2 2 2 0 2 0 2 0 0 2 0 2 2 0 2 0 0 2 1 0 2 0 0 2 2 0 2 0 0 2 0]

Confusion Matrix
[[14  0  2]
 [ 0  0  1]
 [ 2  0 15]]

Classification Report
              precision    recall  f1-score   support

           0       0.88      0.88      0.88        16
           1       0.00      0.00      0.00         1
           2       0.83      0.88      0.86        17

    accuracy                           0.85        34
   macro avg       0.57      0.59      0.58        34
weighted avg       0.83      0.85      0.84        34

Accuracy Score
0.8529411764705882
