In [84]:
# Imports Train
from tweet_data_label import train_data

In [85]:
# Imports Dependencies
import spacy
import re
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import class_weight

import keras
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [86]:
# Quick Validation on Tweet Data
def tweet_data_validate(data):
    """Print the number of entries in ``data`` as a quick sanity check.

    Args:
        data: Any sized collection (anything ``len()`` accepts).
    """
    message = f'Length: {len(data)}'
    print(message)

In [87]:
# Sanity-check the labelled data imported from tweet_data_label
tweet_data_validate(train_data)

Length: 129


In [67]:
# Load the spaCy English pipeline used for tokenization
# NOTE(review): 'en' is a shortcut model name; newer spaCy releases may require
# the full package name (e.g. 'en_core_web_sm') — confirm against the installed version.
nlp = spacy.load('en')

In [68]:
# Organize Data Methods
def tweet_clean(text):
    """Tokenize a tweet with spaCy and return its tokens lower-cased.

    The HTML entity ``&amp;`` is rewritten to the word ``and`` before the
    text is handed to the module-level ``nlp`` pipeline.

    Args:
        text: Raw tweet text.

    Returns:
        list[str]: Lower-cased text of every token yielded by ``nlp``, in order.
    """
    cleaned = text.replace('&amp;', 'and')
    return [token.text.lower() for token in nlp(cleaned)]

# Only target affirmation data
def organize_tweet_data(train_data):
    """Split labelled tweets into token sequences and affirmation class ids.

    Each entry of ``train_data`` is read as ``entry[0]`` (tweet text) and
    ``entry[1]['affirmation'][0][0]`` (label of the first affirmation); any
    further affirmations on the same tweet are ignored.

    Args:
        train_data: Iterable of labelled tweet entries.

    Returns:
        list: ``[tweet_sequences, target_affirm]`` where ``tweet_sequences``
        holds the ``tweet_clean`` output per tweet and ``target_affirm`` the
        matching class ids (POSITIVE=0, NEGATIVE=1, NULL=2).

    Raises:
        KeyError: If a label is not one of POSITIVE, NEGATIVE or NULL.
    """
    label_to_id = {'POSITIVE': 0, 'NEGATIVE': 1, 'NULL': 2}
    sequences, targets = [], []

    for entry in train_data:
        sequences.append(tweet_clean(entry[0]))
        first_label = entry[1]['affirmation'][0][0]  # only the first affirmation
        targets.append(label_to_id[first_label])

    return [sequences, targets]

#  Tokenization Methods
def tokenize_tweets(tokenizer, data):
    """Fit ``tokenizer`` on ``data`` and return ``data`` encoded as sequences.

    The tokenizer is fitted in place via ``fit_on_texts`` before the same
    texts are encoded with ``texts_to_sequences``.

    Args:
        tokenizer: Object exposing ``fit_on_texts`` and ``texts_to_sequences``.
        data: The texts to fit on and encode.

    Returns:
        Whatever ``tokenizer.texts_to_sequences(data)`` returns.
    """
    tokenizer.fit_on_texts(data)
    encoded = tokenizer.texts_to_sequences(data)
    return encoded
    
def create_zeros_array(length):
    """Return a list containing ``length`` zeros.

    Args:
        length: Number of zeros wanted. Zero or negative values yield an
            empty list.

    Returns:
        list[int]: ``[0, 0, ...]`` with ``length`` entries.
    """
    # List repetition replaces the manual counter loop; a non-positive
    # multiplier produces [] just as the loop did.
    return [0] * length

def find_max_tweet_len(data):
    """Return the length of the longest sequence in ``data``.

    This only measures; padding to the returned length is done by the caller.

    Args:
        data: Iterable of sized sequences (e.g. encoded tweets).

    Returns:
        int: Largest ``len(sequence)`` found, or 0 when ``data`` is empty.
    """
    # default=0 keeps the empty-input result of the original manual loop.
    return max((len(tweet) for tweet in data), default=0)

def pad_array(data, max_len):
    """Right-pad ``data`` with zeros until it holds ``max_len`` items.

    Sequences already at or beyond ``max_len`` come back as an unchanged
    copy; nothing is truncated.

    Args:
        data: Sequence to pad.
        max_len: Target length.

    Returns:
        list: Items of ``data`` followed by the zero padding.
    """
    missing = max_len - len(data)
    padding = create_zeros_array(missing)
    return list(data) + padding

In [69]:
# Tokenize & Pad Tweet Data
# Split the labelled data into per-tweet token lists and integer class ids
tweet_sequences, target_affirm = organize_tweet_data(train_data)

# Fit a Keras Tokenizer on the token lists and encode each tweet as an integer sequence
tokenizer = Tokenizer()
tokenized_data = tokenize_tweets(tokenizer, tweet_sequences)

# Length of the longest encoded tweet; every tweet is padded up to it
max_len = find_max_tweet_len(tokenized_data)

# Right-pad each encoded tweet with zeros so all rows have max_len entries
X = [pad_array(data, max_len) for data in tokenized_data]

In [70]:
# Check size of tokenized vocab (number of entries in the fitted tokenizer's index_word map)
vocab_size = len(tokenizer.index_word)

# The label count should match the number of tweets reported earlier
print(f'Vocab Size: {vocab_size}')
print(f'Target Affirm Length: {len(target_affirm)}')

Vocab Size: 785
Target Affirm Length: 129


In [71]:
# Convert tokenized data to Numpy Array
X = np.array(X)
# One-hot encode the labels. num_classes is pinned to the three affirmation
# classes (POSITIVE / NEGATIVE / NULL) so y always has 3 columns, matching the
# model's 3-unit output layer, even if the highest class id is absent from the data.
y = to_categorical(target_affirm, num_classes=3)

In [72]:
# Validate Data: X holds one padded row per tweet, y one one-hot row per tweet
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

X shape: (129, 34)
y shape: (129, 3)


In [73]:
# Setup Train/Test Split
# Hold out 33% of the tweets for testing; random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=42)

In [74]:
# Validate Data: features and labels of the training split must have the same number of rows
print('X_train')
print(f'X_train len: {len(X_train)}')
# print(X_train)
print('')
print('y_train')
print(f'y_train len: {len(y_train)}')
# print(y_train)


X_train
X_train len: 86

y_train
y_train len: 86


In [75]:
# Scale Train Data
def scale_data():
    """Min-max scale the notebook-level train and test feature splits.

    Reads the globals ``X_train`` and ``X_test``. The scaler is fitted on
    ``X_train`` only and then applied to both splits.

    Returns:
        list: ``[scaled_X_train, scaled_X_test]``.
    """
    scaler = MinMaxScaler()
    scaler.fit(X_train)

    return [scaler.transform(split) for split in (X_train, X_test)]

# Scale both splits with the scaler fitted on the training split
scaled_X_train, scaled_X_test = scale_data()

In [76]:
# Setup Training Model
def create_model(input_dim=34, num_classes=3):
    """Build and compile the feed-forward affirmation classifier.

    Architecture: two 8-unit ReLU hidden layers followed by a softmax output,
    compiled with categorical cross-entropy, the Adam optimizer and accuracy
    as the reported metric.

    Args:
        input_dim: Number of features per sample (the padded tweet length).
            Defaults to 34, the value previously hard-coded here.
        num_classes: Number of output classes. Defaults to 3.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # Only the first layer needs the input size; later layers infer theirs
    # from the previous layer, so the second layer no longer repeats input_dim.
    model.add(Dense(8, input_dim=input_dim, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [77]:
model = create_model()

# Review Model
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 8)                 280       
_________________________________________________________________
dense_8 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_9 (Dense)              (None, 3)                 27        
Total params: 379
Trainable params: 379
Non-trainable params: 0
_________________________________________________________________
None


In [92]:
# Setup Class Weights (disabled experiment, kept for reference)
# NOTE: this attempt raised "TypeError: unhashable type: 'numpy.ndarray'", most
# likely because `y` is the 2-D one-hot matrix from to_categorical, whereas
# compute_class_weight expects 1-D class labels (e.g. target_affirm or y.argmax(axis=1)).
# NOTE: naming the dict below `class_weight` would shadow the imported
# sklearn.utils.class_weight module; use a different name if this is re-enabled.
# print(y)
# class_weights = class_weight.compute_class_weight('balanced', np.unique(y), y)
# print(class_weights)
# class_weight = {
#     0: 1.,
#     1: 50.,
#     2: 2.
# }

TypeError: unhashable type: 'numpy.ndarray'

In [None]:
# Train
# Balanced per-class weights derived from the training labels. compute_class_weight
# needs 1-D class ids, so the one-hot rows are collapsed with argmax first.
train_labels = y_train.argmax(axis=1)
train_classes = np.unique(train_labels)
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=train_classes, y=train_labels
)

# Start every class at 1.0 so the dict covers all output classes even when one
# of them does not occur in the training split, then overlay the balanced values.
class_weights = {label: 1.0 for label in range(y_train.shape[1])}
class_weights.update(
    {int(label): float(weight) for label, weight in zip(train_classes, balanced_weights)}
)

# class_weight has to be passed by keyword: a bare positional argument after
# keyword arguments is a SyntaxError, and the bare name referred to the
# sklearn module rather than a weight mapping.
model.fit(scaled_X_train, y_train, epochs=10, verbose=2, shuffle=True, class_weight=class_weights)

In [79]:
# Predictions
# Sequential.predict_classes() is gone from newer Keras releases; taking the
# argmax of the softmax output is equivalent and works on old and new versions.
# (The original also called predict_classes twice and discarded the first result.)
predictions = np.argmax(model.predict(scaled_X_test), axis=-1)

# Collapse the one-hot test labels to class ids once and reuse them below
y_true = y_test.argmax(axis=1)

print('Results')
print(f'X : {predictions} {len(predictions)}')
print(f'Y : {y_true}\n')

print('Confusion Matrix')
print(f'{confusion_matrix(y_true,predictions)}\n')

print('Classification Report')
print(classification_report(y_true,predictions))

print('Accuracy Score')
print(accuracy_score(y_true,predictions))

Results
X : [2 2 2 2 1 2 0 2 0 2 0 2 2 2 2 2 2 0 2 0 0 2 2 2 2 2 0 2 2 2 0 2 2 0 2 0 2
 0 2 2 2 0 2] 43
Y : [2 2 0 0 2 0 0 0 0 0 2 0 2 2 2 1 2 0 0 0 0 0 2 0 2 2 0 2 2 2 0 2 0 0 2 0 1
 0 2 0 2 0 2]

Confusion Matrix
[[11  0 11]
 [ 0  0  2]
 [ 1  1 17]]

Classification Report
              precision    recall  f1-score   support

           0       0.92      0.50      0.65        22
           1       0.00      0.00      0.00         2
           2       0.57      0.89      0.69        19

    accuracy                           0.65        43
   macro avg       0.49      0.46      0.45        43
weighted avg       0.72      0.65      0.64        43

Accuracy Score
0.6511627906976745
