# Imports

In [38]:
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Flatten
from keras.models import Model
import pandas as pd
import numpy as np

# Load Dataset to Memory

In [39]:
# Load the labelled tweets: the "tweets" column holds the text and the
# "labels" column holds one of the keys of `labels_index`.
raw_data = pd.read_csv("file.csv")

# Class name -> integer class id used for the one-hot targets.
labels_index = {
	"bad": 0,
	"neutral": 1,
	"good": 2
}

texts = raw_data["tweets"].values

# Map label names to ids with pandas instead of np.vectorize(lambda ...):
# np.vectorize is a Python-level loop and fails on an empty input.
label_ids = raw_data["labels"].map(labels_index)

# Series.map leaves NaN for values missing from the dict; fail loudly (with the
# same exception type a direct dict lookup would raise) rather than letting NaN
# leak into the targets.
unknown_labels = raw_data.loc[label_ids.isna(), "labels"].unique()
if len(unknown_labels):
	raise KeyError(f"Unexpected label(s) in dataset: {list(unknown_labels)}")

labels = label_ids.to_numpy(dtype=int)


# Tokenizing

In [87]:
# Tokenization settings: vocabulary cap passed to the Tokenizer and the fixed
# length every sequence is padded to.
MAX_NUM_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000

# Build the vocabulary from the corpus.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index  # word -> integer index dictionary
# Only the top MAX_NUM_WORDS words are used when generating the sequences,
# even though word_index lists every token seen.
print('Found %s unique tokens.' % len(word_index))

# Turn each text into a list of word indices, then pad to a fixed length.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of samples:', data.shape)
print('Sampele:(the zeros at the begining are for padding text to max length)')
print(data[2])

Found 284178 unique tokens.
Shape of samples: (219294, 1000)
Sampele:(the zeros at the begining are for padding text to max length)
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0 

# Converting labels to categorical arrays

In [41]:
# One-hot encode the integer class ids.
# num_classes is pinned to the size of labels_index: without it to_categorical
# infers the width from the largest id present, so a dataset that happens to
# lack the highest class would yield fewer columns than the softmax layer has
# units.
labels_matrix = to_categorical(np.asarray(labels), num_classes=len(labels_index))
labels_matrix

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.]], dtype=float32)

# Train-Test split

In [42]:
VALIDATION_SPLIT = 0.2  # fraction of samples held out for validation
SPLIT_SEED = 42  # fixed so Restart & Run All reproduces the same split

# Shuffle samples and labels with the same permutation, drawn from a local
# seeded generator so the split is reproducible and does not depend on (or
# disturb) NumPy's global random state.
split_rng = np.random.default_rng(SPLIT_SEED)
indices = split_rng.permutation(data.shape[0])
data_shuffled = data[indices]
labels_shuffled = labels_matrix[indices]

nb_validation_samples = int(VALIDATION_SPLIT * data_shuffled.shape[0])
# Slice at an explicit position instead of using negative indices:
# `[:-n]` / `[-n:]` break when n == 0 (empty training set, everything in
# validation), which happens for very small datasets.
split_at = data_shuffled.shape[0] - nb_validation_samples
x_train = data_shuffled[:split_at]
y_train = labels_shuffled[:split_at]
x_val = data_shuffled[split_at:]
y_val = labels_shuffled[split_at:]
print('Shape of training data: ',x_train.shape)
print('Shape of testing data: ',x_val.shape)

Shape of training data:  (175436, 1000)
Shape of testing data:  (43858, 1000)


# Reading GloVe Data

In [43]:
# The GloVe vector files are UTF-8 text. Decoding them as latin-1 never raises,
# but it silently garbles every non-ASCII token, so those words can never match
# the tokenizer's vocabulary. The old Python 2 branch was dead code: this cell
# already relies on f-strings.
args = {'encoding': 'utf-8'}
EMBEDDING_DIM = 100  # must match the dimensionality of the GloVe file loaded
print('Indexing word vectors.')

# word -> float32 vector of length EMBEDDING_DIM
embeddings_index = {}
with open(f'glove.6B.{EMBEDDING_DIM}d.txt', **args) as f:
	for line in f:
		# Each line is "<word> <v1> <v2> ... <vN>", separated by single spaces.
		values = line.rstrip('\n').split(' ')
		word = values[0]
		coefs = np.asarray(values[1:], dtype='float32')
		embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [44]:
# Build the lookup table fed to the Embedding layer: row i holds the
# pre-trained vector of the word whose tokenizer index is i.
# The extra row (+1) keeps an all-zeros vector available; rows of words with
# no pre-trained vector also stay all-zeros.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for token, row in word_index.items():
	if token in embeddings_index:
		embedding_matrix[row] = embeddings_index[token]
print ('Shape of Embedding Matrix: ',embedding_matrix.shape)

Shape of Embedding Matrix:  (284179, 100)


In [45]:
# Frozen embedding layer initialised from the pre-trained matrix; the vectors
# are not updated during training (trainable=False).
embedding_layer = Embedding(
	input_dim=len(word_index) + 1,  # vocabulary size (rows of the matrix)
	output_dim=EMBEDDING_DIM,  # embedding vector size
	weights=[embedding_matrix],  # initial weights
	input_length=MAX_SEQUENCE_LENGTH,  # padded sequence length
	trainable=False,
)

In [46]:
# 1D-CNN text classifier on top of the frozen embeddings.
# Input: a batch of padded integer sequences of length MAX_SEQUENCE_LENGTH.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Three conv blocks (128 filters, kernel width 5); the first two are each
# followed by a 5x temporal down-sampling.
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)

# Global max pooling over the remaining time steps.
# The previous MaxPooling1D(35) + Flatten() was only "global" because 35 is the
# exact number of steps left when MAX_SEQUENCE_LENGTH == 1000; any other
# sequence length broke the model or silently changed its shape.
# GlobalMaxPooling1D gives the same (batch, 128) output for any length.
x = GlobalMaxPooling1D()(x)

x = Dense(128, activation='relu')(x)
# One softmax unit per class in labels_index.
preds = Dense(len(labels_index), activation='softmax')(x)

In [47]:
# Assemble the functional model from the input tensor and the softmax output,
# then train it on the training split while monitoring the held-out split.
model = Model(inputs=sequence_input, outputs=preds)
model.compile(
	optimizer='rmsprop',
	loss='categorical_crossentropy',  # targets are one-hot rows
	metrics=['acc'],
)
# Left as the last expression so the returned History object is displayed.
model.fit(
	x_train,
	y_train,
	batch_size=128,
	epochs=5,
	validation_data=(x_val, y_val),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1fd36696830>

In [48]:
# Score the trained model on the held-out split (the same x_val / y_val that
# was passed as validation_data during fit). The displayed list presumably
# holds the loss followed by the 'acc' metric, in compile order.
print('Acuracy on testing set:')
model.evaluate(x=x_val, y=y_val)

Acuracy on testing set:


[0.5749507546424866, 0.7758219838142395]

# Predicting Instance

In [103]:
# Classify a single raw tweet with the trained model.
test_instance = "So cute, ChatGPT by @OpenAI knows how to play MahJong, even the diff between Sichuan and Japanese rules ðŸ¤£\nExplained better than most MahJong tutorials ðŸ¤£ #ChatGPT https://t.co/tAzC6JQtYB"

# Apply the same preprocessing as for the training data: tokenizer indices,
# then padding to MAX_SEQUENCE_LENGTH.
test_row = [test_instance]
sequences2 = tokenizer.texts_to_sequences(test_row)
data2 = pad_sequences(sequences2, maxlen=MAX_SEQUENCE_LENGTH)

# Predict on a batch holding just the first (only) padded row.
label_vec = model.predict(data2[:1])
label_id = np.argmax(label_vec)

# Translate the winning class id back to its name; fall back to '' when the id
# is not present in labels_index.
id_to_name = {class_id: class_name for class_name, class_id in labels_index.items()}
label_name = id_to_name.get(label_id, '')
label_name


'good'