In [153]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.utils import shuffle
import re

In [154]:
data = pd.read_csv('/content/drive/MyDrive/Sentiment.csv')
data = data[['text','sentiment']]

data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
data['text'] = data['text'].str.replace('rt','')

In [155]:
print(data[['text','sentiment']])

                                                    text sentiment
0       nancyleegrahn how did everyone feel about the...   Neutral
1       scottwalker didnt catch the full gopdebate la...  Positive
2       tjmshow no mention of tamir rice and the gopd...   Neutral
3       robgeorge that carly fiorina is trending  hou...  Positive
4       danscavino gopdebate w realdonaldtrump delive...  Positive
...                                                  ...       ...
13866   cappy_yarbrough love to see men who will neve...  Negative
13867   georgehenryw who thought huckabee exceeded th...  Positive
13868   lrihendry tedcruz as president i will always ...  Positive
13869   jrehling gopdebate donald trump says that he ...  Negative
13870   lrihendry tedcruz headed into the presidentia...  Positive

[13871 rows x 2 columns]


In [156]:
data_majority = data[data['sentiment'] == 'Negative']
data_minority = data[data['sentiment'] == 'Positive']
data_neutral = data[data['sentiment'] == 'Neutral' ]

bias = data_minority.shape[0]/data_majority.shape[0]

train = pd.concat([
         data_majority.sample(frac=0.8,random_state=200),
         data_minority.sample(frac=0.8,random_state=200),
         data_neutral.sample(frac=0.8, random_state=200)
         ])

test = pd.concat([
         data_majority.drop(data_majority.sample(frac=0.8,random_state=200).index),
         data_minority.drop(data_minority.sample(frac=0.8,random_state=200).index),
         data_neutral.drop(data_neutral.sample(frac=0.8, random_state=200).index)
         ])

train = shuffle(train)
test = shuffle(test)

In [157]:
print('positive data in training:',(train.sentiment == 'Positive').sum())
print('negative data in training:',(train.sentiment == 'Negative').sum())
print('neutral data in training:',(train.sentiment == 'Neutral').sum())

print('positive data in test:',(test.sentiment == 'Positive').sum())
print('negative data in test:',(test.sentiment == 'Negative').sum())
print('neutral data in test:',(test.sentiment == 'Neutral').sum())

positive data in training: 1789
negative data in training: 6794
neutral data in training: 2514
positive data in test: 447
negative data in test: 1699
neutral data in test: 628


In [158]:
data_majority = train[train['sentiment'] == 'Negative']
data_minority = train[train['sentiment'] == 'Positive']
data_neutral = train[train['sentiment'] == 'Neutral']


print("majority class before upsample:",data_majority.shape)
print("minority class before upsample:",data_minority.shape)
print("neutral class before upsample:",data_neutral.shape)


data_minority_upsampled = resample(data_minority,
                                 replace=True,
                                 n_samples= data_majority.shape[0],
                                 random_state=123)

data_neutral_upsampled = resample(data_neutral,
                                 replace=True,
                                 n_samples= data_majority.shape[0],
                                 random_state=123)


data_upsampled = pd.concat([data_majority, data_minority_upsampled, data_neutral_upsampled])

print("After upsampling\n",data_upsampled.sentiment.value_counts(),sep = "")

majority class before upsample: (6794, 2)
minority class before upsample: (1789, 2)
neutral class before upsample: (2514, 2)
After upsampling
Negative    6794
Positive    6794
Neutral     6794
Name: sentiment, dtype: int64


In [159]:
data_upsampled.head()

Unnamed: 0,text,sentiment
11718,mhvadney kasichs answer on gay marriage was r...,Negative
13839,swincash so gopdebates got folks like thishmm...,Negative
635,whoisbenchang the gopdebate as theater madefo...,Negative
9278,supermanhotmale i lived here in florida for 8...,Negative
3136,realdonaldtrump did you realize that even foxn...,Negative


In [160]:
one_hot = pd.get_dummies(data_upsampled['sentiment'])
data_upsampled.drop(["sentiment"], axis=1, inplace=True)
data_upsampled = pd.concat([data_upsampled, one_hot], axis=1)

In [161]:
X = data_upsampled["text"].values
y = data_upsampled.drop("text", axis=1).values

In [162]:
max_features = 20000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(X)

X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=63)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [163]:
print(y_train.shape)

(14267, 3)


In [164]:
from keras import layers

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [None]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
model_path = '/content/drive/MyDrive/sentu2.h5'

with tf.keras.utils.custom_object_scope({'TokenAndPositionEmbedding': TokenAndPositionEmbedding, 'TransformerBlock': TransformerBlock}):
    model = tf.keras.models.load_model(model_path)

In [None]:
inputs = model.input
outputs = model.output

num_classes = 3

outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(outputs)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
print(X_train.shape)
print(y_train.shape)

(14267, 63)
(14267, 3)


In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.batch(128)

print(train_dataset)

<_BatchDataset element_spec=(TensorSpec(shape=(None, 63), dtype=tf.int32, name=None), TensorSpec(shape=(None, 3), dtype=tf.uint8, name=None))>


In [None]:
num_epochs = 15

for epoch in range(num_epochs):
    total_loss = 0
    total_samples = 0

    for batch, (input_batch, label_batch) in enumerate(train_dataset):
        loss, accuracy = model.train_on_batch(input_batch, label_batch)

        total_loss += loss * input_batch.shape[0]
        total_samples += input_batch.shape[0]

    epoch_loss = total_loss / total_samples
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.4f}')

Epoch 1/15, Loss: 0.6671, Accuracy: 0.6441
Epoch 2/15, Loss: 0.6581, Accuracy: 0.6780
Epoch 3/15, Loss: 0.6449, Accuracy: 0.6780
Epoch 4/15, Loss: 0.6367, Accuracy: 0.6949
Epoch 5/15, Loss: 0.6354, Accuracy: 0.6780
Epoch 6/15, Loss: 0.6365, Accuracy: 0.6949
Epoch 7/15, Loss: 0.6361, Accuracy: 0.6441
Epoch 8/15, Loss: 0.6412, Accuracy: 0.7119
Epoch 9/15, Loss: 0.6191, Accuracy: 0.7966
Epoch 10/15, Loss: 0.5521, Accuracy: 0.8305
Epoch 11/15, Loss: 0.4742, Accuracy: 0.8644
Epoch 12/15, Loss: 0.4298, Accuracy: 0.8814
Epoch 13/15, Loss: 0.3887, Accuracy: 0.8983
Epoch 14/15, Loss: 0.3629, Accuracy: 0.8983
Epoch 15/15, Loss: 0.3415, Accuracy: 0.8983


In [None]:
model.save("/content/drive/MyDrive/sentfinal.h5")

In [169]:
text = ["The quality is bad. I hate the product"]
text = tokenizer.texts_to_sequences(text)
text = pad_sequences(text, maxlen=63, dtype='int32', value=0)
sentiment = model.predict(text,batch_size=1,verbose = 2)[0]

print(sentiment)

if (np.argmax(sentiment) == 0):
  print("negative")
elif (np.argmax(sentiment) == 1):
  print("neutral")
else:
  print("positive")

1/1 - 0s - 18ms/epoch - 18ms/step
[0.8251656  0.16309305 0.01174132]
negative
