In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten, GRU, Bidirectional, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Layer
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Importing Dataset
df = pd.read_csv("/content/drive/MyDrive/Dataset/test.csv")
# df = pd.read_csv("/content/comments.csv")

In [10]:
# Data Info
print(df.shape)
print(df.columns)

(3534, 2)
Index(['text', 'sentiment'], dtype='object')


In [11]:
sentiment_mapping = {
    'positive': 1,
    'negative': 0,
    'neutral': 2
}

# Encode sentiments
df['sentiment'] = df['sentiment'].map(sentiment_mapping)

In [12]:
x = df["text"]
y = df["sentiment"]

In [13]:
x.head()

Unnamed: 0,text
0,Last session of the day http://twitpic.com/67ezh
1,Shanghai is also really exciting (precisely -...
2,"Recession hit Veronique Branquinho, she has to..."
3,happy bday!
4,http://twitpic.com/4w75p - I like it!!


In [21]:
import re
def remove_links(text):
    # Regular expression to find URLs
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.sub(url_pattern, '', text)

x = x.apply(remove_links)

In [22]:
x.head()

Unnamed: 0,text
0,Last session of the day
1,Shanghai is also really exciting precisely sk...
2,Recession hit Veronique Branquinho she has to ...
3,happy bday
4,I like it


In [23]:
def remove_non_alpha(text):
    # Use regular expression to replace non-alphabetical characters with a space
    return re.sub(r'[^a-zA-Z\s]', '', text)

def strip_whitespaces(text):
    return text.strip()  # Remove leading and trailing whitespaces

x = x.apply(remove_non_alpha)
x = x.apply(strip_whitespaces)

In [24]:
x.head()

Unnamed: 0,text
0,Last session of the day
1,Shanghai is also really exciting precisely sk...
2,Recession hit Veronique Branquinho she has to ...
3,happy bday
4,I like it


In [25]:
# # Adding additional data
# df1 = pd.read_csv("/content/extra_data.csv")
# df1['sentiment'] = df1['sentiment'].map(sentiment_mapping)
# x_new = df1['text']
# y_new = df1['sentiment']

In [26]:
# x = pd.concat([x,x_new])
# y = pd.concat([y,y_new])

In [27]:
x.shape

(3534,)

In [28]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(x)
sequences = tokenizer.texts_to_sequences(x)

# Padding sequences
max_length = max(len(X) for X in sequences)
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_length)

In [29]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50

In [30]:
# defining Attention Layer
class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1]),
                                 initializer='glorot_uniform',
                                 trainable=True)
        self.b = self.add_weight(shape=(input_shape[-1],),
                                 initializer='zeros',
                                 trainable=True)
        self.u = self.add_weight(shape=(input_shape[-1], 1),
                                 initializer='glorot_uniform',
                                 trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs):
        # Score computation
        u_it = K.tanh(K.dot(inputs, self.W) + self.b)
        ait = K.softmax(K.dot(u_it, self.u), axis=1)
        # Weighted sum of input vectors
        output = inputs * ait
        return K.sum(output, axis=1)

Training new embeddings

In [31]:
max_words = max_length  # Max words in each sentence
vocab_size = 10000      # Vocabulary size
embedding_dim = 50      # Embedding dimension

# Input layer for sentences
input_sentence = Input(shape=(max_words,))
embedded_sentence = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_sentence)

# Bi-directional GRU for word encoding
sentence_encoded = Bidirectional(GRU(50, return_sequences=True))(embedded_sentence)

# Word-level attention
sentence_attended = AttentionLayer()(sentence_encoded)

# Output layer for sentiment classification
output = Dense(3, activation='softmax')(sentence_attended)  # Three classes: positive, negative, neutral

# Define and compile the model
model = Model(input_sentence, output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)

#Fit the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=6, batch_size=4)

Epoch 1/6
[1m707/707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 33ms/step - accuracy: 0.4452 - loss: 1.0390 - val_accuracy: 0.6238 - val_loss: 0.8243
Epoch 2/6
[1m707/707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 32ms/step - accuracy: 0.7617 - loss: 0.5826 - val_accuracy: 0.6436 - val_loss: 0.8553
Epoch 3/6
[1m707/707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 32ms/step - accuracy: 0.8967 - loss: 0.3041 - val_accuracy: 0.6393 - val_loss: 1.1151
Epoch 4/6
[1m707/707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 33ms/step - accuracy: 0.9516 - loss: 0.1373 - val_accuracy: 0.6082 - val_loss: 1.2938
Epoch 5/6
[1m707/707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 33ms/step - accuracy: 0.9835 - loss: 0.0626 - val_accuracy: 0.6124 - val_loss: 1.5164
Epoch 6/6
[1m707/707[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 31ms/step - accuracy: 0.9946 - loss: 0.0267 - val_accuracy: 0.5941 - val_loss: 1.9913


<keras.src.callbacks.history.History at 0x7e76dafc1510>

In [33]:
# For Making Predictions on New Data
def preprocess_sentence(sentence):
    sentence = remove_links(sentence)
    sentence = remove_non_alpha(sentence)
    sentence = strip_whitespaces(sentence)
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_words, padding='post', truncating='post')
    return padded_sequence

def predict_sentiment(sentence, label_mapping):
    # Preprocess the sentence
    processed_sentence = preprocess_sentence(sentence)
    # Get the model's prediction
    prediction = model.predict(processed_sentence)
    # Get the index of the max prediction score
    predicted_class = np.argmax(prediction, axis=-1)[0]
    # Map the index back to the label
    sentiment = {v: k for k, v in label_mapping.items()}[predicted_class]
    return sentiment

# Example Usage
sentence = "this video is bad"
label_mapping = {"positive": 1, "negative": 0, "neutral": 2}  # Same as training
sentiment = predict_sentiment(sentence, label_mapping)

print(f"The sentiment of the sentence '{sentence}' is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 458ms/step
The sentiment of the sentence 'this video is bad' is: negative


Other Model

In [34]:
# Neural Network Model

# model = Sequential([
#     Embedding(input_dim=vocab_size, output_dim=embedding_dim),
#     LSTM(64, return_sequences=False),
#     Dense(32, activation='relu'),
#     Dense(3, activation='softmax')  # Output layer for three classes: positive, negative, neutral
# ])

# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [35]:
# embeddings = model.layers[0](X)  # => Obtaining Embeddings
# features = tf.keras.backend.eval(embeddings)
# X_flat = np.mean(features, axis=1)  => Global Average Pooling
# # X_flat = features.reshape(X.shape[0], -1)

In [36]:
# Applying Random Forest for Robustness and Prevent Overfitting

# X_train, X_test, y_train, y_test = train_test_split(X_flat, y, test_size=0.2, random_state=42)
# rf_model = RandomForestClassifier()
# rf_model.fit(X_train, y_train)

In [37]:
# def predict_sentiment(new_sentence):
#     new_sentence = remove_non_alpha(new_sentence)
#     new_sentence = strip_whitespaces(new_sentence)

#     sequence = tokenizer.texts_to_sequences([new_sentence])
#     # Pad the sequence
#     padded_sequence = pad_sequences(sequence, padding='post', maxlen=max_length)

#     # Getting embeddings from the RNN model
#     embeddings = model.layers[0](padded_sequence)
#     embeddings_np = tf.keras.backend.eval(embeddings)  # Convert to numpy array
#     embeddings_np1 = np.mean(embeddings_np, axis=1)

#     # Using the Random Forest model for prediction
#     prediction = rf_model.predict(embeddings_np1)

#     return prediction

In [38]:
# new_sentence = "It is not bad"
# predicted_label = predict_sentiment(new_sentence)

Storing Models Params and Tokenizer

In [39]:
import joblib
joblib.dump(model, 'model_1.joblib')
# model.save('model_1.h5')

['model_1.joblib']

In [40]:
import pickle
with open('tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)