In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import pandas as pd

In [8]:
from sklearn.model_selection import train_test_split

# Load and shuffle data

In [2]:
file_path = "data.csv"
# NOTE(review): read_csv treats the first line of the file as column names by
# default. Confirm data.csv really starts with a header row; if it does not,
# one record is being lost here (header=None would keep it, but the downstream
# column access must then be checked as well).
data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Shuffle every row once. The seed is fixed so that a fresh kernel reproduces
# the same row order, and therefore the same train/validation/test membership.
df_shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [3]:
# Report how many rows the shuffled frame holds.
n_sentences = len(df_shuffled)
print("Total number of sentences: ", n_sentences)

Total number of sentences:  1599999


# Set parameters

In [4]:
# Hyper-parameters shared by the tokenizer, the padding step and the model.
VOCAB_SIZE = 10_000   # num_words passed to the tokenizer; first argument of the Embedding layer
MAX_LEN = 250         # maxlen used when padding sequences; input_length of the Embedding layer
EMBEDDING_DIM = 16    # second argument of the Embedding layer

# Preprocess Data

In [5]:
# The text is taken from the last column and the raw sentiment code from the
# first one. Both are addressed by position with .iloc: the previous row[-1] /
# row[0] lookups depended on pandas falling back from label to positional
# indexing, which is deprecated, and the per-row iterrows() loop is replaced
# by column-wise operations.
raw_labels = df_shuffled.iloc[:, 0]
is_code_zero = (raw_labels == 0).to_numpy()
is_code_two = (raw_labels == 2).to_numpy()

# Building the array from a plain list keeps the same element conversion the
# original list-append loop produced.
texts = np.array(df_shuffled.iloc[:, -1].tolist())

# Class ids fed to the model: raw 0 -> 0, raw 2 -> 1, anything else -> 2.
labels = np.where(is_code_zero, 0, np.where(is_code_two, 1, 2))

# Tokenize

In [6]:
# Fit the vocabulary on the whole corpus; num_words caps which word indices
# are emitted when texts are later converted to sequences.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts)

# Encode each text as a list of word indices, then pad at the end so that
# every row has exactly MAX_LEN entries.
# NOTE(review): the fill value VOCAB_SIZE - 1 looks like it can also be a real
# word index produced by the tokenizer - confirm. It must stay identical to the
# value used when encoding text for prediction.
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding='post',
    value=VOCAB_SIZE - 1,
)

In [7]:
# Persist the fitted tokenizer so text can later be encoded with the same
# word-to-index mapping that was used for training.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# Train Test Split

In [9]:
# Hold out 5% of the padded sequences (and their labels) as a test set;
# random_state pins which rows end up on each side of the split.
split_parts = train_test_split(padded_sequences, labels, test_size=0.05, random_state=42)
train_data, test_data, train_labels, test_labels = split_parts

# Train

In [10]:
# Small classifier: embed each word index, average the embeddings over the
# sequence, pass through one hidden layer, then output a 3-way softmax.
model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
model.add(Dense(3, activation='softmax'))

In [11]:
# The labels are integer class ids, hence the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [12]:
# Train for 10 epochs in batches of 32, with 20% of the training rows used
# for validation.
model.fit(
    train_data,
    train_labels,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f31531f790>

# Evaluate

In [13]:
# Score the model on the held-out test split; evaluate() yields the loss
# followed by the accuracy metric configured in compile().
test_metrics = model.evaluate(test_data, test_labels)
loss, accuracy = test_metrics
print(f"Test accuracy: {accuracy * 100:.2f}%")

Test accuracy: 80.18%


# Save

In [14]:
# Write the trained model to sentiment_analysis_model.h5 in the working directory.
model.save('sentiment_analysis_model.h5')

# Predict

In [27]:
# Restore the tokenizer that was pickled after fitting.
# pickle.load can execute arbitrary code, so only load a file this notebook
# (or another trusted source) produced.
with open('tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [33]:
def encode_tex_with_loaded_tokenizer(text, tokenizer):
    """Encode a single text into a padded batch of word indices.

    Args:
        text: The raw string to encode.
        tokenizer: Object exposing ``texts_to_sequences``; called with a
            one-element list containing ``text``.

    Returns:
        The result of ``pad_sequences`` for that single sequence, padded at
        the end to ``MAX_LEN`` entries with the fill value ``VOCAB_SIZE-1``.
    """
    token_ids = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(token_ids, maxlen=MAX_LEN, padding='post', value=VOCAB_SIZE-1)
    return padded

# Read one line from the user, encode it the same way as the training data
# and take the index of the highest score as the predicted class.
user_input = input("")
encoded_input = encode_tex_with_loaded_tokenizer(user_input, tokenizer)
class_scores = model.predict(encoded_input)
prediction = np.argmax(class_scores)

# Index 0 and 1 have dedicated messages; every other index is reported as positive.
sentiment_messages = {
    0: "Sentiment: Negative",
    1: "Sentiment: Neutral",
}
print(sentiment_messages.get(int(prediction), "Sentiment: Positive"))

Sentiment: Positive
