*Sentiment Analysis of online comments*

Step 1: Preprocess the dataset

In [None]:
#install dependencies
%conda install pandas numpy tensorflow sklearn keras nltk

In [None]:
#import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import keras.api._v2.keras as keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
from nltk.tokenize import word_tokenize

#fetch the nltk resources needed by the tokenisation and stopword steps below
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
#load the raw csv into pandas; the column labels are supplied explicitly via names=
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=["sentiment", "ids", "date", "flag", "user", "text"],
    encoding='latin-1',
)

In [None]:
#drop the metadata columns in one call; only 'sentiment' and 'text' are kept
df.drop(columns=['ids', 'date', 'flag', 'user'], inplace=True)

In [None]:
#cleaning text
def caydranisabum():
    """Clean the 'text' column of the global ``df`` in place.

    Drops rows whose text is missing, drops rows whose text duplicates an
    earlier row, then lowercases the text and removes every character that
    is neither a word character nor whitespace (punctuation and symbols).
    Returns nothing; ``df`` itself is modified.
    """
    df.dropna(subset=['text'], inplace=True) #drop missing values
    df.drop_duplicates(subset=['text'], inplace=True) #drop duplicates
    #lowercase, then strip punctuation; raw string so \w and \s reach the
    #regex engine instead of being parsed as (invalid) string escapes
    df['text'] = df['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

#run the cleaning step; it modifies the global df in place
caydranisabum()


In [None]:
#split every cleaned tweet into a list of word tokens using nltk
df['tokens'] = df['text'].map(word_tokenize)

In [None]:
#filter the English stopwords out of every token list
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
#reduce every remaining token to its Porter stem
stemmer = PorterStemmer()
df['tokens'] = df['tokens'].map(lambda tokens: list(map(stemmer.stem, tokens)))

In [None]:
#join each token list back into a single space-separated string
df['cleaned_text'] = df['tokens'].map(' '.join)

In [None]:
#discard the intermediate columns now that 'cleaned_text' has been built
df.drop(columns=['text', 'tokens'], inplace=True)

#the remaining columns should now be just 'sentiment' and 'cleaned_text'
df

In [None]:
#pull the two columns out of the dataframe as arrays (tensorflow likes arrays)
texts, labels = df['cleaned_text'].values, df['sentiment'].values

print(f'cleaned text {texts}')
print(f'labels {labels}')

Step 2: Set up our dataset & model for training

In [None]:
#one-hot encode the labels into 2 classes; integer-dividing by 4 maps label 0 -> class 0 and label 4 -> class 1
from keras.utils import to_categorical
labels = to_categorical(labels // 4, num_classes=2)

#quick look at the encoded labels
print(labels)

In [None]:
#caps shared by the tokenizer, the padding step and the embedding layer
max_words = 20000  #num ceil (passed to the Tokenizer as num_words)
max_len = 100  #len ceil (length every sequence is padded to)

#fit the vocabulary on the cleaned texts, then encode each text as a list of word ids
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

In [None]:
#make len same with pad sequences: every row of `data` gets length max_len
data = pad_sequences(sequences, maxlen=max_len)

In [None]:
#split data into train and test sets (the test set is used for loss and accuracy later)
#test_size=0.2 holds out 20% of the samples; random_state=42 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

In [None]:
#build the keras model one layer at a time: embedding -> conv -> pooling -> lstm -> softmax
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(tf.keras.layers.Conv1D(128, 5, activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(pool_size=5))
model.add(tf.keras.layers.LSTM(128))
#2 output units, one per class, to match the one-hot labels
model.add(tf.keras.layers.Dense(2, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


Step 3: Training

In [None]:
#train for 10 epochs in batches of 32; validation_split=0.2 holds back 20% of the training data for validation
epochs = 10
batch_size = 32

history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

In [None]:
#save the trained model to disk as hdf5 (.h5)
model.save('sentiment_analysis_model_2.h5')

In [None]:
#loss and accuracy measured on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}', f'Test Accuracy: {accuracy}', sep='\n')


Step 4: Using the model to predict sentiment

In [None]:
#function to use the model, outputs a boolean (negative=false, positive=true)
def predict_sentiment(sentence):
    """Classify a sentence with the freshly trained ``model``.

    The sentence is preprocessed the same way as the training text:
    lowercased, stripped of punctuation, tokenised, filtered for stopwords
    and stemmed. It is then encoded with the fitted ``tokenizer`` and padded
    to ``max_len`` before being passed to the model.

    Args:
        sentence: the raw comment text to classify.

    Returns:
        True when the model predicts the positive class (index 1, original
        label 4), False when it predicts the negative class (index 0,
        original label 0).
    """
    import re  #local import: only needed for the punctuation strip below

    #strip punctuation with the same pattern used when cleaning the training
    #text, so inference tokens match the vocabulary the tokenizer was fitted on
    sentence = re.sub(r'[^\w\s]', '', sentence.lower())
    tokens = [stemmer.stem(word) for word in word_tokenize(sentence) if word not in stop_words]
    cleaned_sentence = ' '.join(tokens)

    #encode and pad exactly like the training data
    sequence = tokenizer.texts_to_sequences([cleaned_sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)

    #predict; argmax picks the class with the highest softmax score
    prediction = model.predict(padded_sequence, verbose=0)
    return bool(np.argmax(prediction) == 1)

In [None]:
#user input
tmp = input('Comment: ')

print(tmp)

#use above function to predict
tmp2 = predict_sentiment(tmp)

#predict_sentiment returns a plain boolean, so test it directly
if tmp2:
    print('This comment has a positive sentiment.')
else:
    print('This comment has a negative sentiment.')

print('')

#count the words once; f-strings avoid the doubled spaces that print() adds
#between comma-separated arguments
word_count = len(tmp.split())
if word_count < 10:
    print(f'Your sentence is {word_count} words, it is a short sentence. To get an accurate analysis, please make your sentence at least {10 - word_count} words longer.')
else:
    print(f'Your sentence is {word_count} words long. The result should be accurate.')

Step 5: Use the saved model (no need for re-training)

In [None]:
#declare model to use as the saved hdf5 file (should be in the same directory as this notebook)
#the filename must match the one passed to model.save() in Step 3
save_model = tf.keras.models.load_model('sentiment_analysis_model_2.h5')

#summary of the model, just to make sure the model is actually detected
save_model.summary()

In [None]:
#rebuild the preprocessing objects used together with the saved model
max_words = 20000  #num ceil
max_len = 100  #len ceil

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

#NOTE(review): the tokenizer is re-fitted on `texts`, so the preprocessing
#cells that create `texts` still have to be run before this cell
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)




In [None]:
#this is the same function as the one above, but it uses the loaded save_model
def predict_sentiment_save(sentence):
    """Classify a sentence with the model loaded from disk (``save_model``).

    The sentence is preprocessed the same way as the training text:
    lowercased, stripped of punctuation, tokenised, filtered for stopwords
    and stemmed. It is then encoded with the fitted ``tokenizer`` and padded
    to ``max_len`` before being passed to the model.

    Args:
        sentence: the raw comment text to classify.

    Returns:
        True when the model predicts the positive class (index 1, original
        label 4), False when it predicts the negative class (index 0,
        original label 0).
    """
    import re  #local import: only needed for the punctuation strip below

    #strip punctuation with the same pattern used when cleaning the training
    #text, so inference tokens match the vocabulary the tokenizer was fitted on
    sentence = re.sub(r'[^\w\s]', '', sentence.lower())
    tokens = [stemmer.stem(word) for word in word_tokenize(sentence) if word not in stop_words]
    cleaned_sentence = ' '.join(tokens)

    #encode and pad exactly like the training data
    sequence = tokenizer.texts_to_sequences([cleaned_sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)

    #argmax picks the class with the highest softmax score
    prediction = save_model.predict(padded_sequence, verbose=0)
    return bool(np.argmax(prediction) == 1)

In [None]:
#same input flow as the one above, but using the saved model
tmp = input('Comment: ')

print('Your sentence: ', tmp)
print('')

tmp2 = predict_sentiment_save(tmp)

#predict_sentiment_save returns a plain boolean, so test it directly
if tmp2:
    print('This comment has a positive sentiment.')
else:
    print('This comment has a negative sentiment.')

print('')

#count the words once; f-strings avoid the doubled spaces that print() adds
#between comma-separated arguments
word_count = len(tmp.split())
if word_count < 10:
    print(f'Your sentence is {word_count} words, it is a short sentence. To get an accurate analysis, please make your sentence at least {10 - word_count} words longer.')
else:
    print(f'Your sentence is {word_count} words long. The result should be accurate.')