*Sentiment Analysis of Online Comments*

In [1]:
%conda install pandas numpy tensorflow sklearn keras nltk

Channels:
 - defaults
 - conda-forge
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: failed

PackagesNotFoundError: The following packages are not available from current channels:

  - sklearn

Current channels:

  - defaults
  - https://conda.anaconda.org/conda-forge/noarch
  - https://conda.anaconda.org/conda-forge/osx-arm64

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.



Note: you may need to restart the kernel to use updated packages.


In [2]:
#import libs
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import keras.api._v2.keras as keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
from nltk.tokenize import word_tokenize

# Download the NLTK data packages this notebook relies on: 'punkt' for
# word_tokenize and 'stopwords' for stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/coen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/coen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the tweet CSV into pandas. Column names are supplied explicitly, so
# every row of the file is treated as data.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    names=["sentiment", "ids", "date", "flag", "user", "text"],
)

In [4]:
# Discard the metadata columns in a single call; only the label and the
# tweet body are used from here on.
df.drop(columns=['ids', 'date', 'flag', 'user'], inplace=True)

# Show the frame to confirm that only 'sentiment' and 'text' are left.
df

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [5]:
#cleaning text
def caydranisabum():
    """Clean the ``text`` column of the global ``df`` in place.

    Steps, in order:
      1. drop rows whose ``text`` is missing,
      2. drop rows whose ``text`` duplicates an earlier row,
      3. lowercase the text and remove every character that is neither a
         word character nor whitespace.

    Duplicates are removed before step 3, so two tweets that only become
    identical after normalisation are both kept.
    """
    df.dropna(subset=['text'], inplace=True)  # drop missing values
    df.drop_duplicates(subset=['text'], inplace=True)  # drop exact duplicates
    # Raw string: \w and \s must reach the regex engine untouched. In a plain
    # string literal they are invalid escape sequences and trigger a warning.
    df['text'] = df['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

caydranisabum()
df['text']


0          switchfoot httptwitpiccom2y1zl  awww thats a b...
1          is upset that he cant update his facebook by t...
2          kenichan i dived many times for the ball manag...
3            my whole body feels itchy and like its on fire 
4          nationwideclass no its not behaving at all im ...
                                 ...                        
1599995    just woke up having no school is the best feel...
1599996    thewdbcom  very cool to hear old walt intervie...
1599997    are you ready for your mojo makeover ask me fo...
1599998    happy 38th birthday to my boo of alll time tup...
1599999    happy charitytuesday thenspcc sparkscharity sp...
Name: text, Length: 1581466, dtype: object

In [6]:
# Split each cleaned tweet into a list of word tokens using NLTK.
df['tokens'] = df['text'].map(word_tokenize)

In [10]:
# Strip English stop words out of every token list.
stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, order preserved."""
    return [token for token in tokens if token not in stop_words]


df['tokens'] = df['tokens'].apply(_without_stopwords)

df['tokens']

0          [switchfoot, httptwitpiccom2y1zl, awww, thats,...
1          [upset, cant, update, facebook, texting, might...
2          [kenichan, dived, many, times, ball, managed, ...
3                    [whole, body, feels, itchy, like, fire]
4            [nationwideclass, behaving, im, mad, cant, see]
                                 ...                        
1599995                  [woke, school, best, feeling, ever]
1599996    [thewdbcom, cool, hear, old, walt, interviews,...
1599997                [ready, mojo, makeover, ask, details]
1599998    [happy, 38th, birthday, boo, alll, time, tupac...
1599999    [happy, charitytuesday, thenspcc, sparkscharit...
Name: tokens, Length: 1581466, dtype: object

In [11]:
# Reduce every token to its Porter stem.
stemmer = PorterStemmer()


def _stem_all(tokens):
    """Return a new list with each token replaced by its stem."""
    return list(map(stemmer.stem, tokens))


df['tokens'] = df['tokens'].apply(_stem_all)

df['tokens']

0          [switchfoot, httptwitpiccom2y1zl, awww, that, ...
1          [upset, cant, updat, facebook, text, might, cr...
2          [kenichan, dive, mani, time, ball, manag, save...
3                     [whole, bodi, feel, itchi, like, fire]
4               [nationwideclass, behav, im, mad, cant, see]
                                 ...                        
1599995                     [woke, school, best, feel, ever]
1599996    [thewdbcom, cool, hear, old, walt, interview, ...
1599997                   [readi, mojo, makeov, ask, detail]
1599998    [happi, 38th, birthday, boo, alll, time, tupac...
1599999    [happi, charitytuesday, thenspcc, sparkschar, ...
Name: tokens, Length: 1581466, dtype: object

In [12]:
# Rebuild one space-separated string per tweet from its processed tokens.
df['cleaned_text'] = df['tokens'].map(' '.join)

df

Unnamed: 0,sentiment,text,tokens,cleaned_text
0,0,switchfoot httptwitpiccom2y1zl awww thats a b...,"[switchfoot, httptwitpiccom2y1zl, awww, that, ...",switchfoot httptwitpiccom2y1zl awww that bumme...
1,0,is upset that he cant update his facebook by t...,"[upset, cant, updat, facebook, text, might, cr...",upset cant updat facebook text might cri resul...
2,0,kenichan i dived many times for the ball manag...,"[kenichan, dive, mani, time, ball, manag, save...",kenichan dive mani time ball manag save 50 res...
3,0,my whole body feels itchy and like its on fire,"[whole, bodi, feel, itchi, like, fire]",whole bodi feel itchi like fire
4,0,nationwideclass no its not behaving at all im ...,"[nationwideclass, behav, im, mad, cant, see]",nationwideclass behav im mad cant see
...,...,...,...,...
1599995,4,just woke up having no school is the best feel...,"[woke, school, best, feel, ever]",woke school best feel ever
1599996,4,thewdbcom very cool to hear old walt intervie...,"[thewdbcom, cool, hear, old, walt, interview, ...",thewdbcom cool hear old walt interview â httpb...
1599997,4,are you ready for your mojo makeover ask me fo...,"[readi, mojo, makeov, ask, detail]",readi mojo makeov ask detail
1599998,4,happy 38th birthday to my boo of alll time tup...,"[happi, 38th, birthday, boo, alll, time, tupac...",happi 38th birthday boo alll time tupac amaru ...


In [13]:
# 'text' and 'tokens' were only stepping stones towards 'cleaned_text'.
df.drop(columns=['text', 'tokens'], inplace=True)

df

Unnamed: 0,sentiment,cleaned_text
0,0,switchfoot httptwitpiccom2y1zl awww that bumme...
1,0,upset cant updat facebook text might cri resul...
2,0,kenichan dive mani time ball manag save 50 res...
3,0,whole bodi feel itchi like fire
4,0,nationwideclass behav im mad cant see
...,...,...
1599995,4,woke school best feel ever
1599996,4,thewdbcom cool hear old walt interview â httpb...
1599997,4,readi mojo makeov ask detail
1599998,4,happi 38th birthday boo alll time tupac amaru ...


In [16]:
# Extract the model inputs and targets from the frame as NumPy arrays.
texts = df['cleaned_text'].to_numpy()
labels = df['sentiment'].to_numpy()

print('cleaned text', texts)
print('labels', labels)

cleaned text ['switchfoot httptwitpiccom2y1zl awww that bummer shoulda got david carr third day'
 'upset cant updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save 50 rest go bound' ...
 'readi mojo makeov ask detail'
 'happi 38th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph4h']
labels [0 0 0 ... 4 4 4]


Data is ready: `texts` holds the cleaned tweets and `labels` holds the matching sentiment values.

In [17]:
#convert to bin
from keras.utils import to_categorical

# Build the one-hot targets from the untouched df['sentiment'] column instead
# of from `labels` itself. The previous form overwrote its own input, so
# running the cell a second time re-encoded data that was already one-hot.
# The sentiment values shown above are 0 and 4; integer division by 4 turns
# them into class ids 0 and 1 before one-hot encoding.
labels = to_categorical(df['sentiment'].to_numpy() // 4, num_classes=2)

print(labels)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]


In [18]:
max_words = 20000  # vocabulary cap passed to the tokenizer as num_words
max_len = 100  # length every encoded tweet is padded to

# Learn the vocabulary from the cleaned tweets, then encode each tweet as a
# list of integer word ids.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

In [None]:
# Bring every encoded tweet to exactly max_len ids so they stack into one array.
data = pad_sequences(
    sequences,
    maxlen=max_len,
)

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Network: Embedding -> Conv1D -> MaxPooling1D -> LSTM -> softmax classifier.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(tf.keras.layers.Conv1D(128, 5, activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(pool_size=5))
model.add(tf.keras.layers.LSTM(128))
# Two softmax units, one per column of the one-hot labels.
model.add(tf.keras.layers.Dense(2, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)



In [None]:
# Training settings; the epoch count can be raised for a longer run.
batch_size = 32
epochs = 10

# A further 20% of the training split is used for validation during fitting.
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

In [None]:
# Write the trained model to an .h5 file.
# NOTE(review): the load cells further down read 'sentiment_analysis_model.h5',
# not this '_2' file — confirm which file is the intended one.
model.save('sentiment_analysis_model_2.h5')

In [None]:
# Score the trained model on the held-out test split and report both metrics.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
for metric_line in (f'Test Loss: {loss}', f'Test Accuracy: {accuracy}'):
    print(metric_line)


In [None]:
# Load a saved model from disk and print its layer summary.
# NOTE(review): this reads 'sentiment_analysis_model.h5', whereas the save
# cell writes 'sentiment_analysis_model_2.h5' — confirm which file is intended.
# NOTE(review): `sentimentanalysis` is not referenced by the prediction cells
# visible in this notebook (they call `model` and `save_model`).
sentimentanalysis = tf.keras.models.load_model('sentiment_analysis_model.h5')

sentimentanalysis.summary()

In [None]:
def predict_sentiment(sentence):
    """Classify ``sentence`` with the in-memory ``model``.

    The sentence goes through the same preparation as the training text:
    lowercase, remove every character that is neither a word character nor
    whitespace, tokenize, drop stop words, stem, encode with the fitted
    ``tokenizer`` and pad to ``max_len``.

    Args:
        sentence: Raw text to classify.

    Returns:
        True when the highest-scoring class is index 1 (trained from
        sentiment value 4), False when it is index 0 (sentiment value 0).
    """
    import re  # local import: only this function needs it

    # The training text had punctuation stripped before tokenization. Without
    # the same step here, a word such as "can't" is split into pieces that
    # never match the "cant" the tokenizer was fitted on.
    sentence = re.sub(r'[^\w\s]', '', sentence.lower())
    tokens = [
        stemmer.stem(word)
        for word in word_tokenize(sentence)
        if word not in stop_words
    ]
    cleaned_sentence = ' '.join(tokens)

    sequence = tokenizer.texts_to_sequences([cleaned_sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)

    #predict
    prediction = model.predict(padded_sequence, verbose=0)
    sentiment_class = int(np.argmax(prediction))

    # Class index 1 corresponds to sentiment 4, index 0 to sentiment 0.
    # Returning the comparison directly always yields a bool, never None.
    return sentiment_class == 1

In [None]:
# Ask for a comment, echo it, and classify it with the in-memory model.
tmp = input('Comment: ')

print(tmp)

tmp2 = predict_sentiment(tmp)

# Map the boolean result to its message; any other value prints nothing.
verdict_messages = {
    True: 'This comment has a positive sentiment.',
    False: 'This comment has a negative sentiment.',
}
verdict = verdict_messages.get(tmp2)
if verdict is not None:
    print(verdict)

print('')

# Tell the user whether the input has at least 10 whitespace-separated words.
word_count = len(tmp.split())
if word_count >= 10:
    print('Your sentence is ', word_count, ' words long. The result should be accurate.')
else:
    print('Your sentence is ', word_count, ' words, it is a short sentence. To get an accurate analysis, please make your sentence at least', 10 - word_count, ' words longer.')

Load the saved H5 model and use it for predictions

In [24]:
# Load a saved model from disk into `save_model` (the object that
# predict_sentiment_save calls) and print its layer summary.
# NOTE(review): this reads 'sentiment_analysis_model.h5', whereas the save
# cell writes 'sentiment_analysis_model_2.h5' — confirm which file is intended.
save_model = tf.keras.models.load_model('sentiment_analysis_model.h5')

save_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 128)          2560000   
                                                                 
 conv1d_2 (Conv1D)           (None, 96, 128)           82048     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 19, 128)          0         
 1D)                                                             
                                                                 
 lstm_2 (LSTM)               (None, 128)               131584    
                                                                 
 dense_2 (Dense)             (None, 2)                 258       
                                                                 
Total params: 2,773,890
Trainable params: 2,773,890
Non-trainable params: 0
____________________________________________

In [14]:
def predict_sentiment_save(sentence):
    """Classify ``sentence`` with the model loaded into ``save_model``.

    The sentence goes through the same preparation as the training text:
    lowercase, remove every character that is neither a word character nor
    whitespace, tokenize, drop stop words, stem, encode with the fitted
    ``tokenizer`` and pad to ``max_len``.

    Args:
        sentence: Raw text to classify.

    Returns:
        True when the highest-scoring class is index 1 (trained from
        sentiment value 4), False when it is index 0 (sentiment value 0).
    """
    import re  # local import: only this function needs it

    # The training text had punctuation stripped before tokenization. Without
    # the same step here, a word such as "can't" is split into pieces that
    # never match the "cant" the tokenizer was fitted on.
    sentence = re.sub(r'[^\w\s]', '', sentence.lower())
    tokens = [
        stemmer.stem(word)
        for word in word_tokenize(sentence)
        if word not in stop_words
    ]
    cleaned_sentence = ' '.join(tokens)

    sequence = tokenizer.texts_to_sequences([cleaned_sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)

    #predict
    prediction = save_model.predict(padded_sequence, verbose=0)
    sentiment_class = int(np.argmax(prediction))

    # Class index 1 corresponds to sentiment 4, index 0 to sentiment 0.
    # Returning the comparison directly always yields a bool, never None.
    return sentiment_class == 1

In [26]:
# Ask for a comment, echo it, and classify it with the model loaded from disk.
tmp = input('Comment: ')

print(tmp)
print('')

tmp2 = predict_sentiment_save(tmp)

# Map the boolean result to its message; any other value prints nothing.
verdict_messages = {
    True: 'This comment has a positive sentiment.',
    False: 'This comment has a negative sentiment.',
}
verdict = verdict_messages.get(tmp2)
if verdict is not None:
    print(verdict)

print('')

# Tell the user whether the input has at least 10 whitespace-separated words.
word_count = len(tmp.split())
if word_count >= 10:
    print('Your sentence is ', word_count, ' words long. The result should be accurate.')
else:
    print('Your sentence is ', word_count, ' words, it is a short sentence. To get an accurate analysis, please make your sentence at least', 10 - word_count, ' words longer.')

rizz

This comment has a positive sentiment.

Your sentence is  1  words, it is a short sentence. To get an accurate analysis, please make your sentence at least 9  words longer.
