In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install keras_preprocessing

Mounted at /content/drive
Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2


## Top three models for sentiment analysis:


1.   CNN Models
2.   RNN Models
3.   LSTM-based Models


# Imports and pre-processing

In [None]:
import pandas as pd
import numpy as np
import bz2
import os
import re
import gc

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.utils import pad_sequences

from tensorflow.keras import models, layers, optimizers

from sklearn.model_selection import train_test_split

In [None]:
def assign_labels_and_comments(file, max_lines=20000):
    """Read labels and review text from a bz2-compressed, line-oriented file.

    Every line is decoded as UTF-8. The character at index 9 is parsed as an
    integer label and shifted down by one; everything from index 10 onwards is
    the comment, stripped of surrounding whitespace.
    NOTE(review): this presumably targets fastText-style lines such as
    ``__label__2 some text`` with single-digit labels - confirm against the data.

    Args:
        file: Path (or file object) accepted by ``bz2.BZ2File``.
        max_lines: Maximum number of lines to read from the start of the file.
            ``None`` reads the whole file. Defaults to 20000.

    Returns:
        A tuple ``(labels, comments)`` where ``labels`` is a NumPy array of the
        shifted integer labels and ``comments`` is a list of strings.

    Raises:
        IndexError / ValueError: if a line is too short or index 9 is not a digit.
    """
    labels = []
    comments = []
    # The context manager closes the archive even when a line fails to parse.
    with bz2.BZ2File(file) as archive:
        for line_number, raw_line in enumerate(archive):
            if max_lines is not None and line_number >= max_lines:
                break
            decoded = raw_line.decode("utf-8")
            labels.append(int(decoded[9]) - 1)
            comments.append(decoded[10:].strip())
    return np.array(labels), comments

In [None]:
# Single place to change if the Drive folder moves; the joined paths resolve to
# exactly the strings that were previously hard-coded twice.
DATA_DIR = '/content/drive/MyDrive/Intel SIP'
train_labels, train_comments = assign_labels_and_comments(os.path.join(DATA_DIR, 'train.ft.txt.bz2'))
test_labels, test_comments = assign_labels_and_comments(os.path.join(DATA_DIR, 'test.ft.txt.bz2'))
# Any non-word character (i.e. not a letter, digit or underscore) is turned into a space.
not_alphanumeric = re.compile(r'[\W]')
# After lower-casing, keep only ASCII letters, digits and whitespace.
# The class previously read ``0-1``, which silently deleted the digits 2-9.
not_ascii = re.compile(r'[^a-z0-9\s]')


def processed_comments(texts):
    """Normalise raw review strings for tokenisation.

    Each string is lower-cased, every non-word character is replaced by a
    single space, and any character that is still not ``a-z``, ``0-9`` or
    whitespace (underscores, accented letters, ...) is removed outright.

    Args:
        texts: Iterable of strings.

    Returns:
        A new list with one cleaned string per input, in the same order.
    """
    return [
        not_ascii.sub(r'', not_alphanumeric.sub(r' ', text.lower()))
        for text in texts
    ]
# Clean the raw text of both splits.
train_comments = processed_comments(train_comments)
test_comments = processed_comments(test_comments)

# Hold out 20% of the training reviews for validation (fixed seed).
train_comments, val_comments, train_labels, val_labels = train_test_split(
    train_comments, train_labels, test_size=0.2, random_state=42
)

# Fit the vocabulary on the training split only, capped at `maximum_features` words.
maximum_features = 14000
tokenizer = Tokenizer(num_words=maximum_features)
tokenizer.fit_on_texts(train_comments)

# Replace every split's text with its integer token sequences.
train_comments, val_comments, test_comments = [
    tokenizer.texts_to_sequences(split)
    for split in (train_comments, val_comments, test_comments)
]

# Pad (or truncate) every split to the longest *training* sequence.
maximum_length = max(map(len, train_comments))
train_comments_pad, val_comments_pad, test_comments_pad = [
    pad_sequences(split, maxlen=maximum_length)
    for split in (train_comments, val_comments, test_comments)
]

# The unpadded sequences are no longer needed once the padded arrays exist.
del train_comments, val_comments, test_comments

# 1. CNN Models



The CNN (Convolutional Neural Network) model for sentiment analysis is a deep learning approach that uses a neural network to classify the sentiment of text data. This model is particularly effective for analyzing large datasets, as it can learn to recognize patterns and relationships within the data.

In this model, the text data is first preprocessed and converted into numerical vectors. These vectors are then fed into a convolutional layer, which applies filters to the input data to extract features that are relevant to the sentiment analysis task. The output of the convolutional layer is then passed through a pooling layer, which reduces the dimensionality of the data and helps to prevent overfitting.



## Evaluation metric
F1 score: The F1 score combines precision and recall into a single metric, providing a balanced measure of model performance. It is especially useful when there is an imbalance between the classes.

## Computation time
Computation time is usually the lowest of the three methods.


In [None]:
def cnn_model():
    """Build, compile and summarise the 1-D convolutional sentiment classifier.

    Reads the module-level ``maximum_length`` (input width) and
    ``maximum_features`` (embedding vocabulary size). Prints the model summary
    as a side effect.

    Returns:
        The compiled Keras model; it ends in a single sigmoid unit and is
        trained with binary cross-entropy.
    """
    token_ids = layers.Input(shape=(maximum_length,))
    features = layers.Embedding(maximum_features, 64)(token_ids)

    # Two conv -> batch-norm -> max-pool stages; each stage uses the same
    # width for its convolution kernel and its pooling window.
    for width in (3, 5):
        features = layers.Conv1D(64, width, activation='relu')(features)
        features = layers.BatchNormalization()(features)
        features = layers.MaxPool1D(width)(features)

    features = layers.Conv1D(64, 5, activation='relu')(features)
    features = layers.GlobalMaxPool1D()(features)
    # NOTE(review): the tensor should already be 2-D after global pooling, so
    # this Flatten is presumably a no-op; kept so the layer stack is unchanged.
    features = layers.Flatten()(features)
    features = layers.Dense(100, activation='relu')(features)
    sentiment = layers.Dense(1, activation='sigmoid')(features)

    classifier = models.Model(inputs=token_ids, outputs=sentiment)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['binary_accuracy'],
    )
    classifier.summary()
    return classifier


model = cnn_model()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 203)]             0         
                                                                 
 embedding (Embedding)       (None, 203, 64)           896000    
                                                                 
 conv1d (Conv1D)             (None, 201, 64)           12352     
                                                                 
 batch_normalization (Batch  (None, 201, 64)           256       
 Normalization)                                                  
                                                                 
 max_pooling1d (MaxPooling1  (None, 67, 64)            0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 63, 64)            20544 

In [None]:
# Train the CNN for three epochs, scoring on the held-out validation split
# after each one. Left as a bare expression so the History repr is displayed.
model.fit(
    x=train_comments_pad,
    y=train_labels,
    validation_data=(val_comments_pad, val_labels),
    epochs=3,
    batch_size=512,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7da651198f40>

# 2. RNN Model
A Recurrent Neural Network (RNN) is a type of deep learning model that is well-suited for sequence-based tasks, such as sentiment analysis. In an RNN, the output from the previous step is fed back into the model as input for the current step, allowing the model to learn from previous inputs and make predictions based on context.

For sentiment analysis, an RNN can be trained on a dataset of text samples labeled with positive, negative, or neutral sentiment. The model learns to identify patterns in the text that are indicative of each sentiment class, and can then be used to predict the sentiment of new text samples.



## Evaluation metric
Cross-validation: Cross-validation is a technique used to assess the model's performance on multiple subsets of the data. It helps in estimating how well the model will generalize to unseen data.

## Computation time
Computation time is usually in line with (comparable to) that of the CNN model.


In [None]:
def rnn_model():
    """Build, compile and summarise the stacked-GRU sentiment classifier.

    Reads the module-level ``maximum_length`` (input width) and
    ``maximum_features`` (embedding vocabulary size). Prints the model summary
    as a side effect.

    Returns:
        The compiled Keras model; it ends in a single sigmoid unit and is
        trained with binary cross-entropy.
    """
    sequences = layers.Input(shape=(maximum_length,))
    embedded = layers.Embedding(maximum_features, 64)(sequences)
    # layers.GRU replaces the deprecated, GPU-only tf.compat.v1 CuDNNGRU.
    # Per the Keras docs, a GRU left at its default arguments can use the cuDNN
    # kernel on a GPU and otherwise falls back to the generic implementation,
    # so this cell also runs on a CPU runtime.
    x = layers.GRU(128, return_sequences=True)(embedded)
    x = layers.GRU(128)(x)
    x = layers.Dense(32, activation='relu')(x)
    x = layers.Dense(100, activation='relu')(x)
    predictions = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=sequences, outputs=predictions)
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    model.summary()
    return model

# NOTE: this rebinds the name `rnn_model` from the builder function to the
# built model; the training cell calls `rnn_model.fit`, so the name is kept.
rnn_model = rnn_model()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 203)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 203, 64)           896000    
                                                                 
 cu_dnngru (CuDNNGRU)        (None, 203, 128)          74496     
                                                                 
 cu_dnngru_1 (CuDNNGRU)      (None, 128)               99072     
                                                                 
 dense_2 (Dense)             (None, 32)                4128      
                                                                 
 dense_3 (Dense)             (None, 100)               3300      
                                                                 
 dense_4 (Dense)             (None, 1)                 101 

In [None]:
# Train the GRU network with the same data, batch size and epoch count as the
# CNN. Left as a bare expression so the History repr is displayed.
rnn_model.fit(
    x=train_comments_pad,
    y=train_labels,
    validation_data=(val_comments_pad, val_labels),
    epochs=3,
    batch_size=512,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7da6519ff460>

# LSTM preprocessing

In [None]:
# Vocabulary cap: passed as `num_words` to the LSTM tokenizer and used as the
# input size of the LSTM model's Embedding layer.
MAX_NB_WORDS = 10000
# `maxlen` handed to pad_sequences for the LSTM inputs.
MAX_SEQUENCE_LENGTH = 250
# Output width of the LSTM model's Embedding layer.
EMBEDDING_DIM = 100

In [None]:
def get_labels_and_texts(file, max_lines=20000, keep_fraction=0.01):
    """Read a small leading sample of labels and texts from a bz2 file.

    Up to ``max_lines`` lines are read from the start of the file, and only the
    first ``int(lines_read * keep_fraction)`` of them are parsed and returned.
    For each kept line, the character at index 9 is parsed as an integer label
    and shifted down by one; the text is everything from index 10 onwards,
    stripped of surrounding whitespace.
    NOTE(review): this presumably targets fastText-style lines such as
    ``__label__2 some text`` with single-digit labels - confirm against the data.

    Args:
        file: Path (or file object) accepted by ``bz2.BZ2File``.
        max_lines: Maximum number of lines to read; ``None`` reads the whole
            file. Defaults to 20000.
        keep_fraction: Fraction of the lines read that is kept. Defaults to 0.01.

    Returns:
        A tuple ``(labels, texts)``: a NumPy array of shifted integer labels
        and a list of strings of the same length.

    Raises:
        IndexError / ValueError: if a kept line is too short or index 9 is not
            a digit.
    """
    raw_lines = []
    # The context manager closes the archive even if reading fails part-way.
    with bz2.BZ2File(file) as archive:
        for line_number, raw_line in enumerate(archive):
            if max_lines is not None and line_number >= max_lines:
                break
            raw_lines.append(raw_line)

    # Decide how many lines survive before parsing, so discarded lines are
    # never decoded.
    keep = int(len(raw_lines) * keep_fraction)
    labels = []
    texts = []
    for raw_line in raw_lines[:keep]:
        decoded = raw_line.decode("utf-8")
        labels.append(int(decoded[9]) - 1)
        texts.append(decoded[10:].strip())
    return np.array(labels), texts
# Load the sampled reviews for the LSTM experiment. Note that `train_labels`
# and `test_labels` are rebound here, replacing the arrays used by the CNN/RNN cells.
train_labels, train_texts = get_labels_and_texts(
    '/content/drive/MyDrive/Intel SIP/train.ft.txt.bz2'
)
test_labels, test_texts = get_labels_and_texts(
    '/content/drive/MyDrive/Intel SIP/test.ft.txt.bz2'
)

# One row per review: raw text plus its integer label.
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
test_df = pd.DataFrame({'text': test_texts, 'label': test_labels})
# NOTE(review): this rebinds the name `re` - imported from the standard library
# in the notebook's first import cell - to the third-party `regex` package for
# every cell executed after this point.
import regex as re
import spacy
# NOTE(review): `nlp` is not referenced anywhere else in the visible notebook;
# confirm the spaCy model load is actually needed.
nlp = spacy.load('en_core_web_sm')
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.regexp import RegexpStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
def remove_special_characters(text):
    """Lower-case a Series of strings and strip digits, URLs and punctuation.

    The substitutions are applied in this order:
      1. runs of digits are removed;
      2. the literal token ``@mention`` becomes a space;
      3. ``http://`` / ``https://`` URLs become a space;
      4. ``www.<name>.com`` and bare ``<name>.com`` hosts become a space;
      5. the listed punctuation characters are removed.

    Args:
        text: pandas Series of strings.

    Returns:
        A new Series with the same index holding the cleaned strings.
    """
    cleaned = text.str.lower()
    substitutions = (
        (r'[0-9]+', ''),
        (r'@mention', ' '),
        (r'https?:\/\/\S+', ' '),
        # The previous pattern escaped the opening bracket (``\[a-z]?``), so it
        # only matched the literal text "[a-z" and left "www" behind.
        (r"www\.[a-z]*\.?(com)+|[a-z]+\.(com)", ' '),
        (r"[_\,\>\(\-:\)\\\/\!\.\^\!\:\];='#]", ''),
    )
    # Vectorised string replacement instead of a Python-level lambda per step.
    for pattern, replacement in substitutions:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    return cleaned
# Swap each frame's raw `text` column for its cleaned version.
train_df = train_df.assign(text=remove_special_characters(train_df['text']))
test_df = test_df.assign(text=remove_special_characters(test_df['text']))
from keras.preprocessing import text,sequence


# Fit a word-level tokenizer on the cleaned training text, keeping at most
# MAX_NB_WORDS words. This rebinds the name `tokenizer` used by the earlier cells.
tokenizer = text.Tokenizer(
    num_words=MAX_NB_WORDS,
    # Raw string: the previous plain literal relied on the invalid escape `\]`,
    # which newer Python versions warn about. The characters are unchanged.
    filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
    lower=True,
)
tokenizer.fit_on_texts(train_df['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
from keras_preprocessing.sequence import pad_sequences

# Integer-encode the cleaned training text and pad/truncate to a fixed length.
train_text = tokenizer.texts_to_sequences(train_df['text'].values)
train_text = pad_sequences(train_text, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot targets for the softmax head. The dtype is requested explicitly
# because the get_dummies default differs between pandas versions
# (uint8 before 2.0, bool from 2.0).
y = pd.get_dummies(train_df['label'], dtype='float32').values
X_train, X_test, Y_train, Y_test = train_test_split(train_text, y, test_size=0.10, random_state=42)

Found 3447 unique tokens.


# 3. LSTM-based Model
The main advantage of LSTM-based semantic analysis is its ability to capture long-term dependencies in the text. Unlike traditional Bag of Words models, LSTMs can take into account the sequence of words and capture the context and meaning of the text. This allows for more accurate analysis of the text and better performance in tasks that require understanding the meaning of the text.



## Evaluation metric
Confusion matrix: A confusion matrix provides a detailed breakdown of the model's predictions, showing the number of true positives, true negatives, false positives, and false negatives. It helps in understanding the specific types of errors made by the model.

## Computation time
Computation time is usually the highest of the three methods.


In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout,SpatialDropout1D,GlobalMaxPooling1D, Dense
import tensorflow as tf

# Embedding -> spatial dropout -> LSTM -> dense head with a 2-way softmax.
# This rebinds the name `model`, which previously held the CNN.
model = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_text.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=128, activation='relu'),
    Dense(2, activation='softmax'),
])

# Categorical cross-entropy pairs with the one-hot targets built earlier.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 250, 100)          1000000   
                                                                 
 spatial_dropout1d (Spatial  (None, 250, 100)          0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense_5 (Dense)             (None, 128)               12928     
                                                                 
 dense_6 (Dense)             (None, 2)                 258       
                                                                 
Total params: 1093586 (4.17 MB)
Trainable params: 1093586 (4.17 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [None]:
# Training hyper-parameters for the LSTM run.
epochs = 3
batch_size = 128

# Fit on the training portion; Keras holds back 10% of it for validation.
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Conclusion:


### An LSTM (Long Short-Term Memory) based model would be preferred in production for sentiment analysis over CNN (Convolutional Neural Network) and traditional RNN models. The reasons are given below:

1. ***Capturing Long-Term Dependencies***: LSTM models are designed to capture long-term dependencies in sequential data. Sentences can have complex structures and dependencies between words that extend over long distances. LSTMs with their memory cells and gates can effectively capture and remember these dependencies, making them better suited for sentiment analysis tasks.

2. ***Handling Variable-Length Sequences***: Sentences in sentiment analysis can vary in length. LSTM models can handle variable-length sequences by processing the input step-by-step, dynamically adjusting their internal state based on the input at each time step. This flexibility makes LSTMs more suitable for sentiment analysis tasks where the length of input text can vary.

3. ***Dealing with Contextual Information***: Sentiment analysis often requires understanding the context and meaning of words within a sentence. LSTMs excel at capturing contextual information as they maintain an internal memory state that can retain relevant information from earlier parts of the sentence. This allows LSTMs to better understand the sentiment expressed in the text.
