# Sentiment Analysis 2

## Table of Contents:
Sentiment Analysis of Movie Reviews:
1. EDA
2. Data Preprocessing
3. Feature Extraction
4. ML modelling

### Importing Libraries

In [None]:
# Linear algebra
import numpy as np
# EDA
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# NLTK libraries
import nltk
nltk.download('all')    # After running all, comment out this line to stop redownloading nltk every time
# Stopwords
from nltk.corpus import stopwords
# Stemmer & Lemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
# Wordcloud
from wordcloud import WordCloud,STOPWORDS
# Tokenizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
# RE
import re,string,unicodedata
# Bag of Words
from textblob import TextBlob
from textblob import Word
# Feature Extraction
from sklearn.model_selection import train_test_split, cross_val_score,StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
# ML models
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# Metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
# Web Scraping tool
from bs4 import BeautifulSoup

### Importing dataset

In [None]:
# data = pd.read_csv('IMDB.csv')

In [None]:
# # Extend the dataframe display size
# pd.options.display.max_colwidth = 110

### Inspecting dataset

In [None]:
# data.head()

In [None]:
# data.shape

## 1. EDA

In [None]:
# # Summary of the dataset
# data.describe()

In [None]:
# # Counting sentiments
# data['sentiment'].value_counts()

## 2. Data Preprocessing


In [None]:
# # Quick visualisation of dataset (First 5 + Last 5 rows)
# data

In [None]:
# # Make a copy of the 'data' dataframe to work off from
# data_IMDB = data.copy()

### Tokenizing

In [None]:
# Initialize the tokenizer


# Setting English stopwords


In [None]:
# Example text to tokenize: one short English sentence ending in a period,
# kept in `text` as the input for the tokenization step that follows
text = "This is an example sentence for tokenization."

# Tokenize the text


# # Print the tokens
# print(tokens)

### Text Processor

In [None]:
# Removing the html strips
def strip_html(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed by BeautifulSoup using the ``html.parser`` backend,
    and the parsed document's ``get_text()`` result is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

# Removing the square brackets
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span from ``text``, brackets included.

    A span starts at a ``[`` and runs up to the first following ``]``.
    Text outside such spans is left untouched, so whitespace that
    surrounded a removed span stays in place.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all bracketed spans removed.
    """
    # Raw string: in a plain literal '\[' and '\]' are invalid escape
    # sequences, which newer Python versions warn about at compile time.
    return re.sub(r'\[[^]]*\]', '', text)

# Removing special characters
def remove_special_characters(text, remove_digits=True):
    """Strip every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: The string to clean.
        remove_digits: Accepted for interface compatibility. This
            implementation does not consult it, so digits are always kept.
            NOTE(review): decide whether digits should really be dropped
            when this flag is true before wiring it in.

    Returns:
        ``text`` with all other characters removed.
    """
    # 'A-Z', not 'A-z': the range A-z also spans the ASCII characters between
    # 'Z' and 'a' ([ \ ] ^ _ `), which would let those symbols slip through.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [None]:
# Example of our 3 defined functions
# Sample with nested HTML tags (<p>, <b>, <i>) -- presumably the input for
# strip_html
html_text = "<p>This is <b>bold</b> and <i>italic</i>.</p>"



# Sample with three bracketed spans -- presumably the input for
# remove_between_square_brackets
input_text = "This is [some text] with [multiple] sets of [square brackets]."



# Sample with punctuation, a symbol and digits -- presumably the input for
# remove_special_characters. NOTE: this rebinds input_text, replacing the
# bracket sample assigned earlier in this cell.
input_text = "Hello, @world! 123"



In [None]:
# Removing the noisy text
def denoise_text(text):
    """Clean ``text`` by running it through the three noise-removal helpers.

    The helpers are applied in a fixed order, each receiving the previous
    one's output: ``strip_html``, then ``remove_between_square_brackets``,
    then ``remove_special_characters`` (called with its default arguments).
    """
    cleaning_steps = (
        strip_html,
        remove_between_square_brackets,
        remove_special_characters,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

# Apply function on review column


In [None]:
# data_IMDB['review']

In [None]:
# Lemmatizing the text
def simple_lemmatize(text):
    """Lemmatize each whitespace-separated word of ``text``.

    Words are obtained with ``str.split()`` and passed one at a time to
    ``WordNetLemmatizer.lemmatize`` without a part-of-speech argument. The
    results are joined back with single spaces, in the original word order.

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmatized words joined by single spaces; an empty string when
        ``text`` contains no words.
    """
    lemmatizer = WordNetLemmatizer()
    # Ordered generator, not a set: a set drops repeated words and does not
    # keep word order, which scrambles the sentence.
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

# Apply function on review column


In [None]:
# Example of lemmatizing text using sample sentence
# (two short sentences containing inflected verbs and a plural noun)
input_text = "I am running and eating. The cars are running fast."



### Initial Data-Preprocessing Verdict
Lemmatizer is not as accurate as we want it to be
- Sentences are garbled and in a mess

### Part-Of-Speech (POS) tagging
Implement Part-Of-Speech (POS) tagging to improve accuracy.

This helps the algorithm understand the grammatical structure and meaning of a text.

For example, consider the sentence: "The cat is sleeping on the mat."

POS tagging would assign the following tags:
- "The" - determiner (DT)
- "cat" - noun (NN)
- "is" - verb (VBZ)
- "sleeping" - verb (VBG)
- "on" - preposition (IN)
- "the" - determiner (DT)
- "mat" - noun (NN)

In [None]:
# from nltk import pos_tag

# # Download NLTK resources
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

# # Function to process the pos_tag
# def get_wordnet_pos(tag):
#     if tag.startswith('N'):
#         return 'n'  # Noun
#     elif tag.startswith('V'):
#         return 'v'  # Verb
#     elif tag.startswith('R'):
#         return 'r'  # Adverb
#     elif tag.startswith('J'):
#         return 'a'  # Adjective
#     else:
#         return 'n'  # Default to noun for unknown or uncategorized words

# # Redefining the lemmatizer function
# def simple_lemmatize(text):
#     lemmatizer = WordNetLemmatizer()
#     tokens = word_tokenize(text)
#     pos_tags = pos_tag(tokens)
#     lemmatized_tokens = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in pos_tags]
#     lemmatized_text = ' '.join(lemmatized_tokens)
#     return lemmatized_text

In [None]:
# Test if the new function works as intended
# (same sample sentence as the earlier lemmatizer example, so the two
# outputs can be compared directly)
input_text = "I am running and eating. The cars are running fast."



In [None]:
# # Can we apply the new function on review column?
# data_IMDB['review'] = data_IMDB['review'].apply(simple_lemmatize)
# # We are not going to do it this way; computationally expensive, time consuming

In [None]:
# # Set stopwords to English
# stop = set(stopwords.words('english'))
# print(stop)

# # Removing the stopwords
# def remove_stopwords(text, is_lower_case=False):
#     tokens = tokenizer.tokenize(text)
#     tokens = [token.strip() for token in tokens]
#     if is_lower_case:
#         filtered_tokens = [token for token in tokens if token not in stopword]
#     else:
#         filtered_tokens = [token for token in tokens if token.lower() not in stopword]
#     filtered_text = ' '.join(filtered_tokens)
#     return filtered_text

# # Example to test out our stopword-removing function
# input_text = "This is an example sentence with some stopwords."
# filtered_text = remove_stopwords(input_text, is_lower_case=True)
# print(filtered_text)

In [None]:
# Apply function on 'review' column


### Text Normalisation

In [None]:
# Set a variable for the normalized dataframe and add the data_IMDB



In [None]:
# norm_data_IMDB.shape

## 3. Feature Extraction

### Method 1: Bag of Words

The "Bag of Words" (BoW) model is a common and simple representation used in natural language processing (NLP) and information retrieval.

It's a way of converting text data into numerical vectors that can be used by machine learning algorithms.

TLDR: BoW is based on raw word counts and is suitable when you want to capture how frequently each word appears in a document.

In [None]:
# Example documents in list form: four short sentences that share most of
# their vocabulary, serving as a small sample corpus for this example
documents = ["This is the first document.",
              "This document is the second document.",
              "And this is the third one.",
              "Is this the first document?"]

# Create an instance of the CountVectorizer class, where ngram ranges from 1 word to 3 words
# Unigram = single word / Bigram = 2 words / Trigram = 3 words


# Fit and transform the documents into a Bag of Words representation


# Get the feature names (words) that correspond to the columns in the Bag of Words matrix


# Convert the Bag of Words matrix to an array for better visualization


# DataFrame for better visualization


# Display the DataFrame


In [None]:
# # Fitting our data into the CountVectorizer
# vect = CountVectorizer(ngram_range=(1,3)).fit(norm_data_IMDB['review'])

In [None]:
# Getting the feature names from the vectorised features


In [None]:
# feature_names

In [None]:
# norm_data_IMDB['review'].head()

In [None]:
# Extract the feature 'review'


In [None]:
# X_cv.shape

In [None]:
# Extract the target 'sentiment'


In [None]:
# Transforming the feature 'review' data


In [None]:
# X_cv.shape

### Method 2: TF-IDF

Term Frequency (TF):

The TF component measures how often a term appears in a document. It's a raw count of the number of times the term occurs within the document.
TF is calculated for each term within each document.

Inverse Document Frequency (IDF):

The IDF component evaluates how important a term is across the entire corpus (the entire body of text). It's a measure of how unique or rare a term is.
Terms that appear frequently in many documents have a lower IDF, while terms that appear in a smaller subset of documents have a higher IDF.

TLDR: TF-IDF considers not only the frequency of words but also their importance across the entire set of documents. It helps emphasize words that are more discriminative and less common across documents.

In [None]:
# Create TFIDF vectorizer

In [None]:
# Apply TFIDF transformer to 'review' column


In [None]:
# tfidf.get_feature_names_out()

In [None]:
# print(X_tf.shape)

In [None]:
# Extract the target 'sentiment'


### Labelling the 'sentiment' text

In [None]:
# Setting up the LabelBinarizer


# # Transforming and Labelling the 'sentiment' data
# sentiment_data = lb.fit_transform(data_IMDB['sentiment'])
# print(sentiment_data.shape)

## 4. ML Modelling

### Model 1: Logistic Regression

#### Logistic Regression - Bag of Words Model

In [None]:
# Setting up the LogisticRegression model


In [None]:
# # Split arrays/matrices into random train and test subsets. In this case, 80:20 for Train:Test ratio
# x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(X_cv, Y_cv, test_size=0.2, random_state=42)

In [None]:
# Fitting the lr model for Bag of Words



# Predicting the lr model for Bag of Words



#### Logistic Regression - TFIDF Model

In [None]:
# # Split arrays/matrices into random train and test subsets. In this case, 80:20 for Train:Test ratio
# x_train_tf, x_test_tf, y_train_tf, y_test_tf = train_test_split(X_tf, Y_tf, test_size=0.2, random_state=42)

In [None]:
# # Fitting the lr model for TFIDF features
# lr_tfidf = lr.fit(x_train_tf, y_train_tf)
# print(lr_tfidf)

# # Predicting the lr model for TFIDF features
# lr_tfidf_predict = lr.predict(x_test_tf)
# print(lr_tfidf_predict)

#### Logistic Regression - Accuracy Scores & Classification Report for both Models

In [None]:
# Accuracy score for Bag of Words



# Accuracy score for TFIDF features



In [None]:
# Classification report for Bag of Words



# Classification report for TFIDF features



#### Logistic Regression - Confusion Matrix for both Models

##### For Bag of Words Model

##### For TFIDF Model

In [None]:
# cm_tf = confusion_matrix(y_test_tf, lr_tfidf_predict, labels=lr.classes_)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm_tf, display_labels=lr.classes_)
# disp.plot()

### Model 2: Multinomial Naive Bayes (MNB)

#### MNB - Bag of Words Model

In [None]:
# Training the Multinomial Naive Bayes model



# Fitting the MNB for Bag of Words



# Predicting the model for Bag of Words



#### MNB - TFIDF Model

In [None]:
# # Fitting the MNB for TFIDF features
# mnb_tfidf = mnb.fit(x_train_tf, y_train_tf)
# print(mnb_tfidf)

# # Predicting the MNB model for TFIDF features
# mnb_tfidf_predict = mnb.predict(x_test_tf)
# print(mnb_tfidf_predict)

#### MNB - Accuracy Scores for both Models

In [None]:
# # Accuracy score for Bag of Words
# mnb_bow_score = accuracy_score(y_test_cv, mnb_bow_predict)
# print('mnb_bow_score : {:.2f}%'.format(mnb_bow_score*100))

# # Accuracy score for TFIDF features
# mnb_tfidf_score = accuracy_score(y_test_tf, mnb_tfidf_predict)
# print('mnb_tfidf_score : {:.2f}%'.format(mnb_tfidf_score*100))

#### MNB - Confusion Matrix for both Models

##### For Bag of Words Model

In [None]:
# cm_cv_mnb = confusion_matrix(y_test_cv, mnb_bow_predict, labels=mnb.classes_)
# disp = ConfusionMatrixDisplay(confusion_matrix = cm_cv_mnb, display_labels = mnb.classes_)
# disp.plot()

##### For TFIDF Model

In [None]:
# cm_tf_mnb = confusion_matrix(y_test_tf, mnb_tfidf_predict, labels=mnb.classes_)
# disp = ConfusionMatrixDisplay(confusion_matrix = cm_tf_mnb, display_labels = mnb.classes_)
# disp.plot()