<a href="https://colab.research.google.com/github/Dreadnought73/AI_projects/blob/main/Sentiment_analysis_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training ML models for sentiment analysis

In [None]:
import nltk
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import joblib
import tensorflow as tf
import keras
from keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical

Define a function for preprocessing text: convert it to lowercase, remove special characters, numbers, and punctuation, tokenize it, drop stop words, and lemmatize each remaining token to its base form.

In [None]:
# Fetch the NLTK resources this notebook relies on: 'stopwords' backs the
# stop-word list below; 'punkt_tab' and 'wordnet' are presumably the data
# needed by word_tokenize and WordNetLemmatizer respectively.
for resource_name in ('punkt_tab', 'stopwords', 'wordnet'):
  nltk.download(resource_name)

# Shared by preprocess_text: a set gives fast membership tests per token.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
  """
  Normalize a raw text into a space-separated string of lemmatized tokens.

  Args:
  text: the raw string to clean

  Return:
  the cleaned text: lowercased, stripped of everything except the letters
  a-z and whitespace, tokenized, with English stop words removed and the
  remaining tokens lemmatized
  """
  lowered = text.lower()
  # Anything that is not a lowercase letter or whitespace (digits, punctuation, symbols) is dropped
  letters_only = re.sub(r'[^a-z\s]', '', lowered)

  kept = []
  for word in word_tokenize(letters_only):
    if word in stop_words:
      continue
    kept.append(lemmatizer.lemmatize(word)) # reduce the word to its lemma
  return ' '.join(kept)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def load_and_preprocess_data(file_path):
  """
  Read a CSV file and add a cleaned version of its 'text' column.

  Args:
  file_path: location of the CSV file; it must contain a 'text' column

  Return:
  the loaded DataFrame with an extra 'preprocessed_text' column holding
  preprocess_text applied to every 'text' entry
  """
  reviews = pd.read_csv(file_path)
  cleaned_column = reviews['text'].apply(preprocess_text)
  reviews['preprocessed_text'] = cleaned_column
  return reviews

Using a scikit-learn Pipeline, define a function that vectorizes the training texts with TF-IDF and then trains the selected model on the result.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

def train_model(model_type, X_train, y_train):
  """
  Fit a TF-IDF + classifier pipeline for the requested model.

  Args:
  model_type: model abbreviation: 'rf' (Random Forest), 'lr' (Logistic Regression) or 'nb' (Multinomial Naive Bayes)
  X_train: training texts, fed to the TF-IDF vectorizer
  y_train: training labels

  Return:
  pipeline: the fitted Pipeline, with steps named 'tf' and 'model_type'

  Raises:
  ValueError: if model_type is not one of the supported abbreviations
  """
  # Reject unknown abbreviations before any estimator is constructed
  if model_type not in ('rf', 'lr', 'nb'):
    raise ValueError("Invalid model type. Choose from 'rf', 'lr', 'nb'.")

  # Factories rather than instances, so only the requested classifier is built
  estimator_factories = {
      'rf': lambda: RandomForestClassifier(n_estimators=50, random_state=42),
      'lr': lambda: LogisticRegression(max_iter=500, C=1.0), # C: inverse regularization strength
      'nb': lambda: MultinomialNB(),
  }
  estimator = estimator_factories[model_type]()

  # max_features keeps only the 5000 most frequent terms;
  # ngram_range=(1,2) makes both single words and word pairs count as terms
  vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

  steps = [
      ('tf', vectorizer),
      ('model_type', estimator),
  ]
  pipeline = Pipeline(steps)
  pipeline.fit(X_train, y_train)
  return pipeline

In [None]:
def evaluate_model(pipeline, X_test, y_test):
  """
  Print a classification report and a confusion matrix for a fitted pipeline.

  Args:
  pipeline: fitted estimator exposing predict()
  X_test: held-out texts passed to pipeline.predict
  y_test: true labels for X_test; assumed to use the 0 = negative /
    1 = positive encoding that predict_sentiment decodes

  Return:
  None; the results are printed
  """
  # classification_report pairs target_names with the labels in the order
  # given, so label 0 must be named 'negative' and label 1 'positive'.
  # Listing them the other way round swaps the per-class rows of the report.
  label_ids = [0, 1]
  label_names = ['negative', 'positive']

  y_pred = pipeline.predict(X_test)

  print("Classification Report:")
  print(classification_report(y_test, y_pred, labels=label_ids, target_names=label_names))

  # Same explicit label order, so rows/columns line up with the report above
  print("Confusion matrix:")
  print(confusion_matrix(y_test, y_pred, labels=label_ids))

This function predicts the sentiment of completely new sentences.

In [None]:
def predict_sentiment(pipeline, sentences):
  """
  Predict a sentiment name for each raw sentence.

  Args:
  pipeline: fitted estimator exposing predict()
  sentences: iterable of raw (not yet preprocessed) strings

  Return:
  a list with 'negative' or 'positive' for every sentence, in input order
  """
  # Numeric class -> human-readable name
  labels = {0:'negative', 1:'positive'}

  # New text must go through the same cleaning as the training data
  cleaned_sentences = []
  for sentence in sentences:
    cleaned_sentences.append(preprocess_text(sentence))

  predicted_classes = pipeline.predict(cleaned_sentences)
  return [labels[predicted] for predicted in predicted_classes]

In [None]:
# Candidate classifiers keyed by abbreviation. Not referenced by the other
# cells shown here: train_model builds its own instances (and uses
# n_estimators=50 for the Random Forest rather than 100).
model = {}
model['svc'] = SVC(C=100, kernel='linear')
model['rf'] = RandomForestClassifier(n_estimators=100, random_state=42)
model['nb'] = MultinomialNB()
model['lr'] = LogisticRegression(max_iter=500, C=1.0)

# Sentiment analysis with Logistic Regression

Putting everything together for logistic regression!

In [None]:
def main():
  """
  Run the full Logistic Regression workflow.

  Loads and preprocesses the IMDB training CSV, holds out 20% of it for
  testing, trains the 'lr' pipeline, prints its evaluation, and finally
  prints the predicted sentiment of a few hand-written example reviews.
  """
  # Hand-written reviews that are not part of the dataset
  example_sentences = [
      "This film was an absolute masterpiece from start to finish. The cinematography was breathtaking, capturing the essence of the story in every frame.",
      "Wow, what a ride! This action-packed thriller exceeded all my expectations.",
      "I was profoundly disappointed by this film. The plot was convoluted and nonsensical, jumping between storylines without any clear direction or purpose.",
      "This movie was a complete mess. The editing was jarring and confusing, making it hard to follow the narrative.",
      "Absolutely terrible. This film failed on almost every level.",
      "A heartwarming and charming independent film that truly captured my heart. The story was simple but told with such sincerity and warmth.",
      "A frustrating and ultimately pointless viewing experience. The film promised an intriguing premise but failed to deliver on any of its potential.",
      "Visually spectacular and incredibly imaginative! This sci-fi epic transported me to another world with its stunning visuals and intricate world-building.",
      "An unexpectedly brilliant comedy that had me laughing out loud from beginning to end. The script was incredibly clever and sharp, with hilarious gags and witty one-liners delivered perfectly by the talented cast.",
      "Utterly forgettable. This film blended into the background with its generic story, bland characters, and uninspired visuals.",
      "This sequel completely missed the mark. It failed to capture the magic of the original film, rehashing old plot points without adding anything new or interesting."
  ]

  reviews = load_and_preprocess_data('/content/drive/MyDrive/Coding_data_files/IMDB_sentiment/Train.csv')
  texts = reviews['preprocessed_text']
  targets = reviews['label']

  # Fixed random_state keeps the 80/20 split reproducible across runs
  train_texts, test_texts, train_targets, test_targets = train_test_split(
      texts, targets, test_size=.2, random_state=42)

  classifier = train_model('lr', train_texts, train_targets)
  evaluate_model(classifier, test_texts, test_targets)

  predicted_sentiments = predict_sentiment(classifier, example_sentences)
  for sentence, sentiment in zip(example_sentences, predicted_sentiments):
    print(f"Sentence: {sentence}")
    print(f"Predicted Sentiment: {sentiment}\n")


Result for the Logistic Regression model

In [None]:
# Run the Logistic Regression workflow when this code is executed as the main
# program (which is also the case when the cell is run in a notebook kernel).
if __name__ == "__main__":
    main()

Classification Report:
              precision    recall  f1-score   support

    positive       0.89      0.87      0.88      3966
    negative       0.88      0.89      0.89      4034

    accuracy                           0.88      8000
   macro avg       0.88      0.88      0.88      8000
weighted avg       0.88      0.88      0.88      8000

Confusion matrix:
[[3467  499]
 [ 426 3608]]
Sentence: This film was an absolute masterpiece from start to finish. The cinematography was breathtaking, capturing the essence of the story in every frame.
Predicted Sentiment: positive

Sentence: Wow, what a ride! This action-packed thriller exceeded all my expectations.
Predicted Sentiment: positive

Sentence: I was profoundly disappointed by this film. The plot was convoluted and nonsensical, jumping between storylines without any clear direction or purpose.
Predicted Sentiment: negative

Sentence: This movie was a complete mess. The editing was jarring and confusing, making it hard to follow 

We can see relatively good performance on both the held-out test set and the new sentences.

# Sentiment analysis with Random Forest classifier

Result for the Random Forest classifier model

In [None]:
def main_rf():
  """
  Run the full Random Forest workflow.

  Loads and preprocesses the IMDB training CSV, holds out 20% of it for
  testing, trains the 'rf' pipeline, prints its evaluation, and finally
  prints the predicted sentiment of a few hand-written example reviews.
  """
  # Hand-written reviews that are not part of the dataset
  example_sentences = [
      "This film was an absolute masterpiece from start to finish. The cinematography was breathtaking, capturing the essence of the story in every frame.",
      "Wow, what a ride! This action-packed thriller exceeded all my expectations.",
      "I was profoundly disappointed by this film. The plot was convoluted and nonsensical, jumping between storylines without any clear direction or purpose.",
      "This movie was a complete mess. The editing was jarring and confusing, making it hard to follow the narrative.",
      "Absolutely terrible. This film failed on almost every level.",
      "A heartwarming and charming independent film that truly captured my heart. The story was simple but told with such sincerity and warmth.",
      "A frustrating and ultimately pointless viewing experience. The film promised an intriguing premise but failed to deliver on any of its potential.",
      "Visually spectacular and incredibly imaginative! This sci-fi epic transported me to another world with its stunning visuals and intricate world-building.",
      "An unexpectedly brilliant comedy that had me laughing out loud from beginning to end. The script was incredibly clever and sharp, with hilarious gags and witty one-liners delivered perfectly by the talented cast.",
      "Utterly forgettable. This film blended into the background with its generic story, bland characters, and uninspired visuals.",
      "This sequel completely missed the mark. It failed to capture the magic of the original film, rehashing old plot points without adding anything new or interesting."
  ]

  reviews = load_and_preprocess_data('/content/drive/MyDrive/Coding_data_files/IMDB_sentiment/Train.csv')
  texts = reviews['preprocessed_text']
  targets = reviews['label']

  # Fixed random_state keeps the 80/20 split reproducible across runs
  train_texts, test_texts, train_targets, test_targets = train_test_split(
      texts, targets, test_size=.2, random_state=42)

  classifier = train_model('rf', train_texts, train_targets)
  evaluate_model(classifier, test_texts, test_targets)

  predicted_sentiments = predict_sentiment(classifier, example_sentences)
  for sentence, sentiment in zip(example_sentences, predicted_sentiments):
    print(f"Sentence: {sentence}")
    print(f"Predicted Sentiment: {sentiment}\n")

In [None]:
# Execute the Random Forest train/evaluate/predict workflow defined above.
main_rf()

Classification Report:
              precision    recall  f1-score   support

    positive       0.83      0.85      0.84      3966
    negative       0.85      0.83      0.84      4034

    accuracy                           0.84      8000
   macro avg       0.84      0.84      0.84      8000
weighted avg       0.84      0.84      0.84      8000

Confusion matrix:
[[3374  592]
 [ 701 3333]]
Sentence: This film was an absolute masterpiece from start to finish. The cinematography was breathtaking, capturing the essence of the story in every frame.
Predicted Sentiment: positive

Sentence: Wow, what a ride! This action-packed thriller exceeded all my expectations.
Predicted Sentiment: positive

Sentence: I was profoundly disappointed by this film. The plot was convoluted and nonsensical, jumping between storylines without any clear direction or purpose.
Predicted Sentiment: negative

Sentence: This movie was a complete mess. The editing was jarring and confusing, making it hard to follow 

# Sentiment analysis with Naive Bayes classifier

In [None]:
def main_nb():
  """
  Run the full Naive Bayes workflow.

  Loads and preprocesses the IMDB training CSV, holds out 20% of it for
  testing, trains the 'nb' pipeline, prints its evaluation, and finally
  prints the predicted sentiment of a few hand-written example reviews.
  """
  # Hand-written reviews that are not part of the dataset
  example_sentences = [
      "This film was an absolute masterpiece from start to finish. The cinematography was breathtaking, capturing the essence of the story in every frame.",
      "Wow, what a ride! This action-packed thriller exceeded all my expectations.",
      "I was profoundly disappointed by this film. The plot was convoluted and nonsensical, jumping between storylines without any clear direction or purpose.",
      "This movie was a complete mess. The editing was jarring and confusing, making it hard to follow the narrative.",
      "Absolutely terrible. This film failed on almost every level.",
      "A heartwarming and charming independent film that truly captured my heart. The story was simple but told with such sincerity and warmth.",
      "A frustrating and ultimately pointless viewing experience. The film promised an intriguing premise but failed to deliver on any of its potential.",
      "Visually spectacular and incredibly imaginative! This sci-fi epic transported me to another world with its stunning visuals and intricate world-building.",
      "An unexpectedly brilliant comedy that had me laughing out loud from beginning to end. The script was incredibly clever and sharp, with hilarious gags and witty one-liners delivered perfectly by the talented cast.",
      "Utterly forgettable. This film blended into the background with its generic story, bland characters, and uninspired visuals.",
      "This sequel completely missed the mark. It failed to capture the magic of the original film, rehashing old plot points without adding anything new or interesting."
  ]

  reviews = load_and_preprocess_data('/content/drive/MyDrive/Coding_data_files/IMDB_sentiment/Train.csv')
  texts = reviews['preprocessed_text']
  targets = reviews['label']

  # Fixed random_state keeps the 80/20 split reproducible across runs
  train_texts, test_texts, train_targets, test_targets = train_test_split(
      texts, targets, test_size=.2, random_state=42)

  classifier = train_model('nb', train_texts, train_targets)
  evaluate_model(classifier, test_texts, test_targets)

  predicted_sentiments = predict_sentiment(classifier, example_sentences)
  for sentence, sentiment in zip(example_sentences, predicted_sentiments):
    print(f"Sentence: {sentence}")
    print(f"Predicted Sentiment: {sentiment}\n")

In [None]:
# Execute the Naive Bayes train/evaluate/predict workflow defined above.
main_nb()

Classification Report:
              precision    recall  f1-score   support

    positive       0.86      0.84      0.85      3966
    negative       0.85      0.86      0.85      4034

    accuracy                           0.85      8000
   macro avg       0.85      0.85      0.85      8000
weighted avg       0.85      0.85      0.85      8000

Confusion matrix:
[[3329  637]
 [ 550 3484]]
Sentence: This film was an absolute masterpiece from start to finish. The cinematography was breathtaking, capturing the essence of the story in every frame.
Predicted Sentiment: positive

Sentence: Wow, what a ride! This action-packed thriller exceeded all my expectations.
Predicted Sentiment: positive

Sentence: I was profoundly disappointed by this film. The plot was convoluted and nonsensical, jumping between storylines without any clear direction or purpose.
Predicted Sentiment: negative

Sentence: This movie was a complete mess. The editing was jarring and confusing, making it hard to follow 

Overall, the results were relatively good, and they could be improved further through hyperparameter tuning.