## Rotten Tomatoes Sentiment Analysis
- Beatriz Correia Paulino 
- Luís Pereira

In [1]:
# Necessary Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Baseline Model
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import stanza

# Pre Processing
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import ne_chunk
from nltk.tree import Tree
from nltk import WordNetLemmatizer
import spacy
from nltk.corpus import stopwords
import regex as re
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords')

# Model Evaluation
from sklearn.metrics import classification_report

# Machine Learning
from collections import Counter #bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import ast # to maintain lists stored in dfs
from ast import literal_eval

# Transformers
from tqdm import tqdm # progression bar
from transformers import pipeline
import datasets
from datasets import Dataset, load_metric
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer


# Generative Models
import openai


# Outros
from scipy.sparse import hstack


  from .autonotebook import tqdm as notebook_tqdm


### Loading Data

In [None]:
# Load the data
df_test = pd.read_csv('rotten_tomatoes_test.tsv', sep='\t', header=None)
df_train = pd.read_csv('rotten_tomatoes_train.tsv', sep='\t', header=None)

In [None]:
# First look into the train dataset
df_train

In [None]:
# First look into the test dataset
df_test

In [None]:
df_train.info()

In [None]:
df_test.info()

### Data Pre-Processing
- Change column names
- New Column: 'sentiment_numeric'

In [None]:
# Change column names
columns = ['id', 'sentiment', 'review']
df_train.columns = columns
df_test.columns = columns

In [None]:
# Check the changes
df_train.info()

In [None]:
# Mapping sentiment labels from positive, negative, neutral to 1, -1, 0

# Function to map sentiments to numeric values
def map_sentiment_to_numeric(sentiment):
    if sentiment == 'negative':
        return -1
    elif sentiment == 'neutral':
        return 0
    elif sentiment == 'positive':
        return 1

# Applying the mapping function to the 'sentiment' column
df_train['sentiment_numeric'] = df_train['sentiment'].apply(map_sentiment_to_numeric)
df_test['sentiment_numeric'] = df_test['sentiment'].apply(map_sentiment_to_numeric)

In [None]:
# Check the changes
df_train.head()

In [None]:
# Check how many of each label there is 
positive_count_test = df_test[df_test['sentiment'] == 'positive'].shape[0]
negative_count_test = df_test[df_test['sentiment'] == 'negative'].shape[0]
neutral_count_test = df_test[df_test['sentiment'] == 'neutral'].shape[0]

positive_count_train = df_train[df_train['sentiment'] == 'positive'].shape[0]
negative_count_train = df_train[df_train['sentiment'] == 'negative'].shape[0]
neutral_count_train = df_train[df_train['sentiment'] == 'neutral'].shape[0]

print("Test Data - Positive:", positive_count_test, "Negative:", negative_count_test, "Neutral:", neutral_count_test)
print("Train Data - Positive:", positive_count_train, "Negative:", negative_count_train, "Neutral:", neutral_count_train)

## Baseline Model
- VaderSentiment
- Stanza

### VaderSentiment Baseline Model
- Compound Score >= 0.05 -> Positive
- Compound Score <= -0.05 -> Negative
- Compund Score ]-.05, 0.05[ -> Neutral

In [None]:
# Initialize the VADER sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()

In [None]:
# Function to convert VADER compound score to -1, 0, 1
def sentiment_score_to_label(compound_score):
    if compound_score <= -0.05:
        return -1  # Negative
    elif compound_score >= 0.05:
        return 1   # Positive
    else:
        return 0   # Neutral

In [None]:
# Applying VADER to predict sentiment scores and convert to labels
df_test['vader_predicted_sentiment'] = df_test.iloc[:, 2].apply(lambda x: sentiment_score_to_label(sia.polarity_scores(x)['compound']))
df_test['vader_compound_score'] = df_test.iloc[:, 2].apply(lambda x: sia.polarity_scores(x)['compound'])

In [None]:
# Check the new columns
df_test.head()

In [None]:
# Evaluate the VADER model
print(classification_report(df_test['sentiment_numeric'], df_test['vader_predicted_sentiment']))

### Stanza Approach

Stanza Models label sentiments with 0,1,2 which are changed by us to the follow:
- Negative (0) -> -1
- Neutral (1) -> 0
- Positive (2) -> 1

In [None]:
# This downloads the English models for the neural pipeline
#stanza.download('en')

In [None]:
# Initialize the pipeline
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment')

# Function to predict sentiment using Stanza
def predict_sentiment_stanza(text):
    doc = nlp(text)
    if len(doc.sentences) > 0:  # Check if at least one sentence was found
        return doc.sentences[0].sentiment
    else:
        return None  # Return None or a default sentiqment if no sentences are detected


# Apply the stanza function to the test dataset
df_test['stanza_predicted_sentiment'] = df_test.iloc[:, 2].apply(predict_sentiment_stanza)

In [None]:
df_test.head()

In [None]:
# Convert Stanza sentiment numeric labels to -1, 0, 1
def stanza_sentiment_to_valder(stanza_sentiment):
    if stanza_sentiment == 2:
        return 1    # Positive
    elif stanza_sentiment == 0:
        return -1   # Negative
    else:
        return 0    #Neutral

# Update stanza predicted sentiment colum with converted values
df_test['stanza_predicted_sentiment'] = df_test['stanza_predicted_sentiment'].apply(stanza_sentiment_to_valder)

In [None]:
# Evaluate the STANZA model
print(classification_report(df_test['sentiment_numeric'], df_test['stanza_predicted_sentiment']))

## Reviews Pre-Processing
Functions for pre-processing techniques (by order of appearance):
- Lowercasing: lowercase_text()
- Named Entity Recognition: ner_text()
- Stemming: stem_text()
- POS Tagging: pos_tag_text()
- Expand Contraction: preprocess_text()
- Tokenize: tokenize_exception, tokenize_text()
- Stopwords: remove_stopword()
- Lemmatization: lemmatize_text()
- Negation Handling: handle_negations()

In [None]:
# Download en_core_web_sm
#!python -m spacy download en_core_web_sm

In [None]:
# Start Pipeline
nlp_spacy = spacy.load('en_core_web_sm')
nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize, mwt, pos, lemma, ner')

In [None]:
# Function for lowercasing
def lowercase_text(text):
    # Process the text with SpaCy
    doc = nlp_spacy(text)
    # Lowercase the text except for words that are fully uppercased and longer than 1 character
    return ' '.join([token.text.lower() if not token.text.isupper() and len(token.text) > 1 else token.text for token in doc])

In [None]:
# Function for Named Entity Recognition
def ner(text):
    # Run the NER pipeline on the text with stanza
    doc = nlp_stanza(text)
    # Extract entities from the document
    entities = [(ent.text, ent.type) for sent in doc.sentences for ent in sent.ents]
    return entities

In [None]:
# Function that performs stemming on the text
def stem_text(text_or_words):
    ps = PorterStemmer()
    # Check if the input is a string and tokenize, otherwise use directly if it's a list
    words = word_tokenize(text_or_words) if isinstance(text_or_words, str) else text_or_words
    stemmed_words = [ps.stem(word) for word in words]
    return ' '.join(stemmed_words)

In [None]:
# Function that performs POS tagging using SpaCy, Stanza or NLTK
def pos_tag_text(text, library):
    assert library in ['spacy', 'stanza', 'nltk'], 'Library should be one of the following: spacy, stanza, or nltk'
    if library == 'spacy':
        doc = nlp_spacy(text)
        pos_tag_list = [(token.text, token.pos_) for token in doc]
        return pos_tag_list

    elif library == 'stanza':
        doc = nlp_stanza(text)
        pos_tag_list = [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
        return pos_tag_list

    elif library == 'nltk':
        pos_tag_list = nltk.pos_tag(word_tokenize(text))
        return pos_tag_list

    else:
        return 'no'


In [None]:
# Function that adjusts the placement of "'s" using regex by removing unnecessary spaces before it.
def preprocess_text(text):
    # Define contraction patterns with correct groupings
    regex_patterns = {
        "possessive": r"(\b\w+)\s+('s)\b",  # For 's
        "not": r"(\b\w+)\s+(n't)\b",        # For n't
        "are": r"(\b\w+)\s+('re)\b",        # For 're
        "have": r"(\b\w+)\s+('ve)\b",       # For 've
        "will": r"(\b\w+)\s+('ll)\b",       # For 'll
        "modal": r"(\b\w+)\s+('d)\b",       # For 'd
        "am": r"(\b\w+)\s+('m)\b",          # For 'm
        "them": r"(\b\w+)\s+('em)\b",       # For 'em
        "old": r"(\b\w+)\s+('ol)\b"         # For 'ol
    }

    # Apply contraction fixes
    for key, pattern in regex_patterns.items():
        text = re.sub(pattern, r"\1\2", text)  # Using \1\2 now for correct replacement

    return text

In [None]:
# Function that expands contractions using regex so the tokenizations doesnt split those words.

def tokenize_exception(pos_list, token_list):
    # Pre-compile regular expressions
    regex_patterns = {
        "possessive": re.compile(r"'s$"),
        "not": re.compile(r"n't$"),
        "are": re.compile(r"'re$"),
        "have": re.compile(r"'ve$"),
        "will": re.compile(r"'ll$"),
        "modal": re.compile(r"'d$"),
        "am": re.compile(r"'m$"),
        "them": re.compile(r"'em$"),
        "old": re.compile(r"'ol$")
    }

    # Iterate over token_list to find and adjust tokens
    for i in range(len(token_list)):
        for key, pattern in regex_patterns.items():
            if pattern.search(token_list[i]):
                if key == "possessive":
                    if i + 1 < len(pos_list) and pos_list[i + 1][1] in ['JJ', 'IN', 'VBG', 'RB', 'RBR', 'RBS', 'VBN', 'DT', 'PART']:
                        token_list[i] = pattern.sub("is", token_list[i])
                    else:
                        token_list[i] = pattern.sub("", token_list[i])
                elif key == "modal":
                    if i + 1 < len(pos_list):
                        if 'VB' in pos_list[i + 1][1]:
                            token_list[i] = pattern.sub("would", token_list[i])
                        elif 'VBN' in pos_list[i + 1][1]:
                            token_list[i] = pattern.sub("had", token_list[i])
                else:
                    replacement = {"not": "not", "are": "are", "have": "have", "will": "will", "am": "am", "them": "them", "old": "old"}
                    token_list[i] = pattern.sub(replacement[key], token_list[i])
    return token_list



# Function that performs Tokenization using SpaCy, Stanza or NLTK
def tokenize_text(text, library, pos_list):
    assert library in ['spacy', 'stanza', 'nltk']
    if library == 'spacy':
        doc = nlp_spacy(text)
        token_list = tokenize_exception(pos_list, [token.text for token in doc])

        return token_list

    elif library == 'stanza':
        doc = nlp_stanza(text)
        token_list = tokenize_exception(pos_list, [word.text for sent in doc.sentences for word in sent.words])
        return token_list

    elif library == 'nltk':
        token_list = tokenize_exception(pos_list, word_tokenize(text))
        return token_list

    else:
        return text

In [None]:
# Function that removes stopwords using SpaCy, Stanza or NLTK
def remove_stopwords(tokens, library):
    assert library in ['spacy', 'stanza', 'nltk'], 'Library should be one of the following: spacy, stanza, or nltk'

    # Remove stopwords using spaCy
    if library == 'spacy':
        # Presuming 'nlp_spacy' has been previously initialized with a model and its stopwords loaded
        filtered_tokens = [token for token in tokens if token not in nlp_spacy.Defaults.stop_words]
        return filtered_tokens

    # Remove stopwords using Stanza
    elif library == 'stanza':
        # Presuming 'nlp_stanza' has been previously initialized and its stopwords are accessible
        filtered_tokens = [token for token in tokens if token not in nlp_stanza.Defaults.stop_words]
        return filtered_tokens

    # Remove stopwords using NLTK
    elif library == 'nltk':
        stopwords = nltk.corpus.stopwords.words('english')
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
        return filtered_tokens

    else:
        return tokens

In [None]:
# Function for lemmatization using SpaCy, Stanza or NLTK
def lemmatize_text(tokens, library):
    assert library in ['spacy', 'stanza', 'nltk'], 'Library should be one of the following: spacy, stanza, or nltk'

    # Lemmatization using spaCy
    if library == 'spacy':
        tokens = nlp_spacy(tokens)
        lemmatized_tokens = [token.lemma_ for token in tokens]
        return lemmatized_tokens

    # Lemmatization using Stanza
    elif library == 'stanza':
        tokens = nlp_stanza(tokens)
        lemmatized_tokens = [word.lemma for sent in tokens.sentences for word in sent.words]
        return lemmatized_tokens

    # Lemmatization using NLTK
    elif library == 'nltk':
        tokens = word_tokenize(tokens)
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
        return lemmatized_tokens

In [None]:
# Function that handles negations by adding NOT_ as a prefix to the word that follows a negation word
def handle_negations(tokens):
    negation_words = ["not", "no", "never", "none", "nothing", "neither", "nowhere", "no one",
                 "nobody", "don’t", "doesn’t", "isn’t", "aren’t", "wasn’t", "weren’t",
                 "can’t", "couldn’t", "shouldn’t", "won’t", "wouldn’t", "hasn't", "haven't",
                 "hadn't", "doesn't", "don't", "did not", "cannot", "will not", "shall not",
                 "should not", "would not", "could not", "might not", "must not", "is not",
                 "are not", "was not", "were not", "have not", "has not", "had not", "fail to",
                 "not only... but also", "neither... nor", "never a", "none of"]
    processed_tokens = []
    punctuation_marks = ['.', '?', '!', ':', ';', ',']
    negation = False

    for word in tokens:
        # If the word is a punctuation, reset the negation flag
        if word in punctuation_marks:
            negation = False
            processed_tokens.append(word)  # Keep punctuation
        elif negation:
            processed_tokens.append("NOT_" + word)
        else:
            processed_tokens.append(word)

        # Check if the current word is a negation word
        if word.lower() in negation_words or any(neg in word.lower() for neg in negation_words):
            negation = True

    return processed_tokens

### Pre-processing application to the dataframes

In [None]:
# Applying NER to both dataframes
df_test['review_entities'] = df_test['review'].apply(lambda x: ner(x))
df_train['review_entities'] = df_train['review'].apply(lambda x: ner(x))

In [None]:
# Check the changes in test dataframe
df_test.head()

In [None]:
# Check the changes in train dataframe
df_train.head()

In [None]:
# Case Folding the reviews
df_test['review_lc'] = df_test['review'].apply(lambda x: lowercase_text(x))
df_train['review_lc'] = df_train['review'].apply(lambda x: lowercase_text(x))

In [None]:
# Check the changes in test dataframe
df_test.head()

In [None]:
# Apply the preprocess of contractions
df_test['review_pp'] = df_test['review_lc'].apply(lambda x : preprocess_text(x))

df_train['review_pp'] = df_train['review_lc'].apply(lambda x : preprocess_text(x))


In [None]:
# Check the changes in test dataframe
df_test.head()

In [None]:
df_train.head()

In [None]:
# Applying POS_tagging to the reviews using spacy
df_test['tags_spacy'] = df_test['review_pp'].apply(lambda x: pos_tag_text(x, 'spacy'))
df_train['tags_spacy'] = df_train['review_pp'].apply(lambda x: pos_tag_text(x, 'spacy'))

In [None]:
# Check the changes in test dataframe
df_test.head()

In [None]:
# Tokenization of the reviews using spacy
df_test['review_tokenized'] = df_test.apply(lambda x: tokenize_text(x['review_pp'], 'spacy', x['tags_spacy']), axis=1)
df_train['review_tokenized'] = df_train.apply(lambda x: tokenize_text(x['review_pp'], 'spacy', x['tags_spacy']), axis=1)

In [None]:
# Check the changes in test dataframe
df_test.head()

In [None]:
# Lemmatization of reviews using spacy
df_test['review_lemmatized'] = df_test['review_tokenized'].apply(lambda x: lemmatize_text(" ".join(x), 'spacy'))
df_train['review_lemmatized'] = df_train['review_tokenized'].apply(lambda x: lemmatize_text(" ".join(x), 'spacy'))

In [None]:
# Check the changes in test dataframe
df_test.head()

In [None]:
# Negation Handling 
df_test['negations_review'] = df_test['review_lemmatized'].apply(lambda x: handle_negations(x))
df_train['negations_review'] = df_train['review_lemmatized'].apply(lambda x: handle_negations(x))

#df_test['review_test'] = df_test['review_test'].apply(lambda x: remove_stopwords(x, 'spacy'))

In [None]:
# Check the changes in test dataframe
df_test

In [None]:
# Check if negations are correctly applied using row 1724
df_test['negations_review'].iloc[1724]

In [None]:
# Check if tags are correctly identified using row 4
df_test["tags_spacy"].iloc[4]

#### Given the computational capacity needed it's advised to use google collab to run the pre-processing fase.
- The following code extracts the transformed dataframes from google collab into the Drive.

In [None]:
# Import drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Update these paths to match the exact location of your files in Drive
test_file_path_download = '/content/drive/My Drive/df_test.csv'
train_file_path_donwload = '/content/drive/My Drive/df_train.csv'

In [None]:
df_test.to_csv(test_file_path_download, index=False)
df_train.to_csv(train_file_path_donwload, index=False)

#### Loading the transformed data - 2 options
- From google collab
- From file location

In [None]:
# Update these paths to match the exact location of your files in Drive
dftest_file_path = '/content/drive/My Drive/df_test.csv'
dftrain_file_path= '/content/drive/My Drive/df_train.csv'

# Load the data
df_test = pd.read_csv(dftest_file_path, sep=',', converters={
        'review_entities': literal_eval,
        'tags_spacy': literal_eval,
        'review_tokenized': literal_eval,
        'review_lemmatized': literal_eval,
        'negations_review': literal_eval
    })
df_train = pd.read_csv(dftrain_file_path, sep=',',converters={
        'review_entities': literal_eval,
        'tags_spacy': literal_eval,
        'review_tokenized': literal_eval,
        'review_lemmatized': literal_eval,
        'negations_review': literal_eval
    })

In [None]:
# Load locally
df_test = pd.read_csv(
    'df_test.csv',
    converters={
        'review_entities': literal_eval,
        'tags_spacy': literal_eval,
        'review_tokenized': literal_eval,
        'review_lemmatized': literal_eval,
        'negations_review': literal_eval
    }
)

df_train = pd.read_csv(
    'df_train.csv',
    converters={
        'review_entities': literal_eval,
        'tags_spacy': literal_eval,
        'review_tokenized': literal_eval,
        'review_lemmatized': literal_eval,
        'negations_review': literal_eval
    }
)

## Application of a Lexicon (EmoLex) for sentiment classification

In [None]:
# Load the lexicon via url
lexname="https://raw.githubusercontent.com/fmmb/Text-Mining/main/data/NRC-lexicon.csv"

# Load the lexicon via file 
#lexname = "NRC-lexicon.csv"

lexicon = pd.read_csv(lexname, encoding="utf-8", index_col=["English"])

In [None]:
# Check the lexicon
print(f"Lexicon Size: {len(lexicon)} words")
display(lexicon.sample(5))

In [None]:
# Convert to dictionary for eficiency
lex = (lexicon['Positive']-lexicon['Negative']).to_dict()

In [None]:
# Function to classify sentiment based on a given lexicon
def lex_sentiment(review, lexicon):
    score = 0
    words = review.split() if isinstance(review, str) else review
    for word in words:
        score += lexicon.get(word, 0)
        # print(word, lexicon.get(word, 0))
    #print("Sum:", score)

    # Classify sentiment based on the sentiment score
    if score >= 0.5:
        return 1   # Positive sentiment
    elif score <= -0.5:
        return -1   # Negative sentiment
    else:
        return 0  # Neutral sentiment


## Experiments with the lexicon
1. Without any Pre-Processing
2. Lowercasing and handling contractions
3. With Tokezination and POS tagging (SpaCy, NLTK, Stanza)
4. With Stop word removal (SpaCy, NLTK, Stanza)
5. With Lemmatization (SpaCy, NLTK, Stanza)
6. With Negation Handling

### 1. Without any Pre-Processing

In [None]:
# Create dataframe with results from the experiments with the lexicon
lex_test = pd.DataFrame()

In [None]:
# Apply lexicon to reviews without any preprocessing
lex_test['No_Processing'] = df_test['review'].apply(lambda x: lex_sentiment(x, lex))

In [None]:
lex_test['No_Processing'].value_counts()

In [None]:
# Evaluation of the experiment
print(classification_report(df_test['sentiment_numeric'], lex_test['No_Processing']))

### 2. Lowercasing and handling contractions

In [None]:
# Applying the lexicon to the normalized reviews
lex_test['With_Normalization_sentiment'] = df_test["review_pp"].apply(lambda x: lex_sentiment(x, lex))

In [None]:
# Evaluation of the experiment
print(classification_report(df_test['sentiment_numeric'], lex_test['With_Normalization_sentiment']))

### 3. With Tokezination and POS tagging (SpaCy, NLTK, Stanza)

In [None]:
# Applying the lexicon to the tokenized reviews
lex_test['With_Tokenization_sentiment'] = df_test["review_tokenized"].apply(lambda x: lex_sentiment(x, lex))

In [None]:
# Evaluation of the experiment
print(classification_report(df_test['sentiment_numeric'], lex_test['With_Tokenization_sentiment']))

### 4. With Stop word removal (SpaCy, NLTK, Stanza)

In [None]:
# Removing stop words 
df_test['review_tokenized_no_stopwords'] = df_test['review_tokenized'].apply(lambda x: remove_stopwords(x, 'nltk'))

In [None]:
# Applying the lexicon to the tokenized reviews without stop words
lex_test['Without_stopwords_sentiment'] = df_test['review_tokenized_no_stopwords'].apply(lambda x: lex_sentiment(x, lex))

In [None]:
print(classification_report(df_test['sentiment_numeric'], lex_test['Without_stopwords_sentiment']))

### 5. With Lemmatization (SpaCy, NLTK, Stanza)

In [None]:
# Applying the lexicon to the lematized reviews
lex_test['With_Lemma_sentiment'] = df_test['review_lemmatized'].apply(lambda x: lex_sentiment(x, lex))

In [None]:
# Evaluation of the experiment
print(classification_report(df_test['sentiment_numeric'], lex_test['With_Lemma_sentiment']))

### 6. With Negation Handling

In [None]:
# Define the classify_sentiment function to handle negations alterations
def classify_sentiment_withneg(words):
    score = 0
    for w in words:
        score += lex.get(w, 0) * (1.5 if w.startswith("NOT_") else 1)
    if score >= 1:
        return 1
    elif -1 < score < 1:
        return 0
    elif score <= -1:
        return -1

In [None]:
# Applying the lexicon negation adapted to the negation handled reviews
lex_test['With_Negation_sentiment'] = df_test['negations_review'].apply(lambda x: classify_sentiment_withneg(x))

In [None]:
# Evaluation of experiement
print(classification_report(df_test['sentiment_numeric'], lex_test['With_Negation_sentiment']))

In [None]:
lex_test

## Machine Learning

### Feature Extraction - Bag of words

In [None]:
train_features = pd.DataFrame()
test_features = pd.DataFrame()

In [None]:
# Function to create the bag of words using Counter()
def bag_of_words(text):
    return Counter(text)

In [None]:
df_train['review_lemmatized'].iloc[0]

In [None]:
# Application of bag of words function do the reviews
df_test['bag_ow'] = df_test['review_lemmatized'].apply(lambda x: bag_of_words(x))
df_train['bag_ow'] = df_train['review_lemmatized'].apply(lambda x: bag_of_words(x))

In [None]:
df_train.head()

In [None]:
X_train = df_train['review_lemmatized'].apply(lambda x: ' '.join(x))
X_test = df_test['review_lemmatized'].apply(lambda x: ' '.join(x))
y_train = df_train['sentiment_numeric']
y_test = df_test['sentiment_numeric']

# Append the features (X_train, y_train) to the features df
train_features['bow'] = X_train
test_features['bow'] = X_test


#### Logistic Regression with Bag oF words

In [None]:
# Create a pipeline with CountVectorizer and LogisticRegression
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression())
])

# Fit the pipeline on training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

#### Naive Bayes with bag of words

In [None]:
# Pipeline to perform Multinomial Naive Bayes
pipeline_nb = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

# Fit the pipeline on training data
pipeline_nb.fit(X_train, y_train)

# Predict on the test data
y_pred_nb = pipeline_nb.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred_nb))

#### SVM with bag of words

In [None]:
# Pipeline to perform SVM
pipeline_svm = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', SVC())
])

# Fit the pipeline on your training data
pipeline_svm.fit(X_train, y_train)

# Predict on the test data
y_pred_svm = pipeline_svm.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred_svm))

### Feature Extraction - Term Frequency-Inverse Document Frequency (TF-IDF)

In [None]:
# Remove stopwords
df_test['review_lemmatized_no_sw'] = df_test['review_lemmatized'].apply(lambda x: remove_stopwords(x, 'nltk'))
df_train['review_lemmatized_no_sw'] = df_train['review_lemmatized'].apply(lambda x: remove_stopwords(x, 'nltk'))

In [None]:
# Convert each entry to a string if they are not already
df_train['review_lemmatized_no_sw'] = df_train['review_lemmatized_no_sw'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
df_test['review_lemmatized_no_sw'] = df_test['review_lemmatized_no_sw'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

In [None]:
df_test['review_lemmatized_no_sw']

In [None]:
X_train = df_train['review_lemmatized_no_sw']
y_train = df_train['sentiment_numeric']
X_test = df_test['review_lemmatized_no_sw']
y_test = df_test['sentiment_numeric']

# Append the features (X_train, y_train) to the features df
train_features['tf_idf'] = X_train
test_features['tf_idf'] = X_test

#### Logistic Regression with TF-IDF

In [None]:
# Pipeline to perform Logistic Regression
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

#### Naive Bayes with TF-IDF

In [None]:
# Pipeline to perform Multinomial Naive Bayes
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

#### SVM with TF-IDF

In [None]:
# Pipeline to perform SVM
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', SVC())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

### Feature Extraction - Word Embeddings

In [None]:
# Function that loads glove embeddings
def load_glove_embeddings(path):
    embeddings_index = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

In [None]:
# Load glove for word embeddings
glove_path = 'glove.6B.100d.txt'
embeddings_index = load_glove_embeddings(glove_path)

In [None]:
def tokens_to_avg_vectors(word_list, embeddings_index, embedding_dim=100):
    embeddings = [embeddings_index.get(word) for word in word_list if word in embeddings_index]
    
    if not embeddings:
        # If no words in the list have embeddings, return a zero vector
        return np.zeros(embedding_dim)
    
    embeddings = np.array(embeddings)
    # Calculate the mean vector for the document and return it
    return np.mean(embeddings, axis=0)


In [None]:
# Convert the Series of lists into a 2D NumPy array
X_train = np.vstack(df_train['review_lemmatized'].apply(lambda x: tokens_to_avg_vectors(x, embeddings_index)))
X_test = np.vstack(df_test['review_lemmatized'].apply(lambda x: tokens_to_avg_vectors(x, embeddings_index)))

In [None]:
X_train

#### Logistic Regression with Word Embeddings

In [None]:
# Pipeline to perform Logistic Regression with Word Embeddings
pipeline = Pipeline([
    ('classifier', LogisticRegression())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

Note: Word Embeddings with Multinomial NB is not possible since this classifier doesn't accept negative values.

#### SVM with Word Embeddings

In [None]:
# Pipeline to perform SVM with Word Embeddings
pipeline = Pipeline([
    ('classifier', SVC())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

### Feature Extraction - POS Tagging

In [None]:
df_test.head()

In [None]:
# Convert list of tuples to a flat list where each word is combined with its POS tag
def flatten_pos_tags(pos_list):
    return ' '.join([f'{word}_{pos}' for word, pos in pos_list])

# Apply the conversion on your DataFrame
X_train = df_train['tags_spacy'].apply(flatten_pos_tags)
X_test = df_test['tags_spacy'].apply(flatten_pos_tags)
y_train = df_train['sentiment_numeric']
y_test = df_test['sentiment_numeric']

# Append the features (X_train, y_train) to the features df
train_features['pos_tags'] = X_train
test_features['pos_tags'] = X_test


#### Logistic Regression with POS tagging

In [None]:
# Pipeline to perform Logistic Regression with POS-Tagging as feature
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

#### Naive Bayes with POS Tagging

In [None]:
# Pipeline to perform Multinomial Naive Bayes with Pos tagging as feature
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

#### SVM with POS Tagging

In [None]:
# Pipeline to perform SVM with POS-Tagging as feature
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', SVC())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

#### Tests With Multiple Features

In [None]:
train_features.head()

In [None]:
test_features.head()

In [None]:
# Define the transformations
transformations = ColumnTransformer([
    ('bow_vectorizer', CountVectorizer(), 'bow'),         # Apply CountVectorizer to the 'bow' column
    ('tfidf_vectorizer', TfidfVectorizer(), 'tf_idf'),    # Apply TfidfVectorizer to the 'tf_idf' column
    ('pos_vectorizer', CountVectorizer(), 'pos_tags')     # Apply CountVectorizer to the 'pos_tags' column
])

In [None]:
# Define the transformations
transformations2 = ColumnTransformer([
    ('bow_vectorizer', CountVectorizer(), 'bow'),        
    ('tfidf_vectorizer', TfidfVectorizer(), 'tf_idf')
])

##### SVM with POS tagging, Bag of words and TF-IDF

In [None]:
# Create the pipeline
pipeline_multi = Pipeline([
    ('transformations', transformations),
    ('classifier', SVC())
])

# Fit the pipeline on the training data
pipeline_multi.fit(train_features[['bow', 'tf_idf', 'pos_tags']], df_train['sentiment_numeric'])

# Predict on the test data
y_pred = pipeline_multi.predict(test_features[['bow', 'tf_idf', 'pos_tags']])

# Evaluate the model using the true labels from the test set
print(classification_report(df_test['sentiment_numeric'], y_pred))

##### Naive Bayes with POS tagging, Bag of words and TF-IDF

In [None]:
# Create the pipeline
pipeline_multi = Pipeline([
    ('transformations', transformations),
    ('classifier', MultinomialNB())
])
# Fit the pipeline on the training data
pipeline_multi.fit(train_features[['bow', 'tf_idf', 'pos_tags']], df_train['sentiment_numeric'])

# Predict on the test data
y_pred = pipeline_multi.predict(test_features[['bow', 'tf_idf', 'pos_tags']])

# Evaluate the model
print(classification_report(df_test['sentiment_numeric'], y_pred))

##### Logistic Regression with POS tagging, Bag of words and TF-IDF

In [None]:
# Create the pipeline
pipeline_multi = Pipeline([
    ('transformations', transformations),
    ('classifier', LogisticRegression())
])
# Fit the pipeline on the training data
pipeline_multi.fit(train_features[['bow', 'tf_idf', 'pos_tags']], df_train['sentiment_numeric'])

# Predict on the test data
y_pred = pipeline_multi.predict(test_features[['bow', 'tf_idf', 'pos_tags']])

# Evaluate the model using the labels from the test set
print(classification_report(df_test['sentiment_numeric'], y_pred))

### Machine Learning with Negation

In [None]:
df_train['negations_review'].iloc[0]

In [None]:
# Application of bag of words function do the reviews
df_test['bag_ow_not'] = df_test['negations_review'].apply(lambda x: bag_of_words(x))
df_train['bag_ow_not'] = df_train['negations_review'].apply(lambda x: bag_of_words(x))

In [None]:
df_train.head()

In [None]:
X_train = df_train['negations_review'].apply(lambda x: ' '.join(x))
X_test = df_test['negations_review'].apply(lambda x: ' '.join(x))
y_train = df_train['sentiment_numeric']
y_test = df_test['sentiment_numeric']

# Append the features (X_train, y_train) to the features df
train_features['bow_neg'] = X_train
test_features['bow_neg'] = X_test


#### Logistic Regression with Bag of words and negations

In [None]:
# Create a pipeline with CountVectorizer and LogisticRegression
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression())
])

# Fit the pipeline on training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

#### Naive Bayes with bag of words and negation

In [None]:
# Pipeline to perform Multinomial Naive Bayes
pipeline_nb = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

# Fit the pipeline on training data
pipeline_nb.fit(X_train, y_train)

# Predict on the test data
y_pred_nb = pipeline_nb.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred_nb))

#### SVM with bag of words and negation

In [None]:
# Pipeline to perform SVM
pipeline_svm = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', SVC())
])

# Fit the pipeline on your training data
pipeline_svm.fit(X_train, y_train)

# Predict on the test data
y_pred_svm = pipeline_svm.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred_svm))

### Feature Extraction - Term Frequency-Inverse Document Frequency (TF-IDF) with negation

In [None]:
# Remove stopwords
df_test['review_lemmatized_no_sw_neg'] = df_test['negations_review'].apply(lambda x: remove_stopwords(x, 'nltk'))
df_train['review_lemmatized_no_sw_neg'] = df_train['negations_review'].apply(lambda x: remove_stopwords(x, 'nltk'))

In [None]:
# Convert each entry to a string if they are not already
df_train['review_lemmatized_no_sw_neg'] = df_train['negations_review'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
df_test['review_lemmatized_no_sw_neg'] = df_test['negations_review'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

In [None]:
df_test['review_lemmatized_no_sw_neg']

In [None]:
X_train = df_train['review_lemmatized_no_sw_neg']
y_train = df_train['sentiment_numeric']
X_test = df_test['review_lemmatized_no_sw_neg']
y_test = df_test['sentiment_numeric']

# Append the features (X_train, y_train) to the features df
train_features['tf_idf_neg'] = X_train
test_features['tf_idf_neg'] = X_test

#### Logistic Regression with TF-IDF with negation

In [None]:
# Pipeline to perform Logistic Regression
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

#### Naive Bayes with TF-IDF with negation

In [None]:
# Pipeline to perform Multinomial Naive Bayes
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

#### SVM with TF-IDF with negation

In [None]:
# Pipeline to perform SVM
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', SVC())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

### Feature Extraction - Word Embeddings with negation

In [None]:
# Convert the Series of lists into a 2D NumPy array
X_train = np.vstack(df_train['negations_review'].apply(lambda x: tokens_to_avg_vectors(x, embeddings_index)))
X_test = np.vstack(df_test['negations_review'].apply(lambda x: tokens_to_avg_vectors(x, embeddings_index)))

In [None]:
X_train

#### Logistic Regression with Word Embeddings and negations

In [None]:
# Pipeline to perform Logistic Regression with Word Embeddings
pipeline = Pipeline([
    ('classifier', LogisticRegression())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

#### SVM with Word Embeddings and negations

In [None]:
# Pipeline to perform SVM with Word Embeddings
pipeline = Pipeline([
    ('classifier', SVC())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

In [None]:
df_test.head()

In [None]:
# Convert list of tuples to a flat list where each word is combined with its POS tag
def flatten_pos_tags(pos_list):
    return ' '.join([f'{word}_{pos}' for word, pos in pos_list])

# Apply the conversion on your DataFrame
X_train = df_train['tags_spacy'].apply(flatten_pos_tags)
X_test = df_test['tags_spacy'].apply(flatten_pos_tags)
y_train = df_train['sentiment_numeric']
y_test = df_test['sentiment_numeric']

# Append the features (X_train, y_train) to the features df
train_features['pos_tags'] = X_train
test_features['pos_tags'] = X_test


#### Logistic Regression with POS tagging with negation

In [None]:
# Pipeline to perform Logistic Regression with POS-Tagging as feature
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

#### Naive Bayes with POS Tagging

In [None]:
# Pipeline to perform Multinomial Naive Bayes with Pos tagging as feature
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

#### SVM with POS Tagging

In [None]:
# Pipeline to perform SVM with POS-Tagging as feature
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', SVC())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

### Tests With Multiple Features

In [None]:
train_features.head()

In [None]:
test_features.head()

In [None]:
# Define the transformations
transformations = ColumnTransformer([
    ('bow_vectorizer', CountVectorizer(), 'bow_neg'),         # Apply CountVectorizer to the 'bow' column
    ('tfidf_vectorizer', TfidfVectorizer(), 'tf_idf_neg'),    # Apply TfidfVectorizer to the 'tf_idf' column
    ('pos_vectorizer', CountVectorizer(), 'pos_tags')     # Apply CountVectorizer to the 'pos_tags' column
])

In [None]:
# Define the transformations
transformations2 = ColumnTransformer([
    ('bow_vectorizer', CountVectorizer(), 'bow_neg'),        
    ('tfidf_vectorizer', TfidfVectorizer(), 'tf_idf_neg')
])

##### SVM with POS tagging, Bag of words and TF-IDF and negations

In [None]:
# Create the pipeline
pipeline_multi = Pipeline([
    ('transformations', transformations),
    ('classifier', SVC())
])

# Fit the pipeline on the training data
pipeline_multi.fit(train_features[['bow_neg', 'tf_idf_neg', 'pos_tags']], df_train['sentiment_numeric'])

# Predict on the test data
y_pred = pipeline_multi.predict(test_features[['bow_neg', 'tf_idf_neg', 'pos_tags']])

# Evaluate the model using the true labels from the test set
print(classification_report(df_test['sentiment_numeric'], y_pred))

##### Naive Bayes with POS tagging, Bag of words and TF-IDF with negation

In [None]:
# Create the pipeline
pipeline_multi = Pipeline([
    ('transformations', transformations),
    ('classifier', MultinomialNB())
])
# Fit the pipeline on the training data
pipeline_multi.fit(train_features[['bow_neg', 'tf_idf_neg', 'pos_tags']], df_train['sentiment_numeric'])

# Predict on the test data
y_pred = pipeline_multi.predict(test_features[['bow_neg', 'tf_idf_neg', 'pos_tags']])

# Evaluate the model
print(classification_report(df_test['sentiment_numeric'], y_pred))

##### Logistic Regression with POS tagging, Bag of words and TF-IDF

In [None]:
# Create the pipeline
pipeline_multi = Pipeline([
    ('transformations', transformations),
    ('classifier', LogisticRegression())
])
# Fit the pipeline on the training data
pipeline_multi.fit(train_features[['bow_neg', 'tf_idf_neg', 'pos_tags']], df_train['sentiment_numeric'])

# Predict on the test data
y_pred = pipeline_multi.predict(test_features[['bow_neg', 'tf_idf_neg', 'pos_tags']])

# Evaluate the model using the labels from the test set
print(classification_report(df_test['sentiment_numeric'], y_pred))

## Transformers

####  Experiment with pre defined pipelines

In [None]:
# Change the DataFrame to Dataset
train_ds = Dataset.from_pandas(df_train)
test_ds = Dataset.from_pandas(df_test)

In [None]:
# Load your sentiment analysis model
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased')

# Get predictions directly using the classifier on the review texts
predictions = classifier(list(df_test['review']))

# Define a function to convert LABEL_X to the desired format
def convert_labels(predictions):
    label_mapping = {'LABEL_0': -1, 'LABEL_1': 0, 'LABEL_2': 1}
    return [label_mapping[prediction['label']] for prediction in predictions]

# Extract and convert labels from predictions
converted_labels = convert_labels(predictions)

# Print results
print("Original Labels:", [pred['label'] for pred in predictions])
print("Converted Labels:", converted_labels)

# Print the classification report
print(classification_report( converted_labels,df_test['sentiment_numeric']))


#### Fine tunning the model with rotten tomatoes data

In [None]:
# Function to tokenize the dataset and add labels
def tokenize_and_add_labels(dataset, tokenizer):
    # Tokenize the text and align with the model's expected input
    tokenized_inputs = tokenizer(
        dataset['review'],
        padding='max_length',  # Pad or truncate to a maximum length
        truncation=True,
        max_length=512
    )
    # Add a 'labels' field to use with the model
    tokenized_inputs['labels'] = dataset['sentiment_numeric']
    return tokenized_inputs

In [None]:
# Function to compute metrics of the given model 

def compute_metrics(eval_pred):
    # Extract logits and labels from the evaluation prediction
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # Compute and print classification report
    report = classification_report(labels, predictions, target_names=["LABEL_0", "LABEL_1", "LABEL_2"], output_dict=True)
    
    # Simplify the output to match the previous structure if needed
    return {
        "accuracy": report['accuracy'],
        "f1": report['macro avg']['f1-score'],
        "precision": report['macro avg']['precision'],
        "recall": report['macro avg']['recall']
    }

In [None]:
# Main training function
def main(df_train, df_test):

    # Adjust labels in DataFrame
    df_train['sentiment_numeric'] += 1
    df_test['sentiment_numeric'] += 1
    
    # Convert DataFrames to Hugging Face dataset format
    train_ds = Dataset.from_pandas(df_train)
    test_ds = Dataset.from_pandas(df_test)

    # Load the tokenizer
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    # Tokenize data
    tokenized_train = train_ds.map(lambda x: tokenize_and_add_labels(x, tokenizer), batched=True)
    tokenized_test = test_ds.map(lambda x: tokenize_and_add_labels(x, tokenizer), batched=True)

    # Model and training setup
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=2,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=300,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch"
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    # Train and evaluate the model
    trainer.train()
    eval_results = trainer.evaluate()
    print(f"Evaluation results: {eval_results}")
    

In [None]:
model = main(df_train, df_test)

In [None]:
# install if error in model = main(df_train, df_test)
#pip install accelerate -U

## Generative Model

In [2]:
# Load the dataset
data = pd.read_csv('notebooks-challenge.txt', delimiter='\t', header=None)
data.columns = ['review']

# Preprocess the data (example: lowercasing)
data['review'] = data['review'].str.lower()

In [3]:
df_train = pd.read_csv('notebooks-train.csv', delimiter='\t')

In [4]:
data.head()

Unnamed: 0,review
0,"o note é bom, mas não superou as expectativas ..."
1,ele corresponde ao valor pago. custando em méd...
2,"o notebook é muito bom, o windows 10 ferra um ..."
3,"o notebook tem uma tela enorme, muito útil par..."
4,review_text


In [15]:
#openai.api_key = ''

# Function to get sentiment that uses gpt-3.5 openai model and returns lables pos or neg
def get_sentiment(text):
    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=f"Sentiment analysis of the following text:\n{text}\n",
        temperature=0.7,  # More nuanced responses
        max_tokens=5,  # Allow more tokens to capture terms like 'neutral' or 'neu'
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=["\n"]
    )

    sentiment = response.choices[0].text.strip().lower()
    return 'pos' if 'pos' in sentiment else 'neg'

In [16]:
# If error install
#!pip install openai==0.28

In [17]:
# Apply the function to get labels
tqdm.pandas()
data['sentiment'] = data['review'].progress_apply(lambda x: get_sentiment(x))

100%|██████████| 422/422 [02:04<00:00,  3.38it/s]


In [18]:
# Take sample for comparison in classification report
data_sample = df_train.sample(n=422, random_state=1) 

In [19]:
# Evaluation of approach
print(classification_report(data_sample['sentiment'], data['sentiment']))

              precision    recall  f1-score   support

         neg       0.21      1.00      0.35        88
         pos       1.00      0.02      0.05       334

    accuracy                           0.23       422
   macro avg       0.61      0.51      0.20       422
weighted avg       0.84      0.23      0.11       422



In [None]:
# Change column names to match with notebook-train.csv
columns = ['review_text', 'sentiment']
data.columns = columns

In [None]:
data = data[['sentiment', 'review_text']]

In [None]:
data

In [None]:
# Export dataframe 
data.to_csv("notebook-challenge.csv", header=True, index=False)