In [1]:
import pandas as pd
import re
from collections import Counter, defaultdict
from nltk.util import ngrams
import numpy as np

reviews_df = pd.read_csv('/Users/mazinrafi/Downloads/AllReviews.csv')

def simple_tokenizer(text): #
    text = str(text)
    if text == 'nan':
        return [] 
    text = re.sub(r'<[^>]+>', ' ', text)  # Remove HTML tags. Unlikely needed although a double check wouldn't hurt. 
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)  # Filter to allow only alphabet letters
    text = text.lower()  # Convert to lower case
    # Tokenize by splitting the sentences into words
    tokens = text.split()
    return tokens

# Tokenize the reviews. Replace 'Review' column depending on. 
reviews_df['Review'] = reviews_df['Review'].astype(str)
tokenized_reviews = reviews_df['Review'].apply(simple_tokenizer)
flat_token_list = [token for sublist in tokenized_reviews for token in sublist]

# Generate unigrams, bigrams, and trigrams from the flattened token list
unigrams = flat_token_list
bigrams = list(ngrams(flat_token_list, 2))
trigrams = list(ngrams(flat_token_list, 3))

# Count the frequencies of each n-gram. 
unigram_counts = Counter(unigrams)
bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)

# Display the most common n-grams. We expect words like "the" and "a" to be the most common. 
print('Most common unigrams:', unigram_counts.most_common(5))
print('Most common bigrams:', bigram_counts.most_common(5))
print('Most common trigrams:', trigram_counts.most_common(5))


Most common unigrams: [('the', 1317375), ('and', 636813), ('a', 636231), ('of', 571970), ('to', 528662)]
Most common bigrams: [(('of', 'the'), 152916), (('in', 'the'), 99336), (('this', 'movie'), 60857), (('and', 'the'), 52915), (('is', 'a'), 51707)]
Most common trigrams: [(('one', 'of', 'the'), 19310), (('this', 'movie', 'is'), 10226), (('of', 'the', 'film'), 9588), (('this', 'is', 'a'), 9418), (('a', 'lot', 'of'), 9299)]


In [2]:
import random
# Function to generate a sentence using the unigram model
def generate_sentence_unigram(unigram_counts, num_words=10): #change number accordingly
    # Select num_words words based on their frequency probability distribution
    words = [word for word in unigram_counts.keys()]
    word_probabilities = [unigram_counts[word] for word in words]
    generated_words = [random.choices(words, weights=word_probabilities)[0] for _ in range(num_words)]
    return ' '.join(generated_words)



#Generate sentences using a unigram model. 
print("Unigram model generated sentence:")
print(generate_sentence_unigram(unigram_counts))


Unigram model generated sentence:
lot it of hope of ninja about are someone i


In [3]:
bigram_counts = defaultdict(Counter)

for w1, w2 in bigrams:
    bigram_counts[w1][w2] += 1

# Convert the counts to probabilities for the bigram model
for w1 in bigram_counts:
    total_count = float(sum(bigram_counts[w1].values()))
    for w2 in bigram_counts[w1]:
        bigram_counts[w1][w2] /= total_count

def generate_sentence_bigram(bigram_counts, seed_word, num_words=10):
    current_word = seed_word
    sentence = [current_word]
    for _ in range(num_words - 1):  # already have seed word
        next_words = list(bigram_counts[current_word].keys())
        if not next_words:
            break
        next_word_weights = list(bigram_counts[current_word].values())
        next_word = random.choices(next_words, weights=next_word_weights)[0]
        sentence.append(next_word)
        current_word = next_word
    return ' '.join(sentence)

# Generate a sentence using the bigram model with a seed word
seed_word = 'great'  
generated_sentence = generate_sentence_bigram(bigram_counts, seed_word)
generated_sentence

#expected to output different results. 

'great piano these disjointed story initially munho is often put'

In [4]:
trigram_counts = defaultdict(Counter)

for w1, w2, w3 in trigrams:
    trigram_counts[(w1, w2)][w3] += 1

for w1_w2 in trigram_counts:
    total_count = sum(trigram_counts[w1_w2].values())
    for w3 in trigram_counts[w1_w2]:
        trigram_counts[w1_w2][w3] /= total_count

def generate_sentence_trigram(trigram_counts, start_bigram, num_words=10):
    if start_bigram not in trigram_counts:
        return ' '.join(start_bigram)

    current_bigram = start_bigram
    sentence = [current_bigram[0], current_bigram[1]]
    for _ in range(num_words - 2):  # minus 2 because we already have the start_bigram
        next_words = list(trigram_counts[current_bigram].keys())
        weights = list(trigram_counts[current_bigram].values())
        next_word = random.choices(next_words, weights=weights)[0]
        sentence.append(next_word)
        current_bigram = (current_bigram[1], next_word)

    return ' '.join(sentence)

# Choose a random start bigram
start_bigram = random.choice(list(trigram_counts.keys())) #Alternatively start_bigram=('word1','word2')
print("Trigram model generated sentence starting with bigram '{} {}':".format(*start_bigram))
print(generate_sentence_trigram(trigram_counts, start_bigram))


Trigram model generated sentence starting with bigram 'uncomplicated morality':
uncomplicated morality tale starring henry fonda as a rival camp


In [5]:
from math import pow, log
sentence = "I don't think this movie is that good." #Need a longer example to test.
test_data = simple_tokenizer(sentence)
# Function to calculate perplexity for unigram model
def calculate_perplexity_unigram(test_data, unigram_counts, total_unigrams):
    perplexity = 1
    N = 0
    for word in test_data:
        N += 1
        probability = unigram_counts.get(word, 0) / total_unigrams
        if probability > 0:
            perplexity = perplexity * (1 / probability)
    perplexity = pow(perplexity, 1/float(N))
    return perplexity

# Total number of unigrams (needed for unigram perplexity calculation)
total_unigrams = sum(unigram_counts.values())

# Calculating perplexity for unigram model
perplexity_unigram = calculate_perplexity_unigram(test_data, unigram_counts, total_unigrams)
perplexity_unigram


171.13008963881836

In [6]:
# Bigram Perplexity Calculation
def calculate_perplexity_bigram(test_data, bigram_counts):
    perplexity = 1
    N = 0
    for i in range(len(test_data) - 1):
        N += 1
        bigram = (test_data[i], test_data[i + 1])
        bigram_probability = bigram_counts[test_data[i]].get(test_data[i + 1], 0)
        if bigram_probability > 0:
            perplexity = perplexity * (1 / bigram_probability)
        else:
            perplexity = perplexity * (1 / total_unigrams)  # Smoothing for unseen bigrams
    perplexity = pow(perplexity, 1/float(N - 1))
    return perplexity

# Calculating perplexity for bigram model
perplexity_bigram = calculate_perplexity_bigram(test_data, bigram_counts)
perplexity_bigram


42.086343146031055

In [7]:
# Function to calculate perplexity for trigram model
def calculate_perplexity_trigram(test_data, trigram_counts):
    perplexity = 1
    N = 0
    for i in range(len(test_data) - 2):
        N += 1
        trigram = (test_data[i], test_data[i + 1], test_data[i + 2])
        trigram_probability = trigram_counts[(test_data[i], test_data[i + 1])].get(test_data[i + 2], 0)
        if trigram_probability > 0:
            perplexity = perplexity * (1 / trigram_probability)
        else:
            perplexity = perplexity * (1 / total_unigrams)  # Smoothing for unseen trigrams
    perplexity = pow(perplexity, 1/float(N - 2))
    return perplexity
# Calculating perplexity for trigram model
perplexity_trigram = calculate_perplexity_trigram(test_data, trigram_counts)
perplexity_trigram


66.76777564595017

In [8]:
print(perplexity_unigram)
print(perplexity_bigram)
print(perplexity_trigram)

171.13008963881836
42.086343146031055
66.76777564595017


In [9]:
###Unigram Baseline 1: Logistic Regression

import pandas as pd
import random

def generate_synthetic_review_unigram(unigram_counts, num_words=50):
    words = list(unigram_counts.keys())
    word_probabilities = [unigram_counts[word] for word in words]
    review = [random.choices(words, weights=word_probabilities)[0] for _ in range(num_words)]
    return ' '.join(review)

# Generate synthetic reviews
num_synthetic_reviews = 100  # adjust as needed
synthetic_reviews = [generate_synthetic_review_unigram(unigram_counts) for _ in range(num_synthetic_reviews)]
actual_reviews = reviews_df['Review'].sample(num_synthetic_reviews).tolist()

# Combine and label data
combined_reviews = synthetic_reviews + actual_reviews
labels = [0] * len(synthetic_reviews) + [1] * len(actual_reviews)  # 0 for synthetic, 1 for actual

# Create a DataFrame
data_df = pd.DataFrame({'Review': combined_reviews, 'Label': labels})

from sklearn.feature_extraction.text import CountVectorizer

# Using Bag-of-Words model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data_df['Review'])
y = data_df['Label']

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
lr_model1 = LogisticRegression()
lr_model1.fit(X_train, y_train)

# Evaluate the model
predictions = lr_model1.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.95      1.00      0.98        21
           1       1.00      0.95      0.97        19

    accuracy                           0.97        40
   macro avg       0.98      0.97      0.97        40
weighted avg       0.98      0.97      0.97        40



In [10]:
###Unigram Baseline 2: Random Forest
#Unigram
import pandas as pd
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Train Random Forest model
rf_model1 = RandomForestClassifier()
rf_model1.fit(X_train, y_train)

# Evaluate the model
predictions = rf_model1.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.91      1.00      0.95        21
           1       1.00      0.89      0.94        19

    accuracy                           0.95        40
   macro avg       0.96      0.95      0.95        40
weighted avg       0.95      0.95      0.95        40



In [11]:
####Unigram Baseline 3: Feed Forward
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Define parameters
vocab_size = 10000
embedding_dim = 16
max_length = 100
padding_type = 'post'
trunc_type = 'post'
oov_tok = "<OOV>"

# Tokenize and pad the reviews
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(data_df['Review'])
sequences = tokenizer.texts_to_sequences(data_df['Review'])
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Prepare the labels
labels = data_df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded, labels, test_size=0.2, random_state=42)

# Build the model
fnn_model1 = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

fnn_model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
epochs = 10  # Adjust as needed
batch_size = 32  # Adjust as needed

fnn_model1.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x4574da4d0>

In [12]:
####Unigram Baseline 4: Recurrent Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Define parameters
vocab_size = 10000
embedding_dim = 16
max_length = 100
padding_type = 'post'
trunc_type = 'post'
oov_tok = "<OOV>"
lstm_units = 32  # Number of LSTM units

# Tokenize and pad the reviews
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(data_df['Review'])
sequences = tokenizer.texts_to_sequences(data_df['Review'])
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Prepare the labels
labels = data_df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded, labels, test_size=0.2, random_state=42)

# Build the RNN model
rnn_model1 = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(lstm_units),
    Dense(24, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

rnn_model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
epochs = 10  # Adjust as needed
batch_size = 32  # Adjust as needed

rnn_model1.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x4577cd930>

In [13]:
####Unigram Baseline 5: DistilBERT

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

# Tokenizer for DistilBERT
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenizing the dataset
class ReviewsDataset(Dataset):
    def __init__(self, reviews, labels):
        self.encodings = tokenizer(reviews, truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare the dataset
reviews = data_df['Review'].tolist()
labels = data_df['Label'].tolist()
dataset = ReviewsDataset(reviews, labels)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# DistilBERT model
db_model1 = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer
trainer1 = Trainer(
    model=db_model1,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer1.train()

from sklearn.metrics import classification_report, accuracy_score

# Make predictions on the test set
predictions = trainer1.predict(val_dataset)

# Predictions are in the logits format, so convert them to class predictions
preds = np.argmax(predictions.predictions, axis=-1)

# True labels
true_labels = predictions.label_ids

# Calculate accuracy
accuracy = accuracy_score(true_labels, preds)

# Detailed classification report
class_report = classification_report(true_labels, preds)

print("Accuracy:", accuracy)
print("Classification Report:\n", class_report)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        17

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



In [14]:
####Bigram Baseline 1: Logistic Regression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


def generate_synthetic_review_bigram(bigram_counts, num_words=50):
    if not bigram_counts:
        return ""
    
    # Start with a random word
    current_word = random.choice(list(bigram_counts.keys()))
    review = [current_word]

    for _ in range(num_words - 1):
        next_words = list(bigram_counts[current_word].keys())
        next_word_weights = list(bigram_counts[current_word].values())
        next_word = random.choices(next_words, weights=next_word_weights)[0]
        review.append(next_word)
        current_word = next_word

    return ' '.join(review)

# Generate synthetic reviews using bigram model
num_synthetic_reviews = 100  # adjust as needed
synthetic_reviews_bigram = [generate_synthetic_review_bigram(bigram_counts) for _ in range(num_synthetic_reviews)]
actual_reviews = reviews_df['Review'].sample(num_synthetic_reviews).tolist()

# Combine and label data
combined_reviews = synthetic_reviews_bigram + actual_reviews
labels = [0] * len(synthetic_reviews_bigram) + [1] * len(actual_reviews)  # 0 for synthetic, 1 for actual

# Create a DataFrame
data_df = pd.DataFrame({'Review': combined_reviews, 'Label': labels})

# Using Bag-of-Words model with bigrams
vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform(combined_reviews)
y = data_df['Label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
lr_model2 = LogisticRegression()
lr_model2.fit(X_train, y_train)

# Evaluate the model
predictions = lr_model2.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.75      1.00      0.86        21
           1       1.00      0.63      0.77        19

    accuracy                           0.82        40
   macro avg       0.88      0.82      0.82        40
weighted avg       0.87      0.82      0.82        40



In [15]:
####Bigram Baseline 2: Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Train the Random Forest model
rf_model2 = RandomForestClassifier()
rf_model2.fit(X_train, y_train)

# Evaluate the model
predictions = rf_model2.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.66      1.00      0.79        21
           1       1.00      0.42      0.59        19

    accuracy                           0.73        40
   macro avg       0.83      0.71      0.69        40
weighted avg       0.82      0.72      0.70        40



In [16]:
####Bigram Baseline 3: Feed Forward

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Function to create bigrams
def create_bigrams(texts):
    bigram_texts = []
    for text in texts:
        tokens = text.split()
        bigrams = [' '.join(tokens[i:i+2]) for i in range(len(tokens) - 1)]
        bigram_texts.append(' '.join(bigrams))
    return bigram_texts

# Apply bigram creation on the dataset
bigram_reviews = create_bigrams(data_df['Review'].tolist())

# Define parameters
vocab_size = 10000  # Adjust as needed
embedding_dim = 16
max_length = 200  # Adjusted for bigrams
padding_type = 'post'
trunc_type = 'post'
oov_tok = "<OOV>"

# Tokenize and pad the reviews with bigrams
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(bigram_reviews)
sequences = tokenizer.texts_to_sequences(bigram_reviews)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Prepare the labels
labels = data_df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded, labels, test_size=0.2, random_state=42)

# Build the model
fnn_model2 = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

fnn_model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
epochs = 10  # Adjust as needed
batch_size = 32  # Adjust as needed

fnn_model2.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x457493b20>

In [17]:
####Bigram Baseline 4: Recurrent Neural Network

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Function to create bigrams
def create_bigrams(texts):
    bigram_texts = []
    for text in texts:
        tokens = text.split()
        bigrams = [' '.join(tokens[i:i+2]) for i in range(len(tokens) - 1)]
        bigram_texts.append(' '.join(bigrams))
    return bigram_texts

# Apply bigram creation on the dataset
bigram_reviews = create_bigrams(data_df['Review'].tolist())

# Define parameters
vocab_size = 10000  # Adjust as needed
embedding_dim = 16
max_length = 200  # Adjusted for bigrams
padding_type = 'post'
trunc_type = 'post'
oov_tok = "<OOV>"
lstm_units = 32  # Number of LSTM units

# Tokenize and pad the reviews with bigrams
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(bigram_reviews)
sequences = tokenizer.texts_to_sequences(bigram_reviews)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Prepare the labels
labels = data_df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded, labels, test_size=0.2, random_state=42)

# Build the RNN model
rnn_model2 = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(lstm_units),
    Dense(24, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

rnn_model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
epochs = 10  # Adjust as needed
batch_size = 32  # Adjust as needed

rnn_model2.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x444ac7400>

In [18]:
####Bigram Baseline 5: DistilBERT

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

# Function to create bigrams
def create_bigrams(texts):
    bigram_texts = []
    for text in texts:
        tokens = text.split()
        bigrams = [' '.join(tokens[i:i+2]) for i in range(len(tokens) - 1)]
        bigram_texts.append(' '.join(bigrams))
    return bigram_texts

# Apply bigram creation on the dataset
bigram_reviews = create_bigrams(data_df['Review'].tolist())

# Tokenizer for DistilBERT
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenizing the dataset
class ReviewsDataset(Dataset):
    def __init__(self, reviews, labels):
        self.encodings = tokenizer(reviews, truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare the dataset with bigrams
labels = data_df['Label'].tolist()
dataset = ReviewsDataset(bigram_reviews, labels)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# DistilBERT model
db_model2 = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer
trainer2 = Trainer(
    model=db_model2,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer2.train()

from sklearn.metrics import classification_report, accuracy_score

# Make predictions on the test set
predictions = trainer2.predict(val_dataset)

# Predictions are in the logits format, so convert them to class predictions
preds = np.argmax(predictions.predictions, axis=-1)

# True labels
true_labels = predictions.label_ids

# Calculate accuracy
accuracy = accuracy_score(true_labels, preds)

# Detailed classification report
class_report = classification_report(true_labels, preds)

print("Accuracy:", accuracy)
print("Classification Report:\n", class_report)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      1.00      1.00        20

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



In [19]:
####Trigram Baseline 1: Logistic Regression

import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Function to generate synthetic review using trigrams
def generate_synthetic_review_trigram(trigram_counts, num_words=50):
    if not trigram_counts:
        return ""
    
    # Start with a random bigram
    start_bigram = random.choice(list(trigram_counts.keys()))
    review = list(start_bigram)

    for _ in range(num_words - 2):
        next_words = list(trigram_counts[start_bigram].keys())
        if not next_words:
            break
        next_word_weights = list(trigram_counts[start_bigram].values())
        next_word = random.choices(next_words, weights=next_word_weights)[0]
        review.append(next_word)
        start_bigram = (start_bigram[1], next_word)

    return ' '.join(review)

# Generate synthetic reviews using trigram model
num_synthetic_reviews = 100  # adjust as needed
synthetic_reviews_trigram = [generate_synthetic_review_trigram(trigram_counts) for _ in range(num_synthetic_reviews)]
actual_reviews = reviews_df['Review'].sample(num_synthetic_reviews).tolist()

# Combine and label data
combined_reviews = synthetic_reviews_trigram + actual_reviews
labels = [0] * len(synthetic_reviews_trigram) + [1] * len(actual_reviews)  # 0 for synthetic, 1 for actual

# Create a DataFrame
data_df = pd.DataFrame({'Review': combined_reviews, 'Label': labels})


# Combine and label data
combined_reviews = synthetic_reviews + actual_reviews
labels = [0] * len(synthetic_reviews) + [1] * len(actual_reviews)  # 0 for synthetic, 1 for actual

# Create a DataFrame
data_df = pd.DataFrame({'Review': combined_reviews, 'Label': labels})

# Combine synthetic and actual reviews
combined_reviews = synthetic_reviews_trigram + actual_reviews  # Assuming 'actual_reviews' is already defined

# Using Bag-of-Words model with trigrams
vectorizer = CountVectorizer(ngram_range=(3, 3))
X = vectorizer.fit_transform(combined_reviews)
y = data_df['Label']  # Assuming 'data_df' is the DataFrame with labels

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Logistic Regression model
lr_model3 = LogisticRegression()
lr_model3.fit(X_train, y_train)

# Evaluate the model
predictions = lr_model3.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.54      1.00      0.70        21
           1       1.00      0.05      0.10        19

    accuracy                           0.55        40
   macro avg       0.77      0.53      0.40        40
weighted avg       0.76      0.55      0.42        40



In [20]:
####Trigram Baseline 2: Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Function to generate synthetic review using trigrams
def generate_synthetic_review_trigram(trigram_counts, num_words=50):
    if not trigram_counts:
        return ""
    
    start_bigram = random.choice(list(trigram_counts.keys()))
    review = list(start_bigram)

    for _ in range(num_words - 2):
        next_words = list(trigram_counts[start_bigram].keys())
        if not next_words:
            break
        next_word_weights = list(trigram_counts[start_bigram].values())
        next_word = random.choices(next_words, weights=next_word_weights)[0]
        review.append(next_word)
        start_bigram = (start_bigram[1], next_word)

    return ' '.join(review)

# Generate synthetic reviews using trigram model
num_synthetic_reviews = 100
synthetic_reviews_trigram = [generate_synthetic_review_trigram(trigram_counts) for _ in range(num_synthetic_reviews)]

# Combine synthetic and actual reviews
combined_reviews = synthetic_reviews_trigram + actual_reviews  # Assuming 'actual_reviews' is already defined

# Using Bag-of-Words model with trigrams
vectorizer = CountVectorizer(ngram_range=(3, 3))
X = vectorizer.fit_transform(combined_reviews)
y = data_df['Label']  # Assuming 'data_df' is the DataFrame with labels

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model
rf_model3 = RandomForestClassifier()
rf_model3.fit(X_train, y_train)

# Evaluate the model
predictions = rf_model3.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.54      1.00      0.70        21
           1       1.00      0.05      0.10        19

    accuracy                           0.55        40
   macro avg       0.77      0.53      0.40        40
weighted avg       0.76      0.55      0.42        40



In [21]:
####Trigram Baseline 3: Feed Forward
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Function to create trigrams
def create_trigrams(texts):
    trigram_texts = []
    for text in texts:
        tokens = text.split()
        trigrams = [' '.join(tokens[i:i+3]) for i in range(len(tokens) - 2)]
        trigram_texts.append(' '.join(trigrams))
    return trigram_texts

# Apply trigram creation on the dataset
trigram_reviews = create_trigrams(data_df['Review'].tolist())

# Define parameters
vocab_size = 10000  # Adjust as needed
embedding_dim = 16
max_length = 300  # Adjusted for trigrams
padding_type = 'post'
trunc_type = 'post'
oov_tok = "<OOV>"

# Tokenize and pad the reviews with trigrams
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(trigram_reviews)
sequences = tokenizer.texts_to_sequences(trigram_reviews)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Prepare the labels
labels = data_df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded, labels, test_size=0.2, random_state=42)

# Build the model
fnn_model3 = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

fnn_model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
epochs = 10  # Adjust as needed
batch_size = 32  # Adjust as needed

fnn_model3.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x444a39bd0>

In [22]:
####Trigram Baseline 4: Recurrent Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Function to create trigrams
def create_trigrams(texts):
    trigram_texts = []
    for text in texts:
        tokens = text.split()
        trigrams = [' '.join(tokens[i:i+3]) for i in range(len(tokens) - 2)]
        trigram_texts.append(' '.join(trigrams))
    return trigram_texts

# Apply trigram creation on the dataset
trigram_reviews = create_trigrams(data_df['Review'].tolist())

# Define parameters
vocab_size = 10000  # Adjust as needed
embedding_dim = 16
max_length = 300  # Adjusted for trigrams
padding_type = 'post'
trunc_type = 'post'
oov_tok = "<OOV>"
lstm_units = 32  # Number of LSTM units

# Tokenize and pad the reviews with trigrams
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(trigram_reviews)
sequences = tokenizer.texts_to_sequences(trigram_reviews)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Prepare the labels
labels = data_df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded, labels, test_size=0.2, random_state=42)

# Build the RNN model
rnn_model3 = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(lstm_units),
    Dense(24, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

rnn_model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
epochs = 10  # Adjust as needed
batch_size = 32  # Adjust as needed

rnn_model3.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x4461109d0>

In [23]:
####Trigram Baseline 5: DistilBERT

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

# Function to create trigrams
def create_trigrams(texts):
    trigram_texts = []
    for text in texts:
        tokens = text.split()
        trigrams = [' '.join(tokens[i:i+3]) for i in range(len(tokens) - 2)]
        trigram_texts.append(' '.join(trigrams))
    return trigram_texts

# Apply trigram creation on the dataset
trigram_reviews = create_trigrams(data_df['Review'].tolist())

# Tokenizer for DistilBERT
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenizing the dataset
class ReviewsDataset(Dataset):
    def __init__(self, reviews, labels):
        self.encodings = tokenizer(reviews, truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare the dataset with trigrams
labels = data_df['Label'].tolist()
dataset = ReviewsDataset(trigram_reviews, labels)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# DistilBERT model
db_model3 = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer
trainer3 = Trainer(
    model=db_model3,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer3.train()

from sklearn.metrics import classification_report, accuracy_score

# Make predictions on the test set
predictions = trainer3.predict(val_dataset)

# Predictions are in the logits format, so convert them to class predictions
preds = np.argmax(predictions.predictions, axis=-1)

# True labels
true_labels = predictions.label_ids

# Calculate accuracy
accuracy = accuracy_score(true_labels, preds)

# Detailed classification report
class_report = classification_report(true_labels, preds)

print("Accuracy:", accuracy)
print("Classification Report:\n", class_report)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      1.00      1.00        20

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

