In [None]:
import pandas as pd
import re
from collections import Counter, defaultdict
from nltk.util import ngrams

reviews_df = pd.read_csv('/Users/mazinrafi/Downloads/AllReviews.csv')

def simple_tokenizer(text): #
    text = str(text)
    if text == 'nan':
        return [] 
    text = re.sub(r'<[^>]+>', ' ', text)  # Remove HTML tags. Unlikely needed although a double check wouldn't hurt. 
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)  # Filter to allow only alphabet letters
    text = text.lower()  # Convert to lower case
    # Tokenize by splitting the sentences into words
    tokens = text.split()
    return tokens

# Tokenize the reviews. Replace 'Review' column depending on. 
reviews_df['Review'] = reviews_df['Review'].astype(str)
tokenized_reviews = reviews_df['Review'].apply(simple_tokenizer)
flat_token_list = [token for sublist in tokenized_reviews for token in sublist]

# Generate unigrams, bigrams, and trigrams from the flattened token list
unigrams = flat_token_list
bigrams = list(ngrams(flat_token_list, 2))
trigrams = list(ngrams(flat_token_list, 3))

# Count the frequencies of each n-gram. 
unigram_counts = Counter(unigrams)
bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)

# Display the most common n-grams. We expect words like "the" and "a" to be the most common. 
print('Most common unigrams:', unigram_counts.most_common(5))
print('Most common bigrams:', bigram_counts.most_common(5))
print('Most common trigrams:', trigram_counts.most_common(5))


In [2]:
import random
# Function to generate a sentence using the unigram model
def generate_sentence_unigram(unigram_counts, num_words=10): #change number accordingly
    # Select num_words words based on their frequency probability distribution
    words = [word for word in unigram_counts.keys()]
    word_probabilities = [unigram_counts[word] for word in words]
    generated_words = [random.choices(words, weights=word_probabilities)[0] for _ in range(num_words)]
    return ' '.join(generated_words)



#Generate sentences using a unigram model. 
print("Unigram model generated sentence:")
print(generate_sentence_unigram(unigram_counts))


Unigram model generated sentence:
me after a by dont go the was of taking


In [3]:
bigram_counts = defaultdict(Counter)

for w1, w2 in bigrams:
    bigram_counts[w1][w2] += 1

# Convert the counts to probabilities for the bigram model
for w1 in bigram_counts:
    total_count = float(sum(bigram_counts[w1].values()))
    for w2 in bigram_counts[w1]:
        bigram_counts[w1][w2] /= total_count

def generate_sentence_bigram(bigram_counts, seed_word, num_words=10):
    current_word = seed_word
    sentence = [current_word]
    for _ in range(num_words - 1):  # already have seed word
        next_words = list(bigram_counts[current_word].keys())
        if not next_words:
            break
        next_word_weights = list(bigram_counts[current_word].values())
        next_word = random.choices(next_words, weights=next_word_weights)[0]
        sentence.append(next_word)
        current_word = next_word
    return ' '.join(sentence)

# Generate a sentence using the bigram model with a seed word
seed_word = 'great'  
generated_sentence = generate_sentence_bigram(bigram_counts, seed_word)
generated_sentence

#expected to output different results. 

'great outdoors beautifully slinky figure out of course not empty'

In [4]:
trigram_counts = defaultdict(Counter)

for w1, w2, w3 in trigrams:
    trigram_counts[(w1, w2)][w3] += 1

for w1_w2 in trigram_counts:
    total_count = sum(trigram_counts[w1_w2].values())
    for w3 in trigram_counts[w1_w2]:
        trigram_counts[w1_w2][w3] /= total_count

def generate_sentence_trigram(trigram_counts, start_bigram, num_words=10):
    if start_bigram not in trigram_counts:
        return ' '.join(start_bigram)

    current_bigram = start_bigram
    sentence = [current_bigram[0], current_bigram[1]]
    for _ in range(num_words - 2):  # minus 2 because we already have the start_bigram
        next_words = list(trigram_counts[current_bigram].keys())
        weights = list(trigram_counts[current_bigram].values())
        next_word = random.choices(next_words, weights=weights)[0]
        sentence.append(next_word)
        current_bigram = (current_bigram[1], next_word)

    return ' '.join(sentence)

# Choose a random start bigram
start_bigram = random.choice(list(trigram_counts.keys())) #Alternatively start_bigram=('word1','word2')
print("Trigram model generated sentence starting with bigram '{} {}':".format(*start_bigram))
print(generate_sentence_trigram(trigram_counts, start_bigram))


Trigram model generated sentence starting with bigram 'bands early':
bands early stuff and is there a single revision obviously


In [5]:
from math import pow, log
sentence = "I don't think this movie is that good." #Need a longer example to test.
test_data = simple_tokenizer(sentence)
# Function to calculate perplexity for unigram model
def calculate_perplexity_unigram(test_data, unigram_counts, total_unigrams):
    perplexity = 1
    N = 0
    for word in test_data:
        N += 1
        probability = unigram_counts.get(word, 0) / total_unigrams
        if probability > 0:
            perplexity = perplexity * (1 / probability)
    perplexity = pow(perplexity, 1/float(N))
    return perplexity

# Total number of unigrams (needed for unigram perplexity calculation)
total_unigrams = sum(unigram_counts.values())

# Calculating perplexity for unigram model
perplexity_unigram = calculate_perplexity_unigram(test_data, unigram_counts, total_unigrams)
perplexity_unigram


171.13008963881836

In [6]:
# Bigram Perplexity Calculation
def calculate_perplexity_bigram(test_data, bigram_counts):
    perplexity = 1
    N = 0
    for i in range(len(test_data) - 1):
        N += 1
        bigram = (test_data[i], test_data[i + 1])
        bigram_probability = bigram_counts[test_data[i]].get(test_data[i + 1], 0)
        if bigram_probability > 0:
            perplexity = perplexity * (1 / bigram_probability)
        else:
            perplexity = perplexity * (1 / total_unigrams)  # Smoothing for unseen bigrams
    perplexity = pow(perplexity, 1/float(N - 1))
    return perplexity

# Calculating perplexity for bigram model
perplexity_bigram = calculate_perplexity_bigram(test_data, bigram_counts)
perplexity_bigram


42.086343146031055

In [7]:
# Function to calculate perplexity for trigram model
def calculate_perplexity_trigram(test_data, trigram_counts):
    perplexity = 1
    N = 0
    for i in range(len(test_data) - 2):
        N += 1
        trigram = (test_data[i], test_data[i + 1], test_data[i + 2])
        trigram_probability = trigram_counts[(test_data[i], test_data[i + 1])].get(test_data[i + 2], 0)
        if trigram_probability > 0:
            perplexity = perplexity * (1 / trigram_probability)
        else:
            perplexity = perplexity * (1 / total_unigrams)  # Smoothing for unseen trigrams
    perplexity = pow(perplexity, 1/float(N - 2))
    return perplexity
# Calculating perplexity for trigram model
perplexity_trigram = calculate_perplexity_trigram(test_data, trigram_counts)
perplexity_trigram


66.76777564595017

In [8]:
print(perplexity_unigram)
print(perplexity_bigram)
print(perplexity_trigram)

171.13008963881836
42.086343146031055
66.76777564595017


In [9]:
import random

def generate_synthetic_review_unigram(unigram_counts, num_words=50):
    words = list(unigram_counts.keys())
    word_probabilities = [unigram_counts[word] for word in words]
    review = [random.choices(words, weights=word_probabilities)[0] for _ in range(num_words)]
    return ' '.join(review)

# Generate synthetic reviews
num_synthetic_reviews = 100  # adjust as needed
synthetic_reviews = [generate_synthetic_review_unigram(unigram_counts) for _ in range(num_synthetic_reviews)]


In [10]:
# Assuming reviews_df is your DataFrame containing actual reviews
actual_reviews = reviews_df['Review'].sample(num_synthetic_reviews).tolist()


In [11]:
import pandas as pd

# Combine and label data
combined_reviews = synthetic_reviews + actual_reviews
labels = [0] * len(synthetic_reviews) + [1] * len(actual_reviews)  # 0 for synthetic, 1 for actual

# Create a DataFrame
data_df = pd.DataFrame({'Review': combined_reviews, 'Label': labels})


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Using Bag-of-Words model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data_df['Review'])
y = data_df['Label']


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
lr_model1 = LogisticRegression()
lr_model1.fit(X_train, y_train)

# Evaluate the model
predictions = lr_model1.predict(X_test)
print(classification_report(y_test, predictions))



In [14]:
# Evaluate the model
predictions = lr_model1.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.81      1.00      0.89        21
           1       1.00      0.74      0.85        19

    accuracy                           0.88        40
   macro avg       0.90      0.87      0.87        40
weighted avg       0.90      0.88      0.87        40



In [15]:
import numpy as np

# Generate random indices for selecting sample reviews
random_indices = np.random.choice(X_test.shape[0], 5, replace=False)

# Extract sample reviews and their actual labels
sample_reviews = X_test[random_indices]
actual_labels_sample = y_test.iloc[random_indices]

# Predict the labels for these reviews
predicted_labels = lr_model1.predict(sample_reviews)

# Display the results
for i, index in enumerate(random_indices):
    review_features = sample_reviews[i]
    actual_label = actual_labels_sample.iloc[i]
    predicted_label = predicted_labels[i]

    # Transforming the review features back to text (may not perfectly reconstruct the original text)
    review_text = ' '.join(vectorizer.inverse_transform(review_features)[0])
    
    print("Review:", review_text)
    print("Actual Label:", "Actual" if actual_label == 1 else "Synthetic")
    print("Predicted Label:", "Actual" if predicted_label == 1 else "Synthetic")
    print("-" * 50)


Review: and the out he one that even you in just was as of read such role seen michael check dr audience find plain lots acting each entitled brief sleaze guests rock ruth gift wants inner benefited ferrell joshs daring nunez glorified ripples remember user
Actual Label: Synthetic
Predicted Label: Synthetic
--------------------------------------------------
Review: and this the it to is old could for on was be am much couldnt story part an chupacabra killer get editor another anywhere showing stella evaluate whats want disney wild right directed tension harry becomes
Actual Label: Synthetic
Predicted Label: Synthetic
--------------------------------------------------
Review: this the all its to one think really that with in but by was of worst his about what did using most an ending including line depth main acting wacko horrible unusual brian casablanca screenwriters became latifah white remotely recycled forgetting
Actual Label: Synthetic
Predicted Label: Synthetic
------------------

In [16]:
def generate_synthetic_review_bigram(bigram_counts, num_words=50):
    if not bigram_counts:
        return ""
    
    # Start with a random word
    current_word = random.choice(list(bigram_counts.keys()))
    review = [current_word]

    for _ in range(num_words - 1):
        next_words = list(bigram_counts[current_word].keys())
        next_word_weights = list(bigram_counts[current_word].values())
        next_word = random.choices(next_words, weights=next_word_weights)[0]
        review.append(next_word)
        current_word = next_word

    return ' '.join(review)

# Generate synthetic reviews using bigram model
num_synthetic_reviews = 100  # adjust as needed
synthetic_reviews_bigram = [generate_synthetic_review_bigram(bigram_counts) for _ in range(num_synthetic_reviews)]


In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Using Bag-of-Words model with bigrams
vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform(combined_reviews)
y = data_df['Label']

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
lr_model2 = LogisticRegression()
lr_model2.fit(X_train, y_train)

# Evaluate the model
predictions = lr_model2.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.75      1.00      0.86        21
           1       1.00      0.63      0.77        19

    accuracy                           0.82        40
   macro avg       0.88      0.82      0.82        40
weighted avg       0.87      0.82      0.82        40



In [18]:
# Display a few classified reviews (adapted for bigram features)
random_indices = np.random.choice(X_test.shape[0], 5, replace=False)
sample_reviews = X_test[random_indices]
actual_labels_sample = y_test.iloc[random_indices]
predicted_labels = lr_model2.predict(sample_reviews)

for i, index in enumerate(random_indices):
    review_features = sample_reviews[i]
    actual_label = actual_labels_sample.iloc[i]
    predicted_label = predicted_labels[i]
    review_text = ' '.join(vectorizer.inverse_transform(review_features)[0])
    print("Review:", review_text)
    print("Actual Label:", "Actual" if actual_label == 1 else "Synthetic")
    print("Predicted Label:", "Actual" if predicted_label == 1 else "Synthetic")
    print("-" * 50)


Review: is the and the it and the best the story for the of the that it the acting from the has the the special br br is just this film if you they will don put film has and don the rest rest of read the special effects the actors which is the original really bad through the not be the novel but that will not are decent save your and john bad if you read the books books do do yourself yourself favor favor and put yourself yourself through the agony agony of of sitting sitting through through this this travesty travesty the story line line which which skips skips about about 70 70 of original story line wanders wanders miles miles from novel except except for for rachel rachel weisz weisz great great voice voice over over work work for the dragon dragon which best acting acting this acting from actors is just above above high high school school play play level level what what were were jeremy jeremy and john thinking thinking in in taking taking these these roles roles the effects are d

In [19]:
def generate_synthetic_review_trigram(trigram_counts, num_words=50):
    if not trigram_counts:
        return ""
    
    # Start with a random bigram
    start_bigram = random.choice(list(trigram_counts.keys()))
    review = list(start_bigram)

    for _ in range(num_words - 2):
        next_words = list(trigram_counts[start_bigram].keys())
        if not next_words:
            break
        next_word_weights = list(trigram_counts[start_bigram].values())
        next_word = random.choices(next_words, weights=next_word_weights)[0]
        review.append(next_word)
        start_bigram = (start_bigram[1], next_word)

    return ' '.join(review)

# Generate synthetic reviews using trigram model
num_synthetic_reviews = 100  # adjust as needed
synthetic_reviews_trigram = [generate_synthetic_review_trigram(trigram_counts) for _ in range(num_synthetic_reviews)]

from sklearn.feature_extraction.text import CountVectorizer

# Using Bag-of-Words model with trigrams (can change later)
vectorizer = CountVectorizer(ngram_range=(3, 3))
X = vectorizer.fit_transform(combined_reviews)
y = data_df['Label']

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
lr_model3 = LogisticRegression()
lr_model3.fit(X_train, y_train)

# Evaluate the model
predictions = lr_model3.predict(X_test)
print(classification_report(y_test, predictions))



              precision    recall  f1-score   support

           0       0.54      1.00      0.70        21
           1       1.00      0.05      0.10        19

    accuracy                           0.55        40
   macro avg       0.77      0.53      0.40        40
weighted avg       0.76      0.55      0.42        40



In [20]:
# Display a few classified reviews (adapted for bigram features)
random_indices = np.random.choice(X_test.shape[0], 5, replace=False)
sample_reviews = X_test[random_indices]
actual_labels_sample = y_test.iloc[random_indices]
predicted_labels = lr_model3.predict(sample_reviews)

for i, index in enumerate(random_indices):
    review_features = sample_reviews[i]
    actual_label = actual_labels_sample.iloc[i]
    predicted_label = predicted_labels[i]
    review_text = ' '.join(vectorizer.inverse_transform(review_features)[0])
    print("Review:", review_text)
    print("Actual Label:", "Actual" if actual_label == 1 else "Synthetic")
    print("Predicted Label:", "Actual" if predicted_label == 1 else "Synthetic")
    print("-" * 50)


Review: enough was she was she an she an to an to also to also feel also feel his feel his may his may key may key resnais key resnais in resnais in who in who its who its with its with progress with progress in progress in by in by but by but movie but movie of movie of tears of tears sister tears sister very sister very movie very movie in movie in adults in adults had adults had have had have who have who distorted who distorted exchange distorted exchange skills exchange skills film skills film chance film chance if chance if to if to it to it apparently it apparently well apparently well the well the and the and great and great look great look fine look fine an fine an tv an tv nice tv nice party
Actual Label: Synthetic
Predicted Label: Synthetic
--------------------------------------------------
Review: this chupacabra the chupacabra the killer the killer to killer to the to the get the get the get the am the am editor am editor old editor old was old was another was another anyw

In [21]:
####Baseline 2: Random Forest
#Unigram
import pandas as pd
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


num_synthetic_reviews = 100
synthetic_reviews_unigram = [generate_synthetic_review_unigram(unigram_counts) for _ in range(num_synthetic_reviews)]

# Prepare actual reviews
actual_reviews = reviews_df['Review'].sample(num_synthetic_reviews).tolist()

# Combine and label data
combined_reviews = synthetic_reviews_unigram + actual_reviews
labels = [0] * len(synthetic_reviews_unigram) + [1] * len(actual_reviews)

# Create DataFrame for combined data
data_df = pd.DataFrame({'Review': combined_reviews, 'Label': labels})

# Feature extraction using Bag-of-Words
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data_df['Review'])
y = data_df['Label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest model
rf_model1 = RandomForestClassifier()
rf_model1.fit(X_train, y_train)

# Evaluate the model
predictions = rf_model1.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.88      1.00      0.93        21
           1       1.00      0.84      0.91        19

    accuracy                           0.93        40
   macro avg       0.94      0.92      0.92        40
weighted avg       0.93      0.93      0.92        40



In [22]:
import numpy as np

# Generate random indices for selecting sample reviews
random_indices = np.random.choice(X_test.shape[0], 5, replace=False)

# Extract sample reviews and their actual labels
sample_reviews = X_test[random_indices]
actual_labels_sample = y_test.iloc[random_indices]

# Predict the labels for these reviews
predicted_labels = rf_model1.predict(sample_reviews)

# Display the results
for i, index in enumerate(random_indices):
    review_features = sample_reviews[i]
    actual_label = actual_labels_sample.iloc[i]
    predicted_label = predicted_labels[i]

    # Transforming the review features back to text (may not perfectly reconstruct the original text)
    review_text = ' '.join(vectorizer.inverse_transform(review_features)[0])
    
    print("Review:", review_text)
    print("Actual Label:", "Actual" if actual_label == 1 else "Synthetic")
    print("Predicted Label:", "Actual" if predicted_label == 1 else "Synthetic")
    print("-" * 50)


Review: add and any are as been chance characters complete concerning explanation finds get growls is less locations looks million missed nerves of out over part real roles seemed she so spice stars thames that the there this to two useless was when word worse
Actual Label: Synthetic
Predicted Label: Synthetic
--------------------------------------------------
Review: 10 30 about actors actresses always and as at back be berkeley best better blondell bought branches bread busby by cagney cast cents choreographer conditions day did during economic end enthusiasm fabulous feel for forget frank gloomy great guy hollywood how in it its itself james joan keeler kibbee knowing little ll loaf luxury make many marching mchugh military millions minor movie movies must note of patriotism people powell precision remember respected ruby scraped see similar so studio system testimony that the then this time to together too type usa use viewed was were william with you
Actual Label: Actual
Predicted

In [23]:
#Bigram
import pandas as pd
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Function to generate synthetic review using bigram model
def generate_synthetic_review_bigram(bigram_counts, num_words=50):
    if not bigram_counts:
        return ""
    
    current_word = random.choice(list(bigram_counts.keys()))
    review = [current_word]

    for _ in range(num_words - 1):
        next_words = list(bigram_counts[current_word].keys())
        next_word_weights = list(bigram_counts[current_word].values())
        next_word = random.choices(next_words, weights=next_word_weights)[0]
        review.append(next_word)
        current_word = next_word

    return ' '.join(review)

# Generate synthetic reviews using bigram model
num_synthetic_reviews = 100
synthetic_reviews_bigram = [generate_synthetic_review_bigram(bigram_counts) for _ in range(num_synthetic_reviews)]

# Prepare actual reviews
actual_reviews = reviews_df['Review'].sample(num_synthetic_reviews).tolist()

# Combine and label data
combined_reviews = synthetic_reviews_bigram + actual_reviews
labels = [0] * len(synthetic_reviews_bigram) + [1] * len(actual_reviews)

# Create DataFrame for combined data
data_df = pd.DataFrame({'Review': combined_reviews, 'Label': labels})

# Feature extraction using Bag-of-Words with bigrams
vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform(data_df['Review'])
y = data_df['Label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest model for bigram reviews
rf_model2 = RandomForestClassifier()
rf_model2.fit(X_train, y_train)

# Evaluate the model
predictions = rf_model2.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.66      1.00      0.79        21
           1       1.00      0.42      0.59        19

    accuracy                           0.73        40
   macro avg       0.83      0.71      0.69        40
weighted avg       0.82      0.72      0.70        40



In [24]:
# Display a few classified reviews (adapted for bigram features)
random_indices = np.random.choice(X_test.shape[0], 5, replace=False)
sample_reviews = X_test[random_indices]
actual_labels_sample = y_test.iloc[random_indices]
predicted_labels = rf_model2.predict(sample_reviews)

for i, index in enumerate(random_indices):
    review_features = sample_reviews[i]
    actual_label = actual_labels_sample.iloc[i]
    predicted_label = predicted_labels[i]
    review_text = ' '.join(vectorizer.inverse_transform(review_features)[0])
    print("Review:", review_text)
    print("Actual Label:", "Actual" if actual_label == 1 else "Synthetic")
    print("Predicted Label:", "Actual" if predicted_label == 1 else "Synthetic")
    print("-" * 50)


Review: all the also knows and grumpy and is audiences will bad movie be disappointed believe he but it certainly have disappointed and grew certainly grumpy woman have the he grew himself and his problem hysterical its is also is using it self it would its possibly knows all movie but movie was movie when of it possibly result problem of result is sacrificing himself self sacrificing the bad the worst think that this movie to believe turtle when using his was hysterical when audiences when this will be woman to worst movie would think
Actual Label: Synthetic
Predicted Label: Synthetic
--------------------------------------------------
Review: 1993 shine all characters and direction and have and the and very approach bit aussie blend bit self blend of br br br gosh characters was cinematography and crowe talent depreciating and developing his didn russell direction pace even in felt this film throughout gosh didn have yet his relationship impressed with impressive br in 1993 in develop

In [25]:
# Import necessary libraries
import pandas as pd
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Function to generate synthetic review using trigram model
def generate_synthetic_review_trigram(trigram_counts, num_words=50):
    if not trigram_counts:
        return ""
    
    start_bigram = random.choice(list(trigram_counts.keys()))
    review = list(start_bigram)

    for _ in range(num_words - 2):
        next_words = list(trigram_counts[start_bigram].keys())
        if not next_words:
            break
        next_word_weights = list(trigram_counts[start_bigram].values())
        next_word = random.choices(next_words, weights=next_word_weights)[0]
        review.append(next_word)
        start_bigram = (start_bigram[1], next_word)

    return ' '.join(review)

# Generate synthetic reviews using trigram model
num_synthetic_reviews = 100
synthetic_reviews_trigram = [generate_synthetic_review_trigram(trigram_counts) for _ in range(num_synthetic_reviews)]

# Prepare actual reviews
actual_reviews = reviews_df['Review'].sample(num_synthetic_reviews).tolist()

# Combine and label data
combined_reviews = synthetic_reviews_trigram + actual_reviews
labels = [0] * len(synthetic_reviews_trigram) + [1] * len(actual_reviews)

# Create DataFrame for combined data
data_df = pd.DataFrame({'Review': combined_reviews, 'Label': labels})

# Feature extraction using Bag-of-Words with trigrams
vectorizer = CountVectorizer(ngram_range=(3, 3))
X = vectorizer.fit_transform(data_df['Review'])
y = data_df['Label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest model for trigram reviews
rf_model3 = RandomForestClassifier()
rf_model3.fit(X_train, y_train)

# Evaluate the model
predictions = rf_model3.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.57      1.00      0.72        21
           1       1.00      0.16      0.27        19

    accuracy                           0.60        40
   macro avg       0.78      0.58      0.50        40
weighted avg       0.77      0.60      0.51        40



In [26]:
# Display a few classified reviews (adapted for bigram features)
random_indices = np.random.choice(X_test.shape[0], 5, replace=False)
sample_reviews = X_test[random_indices]
actual_labels_sample = y_test.iloc[random_indices]
predicted_labels = rf_model3.predict(sample_reviews)

for i, index in enumerate(random_indices):
    review_features = sample_reviews[i]
    actual_label = actual_labels_sample.iloc[i]
    predicted_label = predicted_labels[i]
    review_text = ' '.join(vectorizer.inverse_transform(review_features)[0])
    print("Review:", review_text)
    print("Actual Label:", "Actual" if actual_label == 1 else "Synthetic")
    print("Predicted Label:", "Actual" if predicted_label == 1 else "Synthetic")
    print("-" * 50)


Review: and im sorry and starting war but thats not complimentary laptop per down to the effects stock footage fg visual effects footage of them has lived her her whole life heres the plot horrorfan like me im sorry but in real horrorfan jack kennedy in kennedy in real laptop per student life fg visual like me or lived her whole maybe this movie me or has movie and im not complimentary laptop of them were or has lived per student heres plot slows down real horrorfan like rounded up and slows down to sorry but thats starting war maybe stock footage of student heres the thats not complimentary the plot slows them were rounded this movie and up and starting visual effects stock war maybe this were rounded up whole life fg
Actual Label: Synthetic
Predicted Label: Synthetic
--------------------------------------------------
Review: abduct people and an extremely small and gets no and well for bounty hunter james budget pictures fake cast and well commercial stills gallery decide to abduct d