In [None]:
import os
import random
import re
from collections import Counter, defaultdict

import pandas as pd
from nltk.util import ngrams


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset and the results folder under MyDrive are reachable by path.
# NOTE(review): force_remount=True presumably redoes the mount even when
# Drive is already mounted -- confirm against the google.colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:
cd drive/MyDrive/

/content/drive/MyDrive


In [None]:
# Read the AllReviews CSV from Drive, then keep only the rows whose
# titleType is "tvMovie" or "movie"; every other title type is dropped.
reviews_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AllReviews.csv')
reviews_df = reviews_df.loc[reviews_df["titleType"].isin(["tvMovie", "movie"])]

def simple_tokenizer(text):
    """Convert one raw review into a list of lowercase alphabetic tokens.

    Args:
        text: The review text. Any value is accepted; it is coerced with
            ``str()`` before processing.

    Returns:
        list[str]: The whitespace-separated tokens of the cleaned text, each
        made up only of the letters a-z. An empty list is returned when
        ``str(text)`` equals ``'nan'`` (the string form of a missing value).
    """
    text = str(text)
    if text == 'nan':
        return []
    # Replace HTML tags with a space so the words around a tag stay separate.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Drop everything that is neither an ASCII letter nor whitespace.
    # The 4th positional parameter of re.sub is `count`, not `flags`: passing
    # re.I|re.A there capped the number of removals at 258 per review and
    # applied no flags at all. No flags are needed here -- the character class
    # already lists both cases, and keeping \s Unicode-aware means separators
    # such as non-breaking spaces are preserved instead of fusing two words.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    # Tokenize by splitting on runs of whitespace.
    return text.split()

# Force the 'Review' column to strings (a missing value becomes the text
# 'nan', which simple_tokenizer turns into an empty token list), then
# tokenize every review. Change the column name here if the CSV differs.
reviews_df['Review'] = reviews_df['Review'].astype(str)
tokenized_reviews = reviews_df['Review'].apply(simple_tokenizer)

# One corpus-wide token sequence: all reviews concatenated in row order.
flat_token_list = [word for review_tokens in tokenized_reviews for word in review_tokens]

# Build each n-gram list and its frequency table side by side.
# NOTE: the n-grams are taken over the flattened sequence, so some bigrams and
# trigrams span the boundary between two consecutive reviews.
unigrams = flat_token_list
unigram_counts = Counter(unigrams)

bigrams = list(ngrams(flat_token_list, 2))
bigram_counts = Counter(bigrams)

trigrams = list(ngrams(flat_token_list, 3))
trigram_counts = Counter(trigrams)

# Sanity check: function words such as "the" and "a" should dominate.
print('Most common unigrams:', unigram_counts.most_common(5))
print('Most common bigrams:', bigram_counts.most_common(5))
print('Most common trigrams:', trigram_counts.most_common(5))


Most common unigrams: [('the', 1072757), ('a', 526387), ('and', 519940), ('of', 469101), ('to', 431304)]
Most common bigrams: [(('of', 'the'), 124622), (('in', 'the'), 81830), (('this', 'movie'), 53054), (('the', 'film'), 44411), (('is', 'a'), 43293)]
Most common trigrams: [(('one', 'of', 'the'), 15740), (('this', 'movie', 'is'), 8874), (('of', 'the', 'film'), 8617), (('this', 'is', 'a'), 7884), (('a', 'lot', 'of'), 7409)]


In [None]:
# Rebuild bigram_counts as a nested table:
#   first word -> Counter mapping each following word to a value.
bigram_counts = defaultdict(Counter)

# Pass 1: tally how often each word follows each first word.
for first, second in bigrams:
    bigram_counts[first][second] += 1

# Pass 2: divide every row by its total so the values of each inner Counter
# become the conditional probabilities of the next word given the first word.
for followers in bigram_counts.values():
    row_total = float(sum(followers.values()))
    for second in followers:
        followers[second] /= row_total

In [None]:
# Rebuild trigram_counts as a nested table:
#   (word1, word2) context -> Counter mapping each third word to a value.
trigram_counts = defaultdict(Counter)

# Pass 1: tally how often each third word follows each two-word context.
for first, second, third in trigrams:
    trigram_counts[(first, second)][third] += 1

# Pass 2: divide every row by its total so the values of each inner Counter
# become the conditional probabilities of the third word given the context.
for followers in trigram_counts.values():
    row_total = sum(followers.values())
    for third in followers:
        followers[third] /= row_total


In [None]:
# Number of synthetic reviews to generate with each n-gram model; adjust as needed.
num_synthetic_reviews = 5000

# Seed Python's global RNG so the generated reviews are reproducible when the
# notebook is re-run from this cell onwards. Change the seed for a new sample.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

In [None]:
def generate_synthetic_review_unigram(unigram_counts, num_words=50):
    """Generate a review by sampling words independently from a unigram model.

    Args:
        unigram_counts: Mapping of word -> non-negative weight (raw counts or
            probabilities; the weights do not need to sum to 1).
        num_words: Number of words to draw.

    Returns:
        str: The sampled words joined by single spaces. An empty string is
        returned when the model is empty or ``num_words`` is not positive,
        matching the bigram and trigram generators.
    """
    if not unigram_counts or num_words <= 0:
        return ""

    words = list(unigram_counts.keys())
    word_weights = list(unigram_counts.values())
    # A single call with k=num_words builds the cumulative weight table once
    # instead of once per generated word, and consumes the random stream in
    # exactly the same way as num_words separate single-word draws.
    review = random.choices(words, weights=word_weights, k=num_words)
    return ' '.join(review)

# Draw num_synthetic_reviews independent reviews from the unigram model.
synthetic_reviews = [generate_synthetic_review_unigram(unigram_counts) for _ in range(num_synthetic_reviews)]

# One row per review, tagged with the name of the model that produced it.
unigram_reviews = pd.DataFrame({'review': synthetic_reviews, 'model': 'unigram'})

# Create the output folder first: to_csv fails if the directory is missing,
# which would throw away the reviews that were just generated.
os.makedirs('/content/drive/MyDrive/results', exist_ok=True)
unigram_reviews.to_csv('/content/drive/MyDrive/results/unigram_reviews.csv', index=False)

In [None]:
def generate_synthetic_review_bigram(bigram_counts, num_words=50):
    """Generate a review by walking a first-order (bigram) Markov chain.

    Args:
        bigram_counts: Mapping of word -> mapping of next word -> weight
            (probabilities or raw counts).
        num_words: Maximum number of words in the review.

    Returns:
        str: The generated words joined by single spaces. The review is
        shorter than ``num_words`` if the walk reaches a word that has no
        recorded followers. An empty string is returned when the model is
        empty or ``num_words`` is not positive.
    """
    if not bigram_counts or num_words <= 0:
        return ""

    # Start with a random word
    current_word = random.choice(list(bigram_counts.keys()))
    review = [current_word]

    for _ in range(num_words - 1):
        # Look up with .get() rather than []: indexing a defaultdict would
        # insert an empty entry for every dead-end word (growing the model and
        # polluting the pool of start words), and a plain dict would raise
        # KeyError.
        followers = bigram_counts.get(current_word)
        if not followers:
            # Dead end (e.g. a word seen only as the final token of the
            # corpus): stop here instead of sampling from an empty list.
            break
        next_words = list(followers.keys())
        next_word_weights = list(followers.values())
        next_word = random.choices(next_words, weights=next_word_weights)[0]
        review.append(next_word)
        current_word = next_word

    return ' '.join(review)

# Generate synthetic reviews using bigram model
synthetic_reviews_bigram = [generate_synthetic_review_bigram(bigram_counts) for _ in range(num_synthetic_reviews)]

# One row per review, tagged with the name of the model that produced it.
bigram_reviews = pd.DataFrame({'review': synthetic_reviews_bigram, 'model': 'bigram'})

# Create the output folder first: to_csv fails if the directory is missing,
# which would throw away the reviews that were just generated.
os.makedirs('/content/drive/MyDrive/results', exist_ok=True)
bigram_reviews.to_csv('/content/drive/MyDrive/results/bigram_reviews.csv', index=False)

In [None]:
def generate_synthetic_review_trigram(trigram_counts, num_words=50):
    """Generate a review by walking a second-order (trigram) Markov chain.

    Args:
        trigram_counts: Mapping of (word1, word2) -> mapping of third word ->
            weight (probabilities or raw counts).
        num_words: Maximum number of words in the review.

    Returns:
        str: The generated words joined by single spaces. The review is
        shorter than ``num_words`` if the walk reaches a two-word context
        that has no recorded followers. An empty string is returned when the
        model is empty or ``num_words`` is not positive.
    """
    if not trigram_counts or num_words <= 0:
        return ""

    # Start with a random bigram
    context = random.choice(list(trigram_counts.keys()))
    review = list(context)

    for _ in range(num_words - 2):
        # Look up with .get() rather than []: indexing a defaultdict would
        # insert an empty entry for every dead-end context, and those empty
        # entries could later be picked as start bigrams; a plain dict would
        # raise KeyError.
        followers = trigram_counts.get(context)
        if not followers:
            break
        next_words = list(followers.keys())
        next_word_weights = list(followers.values())
        next_word = random.choices(next_words, weights=next_word_weights)[0]
        review.append(next_word)
        # Slide the two-word window forward by one word.
        context = (context[1], next_word)

    # The start bigram alone is two words, so trim when num_words is 1.
    return ' '.join(review[:num_words])

# Generate synthetic reviews using trigram model
synthetic_reviews_trigram = [generate_synthetic_review_trigram(trigram_counts) for _ in range(num_synthetic_reviews)]

# One row per review, tagged with the name of the model that produced it.
trigram_reviews = pd.DataFrame({'review': synthetic_reviews_trigram, 'model': 'trigram'})

# Create the output folder first: to_csv fails if the directory is missing,
# which would throw away the reviews that were just generated.
os.makedirs('/content/drive/MyDrive/results', exist_ok=True)
trigram_reviews.to_csv('/content/drive/MyDrive/results/trigram_reviews.csv', index=False)