# Bag of words models

In this notebook we present models that don't rely on word embeddings. The notebook is composed of two main parts: the first covers retrieval-based methods, while the second is a linear regression model.

**Retrieval-based approaches**

In [None]:
# Mount the drive folder, don't execute in your experiments
# import sys
# from google.colab import drive
# drive.mount('/content/drive/') 
# CWroot = "drive/MyDrive/NLP_CW/"

In [None]:
# Execute only if not connected to the drive folder.
# Keep the trailing separator (as in the Drive root above) so the root can be
# prefixed directly to relative paths such as "data/...".
CWroot = "./"

In [None]:
# Imports and instals
!pip install "nltk==3.4.5"

import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from collections import Counter
import random
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from sklearn.metrics import r2_score

nltk.download('punkt')
nltk.download('wordnet')

In [None]:
# Load data
# Paths are built with os.path.join so they resolve whether or not CWroot
# ends with a path separator.
# NOTE(review): `test_df` is read from dev.csv, not from a test file - confirm
# this is the intended evaluation split.
import os

train_df = pd.read_csv(os.path.join(CWroot, 'data', 'task-1', 'train.csv'))
test_df = pd.read_csv(os.path.join(CWroot, 'data', 'task-1', 'dev.csv'))

In [44]:
"""
Given a sentence with a token of the form <word/>, and a new_word, 
returns the preprocessed list of words with the new_word in place.
"""
def replace(sentence, new_word):
    """
    Builds the edited headline and returns it as a list of preprocessed tokens.

    sentence: headline containing one span tagged as <word/>.
    new_word: replacement text for the tagged span.

    Returns the lower-cased, punctuation-free tokens of the edited headline
    with English stop words removed.
    """
    # Replace the tagged span together with any characters glued to it (e.g.
    # trailing punctuation), so the whole whitespace-delimited token is
    # swapped. Matching the tag itself rather than splitting on spaces also
    # handles tags that contain spaces, such as "<North Korea/>".
    # A function is used as replacement so backslashes in new_word stay literal.
    edited = re.sub(r'\S*<[^<>]*/>\S*', lambda _match: new_word, sentence)

    edited = edited.lower().strip()
    # Drop everything that is neither a word character nor whitespace
    edited = re.sub(r'[^\w\s]', '', edited)

    return [token for token in word_tokenize(edited)
            if token not in ENGLISH_STOP_WORDS]

In [None]:
# Changing format for convenience, train and test are lists of tuples of
# the form (sentence, score), where sentence is the token list returned by
# replace() and score is the 'meanGrade' value of the row.
def _to_samples(df):
    """
    Converts a dataframe with 'original', 'edit' and 'meanGrade' columns
    into a list of (tokens, score) tuples, one per row, in row order.
    """
    # Zip the columns instead of iterating over `df.iloc`, which only works
    # through the legacy __getitem__/IndexError protocol and builds a full
    # row Series for every sample.
    return [
        (replace(original, edit), score)
        for original, edit, score in zip(df['original'], df['edit'], df['meanGrade'])
    ]


train = _to_samples(train_df)
test = _to_samples(test_df)

In [None]:
# Splitting train and dev set (90% / 10%)
# A dedicated, seeded generator makes the shuffle reproducible across runs.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(train)

# Compute the cut once and take both slices from the full shuffled list, so
# dev is the last 10% and shares no sample with train.
split_idx = int(len(train) * 0.9)
train, dev = train[:split_idx], train[split_idx:]

In [None]:
"""
Given two headlines a and b, the cosine similarity between their 
BoW representations is returned
"""
def cosine_similarity(a, b):
    """
    Cosine similarity between the bag-of-words count vectors of two
    token lists.

    a, b: iterables of hashable tokens (e.g. lists of words).

    Returns a float; 0.0 when either input has no tokens, since the cosine
    is undefined for a zero vector and an empty headline shares no word
    with anything.
    """
    # count word occurrences
    a_vals = Counter(a)
    b_vals = Counter(b)

    # vector lengths (Euclidean norms of the count vectors)
    len_a = sum(count * count for count in a_vals.values()) ** 0.5
    len_b = sum(count * count for count in b_vals.values()) ** 0.5

    # An empty token list gives a zero norm; avoid dividing by zero.
    if len_a == 0 or len_b == 0:
        return 0.0

    # Only words present in both inputs contribute to the dot product
    dot = sum(count * b_vals[word] for word, count in a_vals.items() if word in b_vals)

    return dot / (len_a * len_b)

In [None]:
"""
Iterates through the training set for finding the N headlines that are
most similar by using cosine_similarity to the given sentence. 
Computes and returns the mean of the associated mean scores.
"""
def get_score(sentence, N):
    """
    Predicts a grade for `sentence` as the average grade of its N most
    similar retrieval candidates, using cosine_similarity.

    sentence: list of tokens of the query headline.
    N: number of neighbours to average over.

    Candidates are the first 90% of the global `train` list, the same pool
    used by get_score_by_bleu and get_score_by_meteor. The remaining 10% is
    the slice the retrieval experiment evaluates on; including it would let
    every query retrieve itself.

    Only candidates with similarity above -1 are eligible, and ties keep the
    earlier candidate. The sum is always divided by N, so if fewer than N
    candidates exist the missing neighbours count as grade 0.
    """
    cutoff = int(len(train) * 0.9)

    eligible = []
    for candidate, grade in train[:cutoff]:
        similarity = cosine_similarity(sentence, candidate)
        if similarity > -1:
            eligible.append((similarity, grade))

    # Stable descending sort: among equal similarities the earlier candidate
    # comes first. max(N, 0) makes a non-positive N select nothing.
    nearest = sorted(eligible, key=lambda pair: pair[0], reverse=True)[:max(N, 0)]

    total = 0
    for _, grade in reversed(nearest):
        total += grade

    return total / N

In [None]:
"""
Iterates through the training set for finding the N headlines that are
most similar by using BLEU as measure of similarity to the given sentence. 
Computes and returns the mean of the associated mean scores.
"""
def get_score_by_bleu(sentence, N):
    """
    Predicts a grade for `sentence` as the average grade of its N most
    similar retrieval candidates, with BLEU as the similarity measure.

    sentence: hypothesis handed to sentence_bleu (the experiment cell passes
        a list of tokens).
    N: number of neighbours to average over.

    Candidates are the first 90% of the global `train` list; each candidate's
    token list is used as the single BLEU reference. Only candidates scoring
    above -1 are eligible, and ties keep the earlier candidate. The sum is
    always divided by N, so if fewer than N candidates exist the missing
    neighbours count as grade 0.
    """
    cutoff = int(len(train) * 0.9)

    eligible = []
    for candidate, grade in train[:cutoff]:
        similarity = sentence_bleu([candidate], sentence)
        if similarity > -1:
            eligible.append((similarity, grade))

    # Stable descending sort: among equal similarities the earlier candidate
    # comes first. max(N, 0) makes a non-positive N select nothing.
    nearest = sorted(eligible, key=lambda pair: pair[0], reverse=True)[:max(N, 0)]

    # Add the grades up from the least to the most similar neighbour
    total = 0
    for _, grade in reversed(nearest):
        total += grade

    return total / N

In [None]:
"""
Iterates through the training set for finding the N headlines that are
most similar by using METEOR as measure of similarity to the given sentence. 
Computes and returns the mean of the associated mean scores.
"""
def get_score_by_meteor(sentence, N):
    """
    Predicts a grade for `sentence` as the average grade of its N most
    similar retrieval candidates, with METEOR as the similarity measure.

    sentence: hypothesis handed to meteor_score (the experiment cell passes
        the tokens joined into one space-separated string).
    N: number of neighbours to average over.

    Candidates are the first 90% of the global `train` list; each candidate's
    tokens are joined with spaces and used as the single METEOR reference.
    Only candidates scoring above -1 are eligible, and ties keep the earlier
    candidate. The sum is always divided by N, so if fewer than N candidates
    exist the missing neighbours count as grade 0.
    """
    cutoff = int(len(train) * 0.9)

    eligible = []
    for candidate, grade in train[:cutoff]:
        similarity = meteor_score([' '.join(candidate)], sentence)
        if similarity > -1:
            eligible.append((similarity, grade))

    # Stable descending sort: among equal similarities the earlier candidate
    # comes first. max(N, 0) makes a non-positive N select nothing.
    nearest = sorted(eligible, key=lambda pair: pair[0], reverse=True)[:max(N, 0)]

    # Add the grades up from the least to the most similar neighbour
    total = 0
    for _, grade in reversed(nearest):
        total += grade

    return total / N

In [None]:
def rmse(prediction, target):
    """
    Computes the root mean square error between the predictions and the targets.

    prediction: sequence of predicted values.
    target: sequence of true values, same length as `prediction`.
    """
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword is deprecated in scikit-learn 1.4 and removed in later
    # releases, while the plain MSE call works on every version.
    return mean_squared_error(target, prediction) ** 0.5

In [None]:
# Compares the three retrieval similarities for K = 1..5 neighbours.
# Each list receives one RMSE per value of K, in increasing order of K.
rmses_cos = []
rmses_bleu = []
rmses_meteor = []

# Samples the retrieval models are evaluated on: the last 10% of `train`
held_out = train[int(len(train) * 0.9):]


def evaluate_retrieval(score_fn, samples, n_neighbours, as_text=False, log_every=None):
    """
    Returns the RMSE of one retrieval model over `samples`.

    score_fn: function (query, n_neighbours) -> predicted grade.
    samples: list of (tokens, grade) tuples.
    n_neighbours: number of neighbours passed to score_fn.
    as_text: if True the tokens are joined into a space-separated string
        before being passed to score_fn.
    log_every: if set, prints progress every `log_every` samples.
    """
    # Fresh lists on every call, so each RMSE covers exactly one
    # (similarity, K) combination.
    predictions = []
    targets = []

    for i, (sentence, score) in enumerate(samples):
        query = ' '.join(sentence) if as_text else sentence
        predictions.append(score_fn(query, n_neighbours))
        targets.append(score)

        if log_every and i % log_every == 0 and i != 0:
            print(f"Tested {i}/{len(samples)}")

    return rmse(predictions, targets)


for N in range(1, 6):
    print("=" * 50)
    print("Experiments with K=" + str(N))

    # Computing with METEOR (queries are passed as strings)
    rmses_meteor.append(evaluate_retrieval(get_score_by_meteor, held_out, N, as_text=True))
    print("METEOR part completed.")

    # Computing with Cosine Similarity
    rmses_cos.append(evaluate_retrieval(get_score, held_out, N, log_every=500))
    print("Cosine Similarity part completed.")

    # Computing with BLEU
    rmses_bleu.append(evaluate_retrieval(get_score_by_bleu, held_out, N, log_every=500))
    print("BLEU part completed.")

In [None]:
# RMSE of each retrieval similarity as a function of the number of neighbours K
k_values = [1, 2, 3, 4, 5]

plt.plot(k_values, rmses_cos, label="Cosine similarity")
plt.plot(k_values, rmses_bleu, label="BLEU")
plt.plot(k_values, rmses_meteor, label="METEOR")

plt.xticks(k_values)
plt.xlabel("K (number of retrieved headlines)")
plt.ylabel("RMSE")
plt.title("Retrieval-based models")
# The labels above are only drawn once a legend is requested
plt.legend()
plt.show()

**Linear regression with Bag of Words**

In [None]:
#imports
from collections import Counter
from sklearn.linear_model import LinearRegression

In [None]:
# Concatenate the tokens of all training sentences
all_sentences = [word for sentence, _ in train for word in sentence]

# Make counter for all_sentences
vals = Counter(all_sentences)

# Get the list of the 300 most common words; position in this list is the
# feature index of the word
words = [word for word, _ in vals.most_common(300)]

# Word -> feature index, for constant-time lookups
word_to_idx = {word: idx for idx, word in enumerate(words)}


def build_bow(samples):
    """
    Builds the bag-of-words representation of each sample.

    samples: list of (tokens, score) tuples.

    Returns one list of counts per sample, with one entry per word of the
    global `words` vocabulary; tokens outside the vocabulary are ignored.
    """
    bows = []
    for sentence, _ in samples:
        bow = [0] * len(words)
        for word in sentence:
            idx = word_to_idx.get(word)
            if idx is not None:
                bow[idx] += 1
        bows.append(bow)

    return bows


# BoW representations of the training, dev and test sets
words_bow = build_bow(train)
words_bow_dev = build_bow(dev)
words_bow_test = build_bow(test)

In [None]:
def train_linear_regression(features, label):
    """
    Fits a LinearRegression model and returns it.

    features: training inputs passed to fit() (one feature vector per sample).
    label: training targets passed to fit() (one value per sample).

    Prints a message before and after fitting.
    """
    print("Training the linear regression model...")
    regressor = LinearRegression().fit(features, label)
    print('Finished')

    return regressor

In [None]:
# Fit the regressor on the training BoW vectors and their scores
ml_model = train_linear_regression(
    words_bow,
    [score for _, score in train],
)

In [None]:
# Measure performance over dev set: RMSE first, then R^2
res = ml_model.predict(words_bow_dev)
dev_scores = [score for _, score in dev]

# Use the notebook's rmse helper so the metric is computed the same way as
# in the retrieval experiments.
print(rmse(res, dev_scores))
print(r2_score(dev_scores, res))

In [None]:
# Plots the distribution of outputs of this model on the dev set
plt.hist(x=res, bins=10, alpha=0.7, rwidth=0.85, density=True)
plt.xlabel("Predicted mean grade")
plt.ylabel("Density")
plt.title("Linear regression predictions (dev set)")
plt.show()

In [None]:
# Retrain model by using train+dev sets: features and targets of both
# sets are concatenated, train samples first
y_train = [score for _, score in train]
y_dev = [score for _, score in dev]

headlines = words_bow + words_bow_dev
y = y_train + y_dev

ml_model = train_linear_regression(headlines, y)

In [None]:
# Measure performance over test set: RMSE first, then R^2
res = ml_model.predict(words_bow_test)
test_scores = [score for _, score in test]

# Use the notebook's rmse helper so the metric is computed the same way as
# in the retrieval experiments.
print(rmse(res, test_scores))
print(r2_score(test_scores, res))

In [None]:
# Plots the distribution of outputs of this model on the test set
plt.hist(x=res, bins=10, alpha=0.7, rwidth=0.85, density=True)
plt.xlabel("Predicted mean grade")
plt.ylabel("Density")
plt.title("Linear regression predictions (test set)")
plt.show()