<a href="https://colab.research.google.com/github/AUT-Student/IR-HW1/blob/main/IR_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset

In [None]:
!gdown 1uJYlPDV4-V-hHQSIhlq1ruKBL6SR96Te
!unzip /content/IR_HW1_dataset.zip

Downloading...
From: https://drive.google.com/uc?id=1uJYlPDV4-V-hHQSIhlq1ruKBL6SR96Te
To: /content/IR_HW1_dataset.zip
  0% 0.00/1.76M [00:00<?, ?B/s]100% 1.76M/1.76M [00:00<00:00, 143MB/s]
Archive:  /content/IR_HW1_dataset.zip
   creating: Data/
  inflating: Data/test_data.csv      
  inflating: Data/train_data.csv     
  inflating: Data/valid_data.csv     


In [None]:
import pandas as pd
import numpy as np
import math

In [None]:
train_dataset = pd.read_csv("/content/Data/train_data.csv")
valid_dataset = pd.read_csv("/content/Data/valid_data.csv")
test_dataset  = pd.read_csv("/content/Data/test_data.csv")

# Preprocessing

In [None]:
import string
def remove_punctuation(text):
  output = ""
  for char in text:
    if char not in string.punctuation:
      output += char
  return output

In [None]:
def make_lower_case(text):
  return text.lower()

In [None]:
def preprocess(text):
  text = remove_punctuation(text)
  text = make_lower_case(text)
  return text

# TF-IDF

In [None]:
class TFIDF():
  def __init__(self, top_word_number, stop_word_number, preprocess):
    self.df_dictionary = dict()
    self.top_words = None
    self.document_number= None
    self.top_word_number = top_word_number
    self.stop_word_number = stop_word_number
    self.preprocess = preprocess 

  def process_corpus(self, dataset):
    self.document_number = len(dataset)
    self._calculate_df(dataset)
    self._select_top_words()
    self._calculate_idf()

  def _calculate_df(self, dataset):
    for data in dataset:
      text = self.preprocess(data)
      tokens = set(text.split())

      for token in tokens:
        if token not in self.df_dictionary:
          self.df_dictionary[token] = 0
        
        self.df_dictionary[token] += 1

  def _select_top_words(self):
    df_list = []

    for word in self.df_dictionary:
      df_list.append({"word": word, "df": self.df_dictionary[word]})

    df_list = sorted(df_list, key=lambda x:-x["df"])

    stop_words = set([item["word"] for item in df_list[:self.stop_word_number]])

    self.top_words = set([item["word"] for item in df_list[self.stop_word_number:self.stop_word_number + self.top_word_number]])
  
  def _calculate_idf(self):
    self.idf_dictionary = dict()

    for word in self.top_words:
      self.idf_dictionary[word] = math.log10(self.document_number / self.df_dictionary[word])

    self.word_id_dictionary = dict()

    for i, word in enumerate(self.top_words):
      self.word_id_dictionary[word] = i

  def vector(self, text):
    text = self.preprocess(text)
    words = text.split()

    tfidf_vector = np.zeros(self.top_word_number)

    tf_dictionary = dict()

    for word in words:
      if word not in self.top_words: continue
      if word not in tf_dictionary:
        tf_dictionary[word] = 0
      
      tf_dictionary[word] += 1
    
    for word in tf_dictionary:
      tf = 1 + math.log10(tf_dictionary[word])
      wid = self.word_id_dictionary[word]
      idf = self.idf_dictionary[word]

      tfidf_vector[wid] = tf * idf

    return tfidf_vector

In [None]:
from heapq import nlargest

class TFIDFRecommender():
  def __init__(self, tfidf, number_recommendation):
    self.tfidf = tfidf
    self.search_space = None
    self.number_recommendation = number_recommendation

  def create_search_space(self, dataset):
    self.search_space = list()
    qid_set = set()
    for data in dataset:
      qid = data["qid"]
      text = data["text"]
      tfidf_vector = self.tfidf.vector(text)

      if qid not in qid_set:
        qid_set.add(qid)
        self.search_space.append({"vector": tfidf_vector, "qid": qid, "text":text})

  def recommend(self, text):
    input_vector = self.tfidf.vector(text)
  
    similar_document_list = list()

    for document in self.search_space:
      similarity = self.cosine_similarity(input_vector, document["vector"])
      
      similar_document_list.append({"similarity": similarity, "qid": document["qid"], "text":document["text"]})

    return nlargest(self.number_recommendation, similar_document_list, key=lambda x:x["similarity"])

  @staticmethod
  def cosine_similarity(vector1, vector2):
    return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

In [None]:
train_dataset_text = []
for i, data in train_dataset.iterrows():
  train_dataset_text.append(data["question1"] + " " + data["question2"])

In [None]:
tfidf = TFIDF(top_word_number=2000, stop_word_number=100, preprocess=preprocess)

In [None]:
tfidf.process_corpus(train_dataset_text)

In [None]:
tfidf_recommender = TFIDFRecommender(tfidf=tfidf, number_recommendation=10)

In [None]:
train_dataset_tfidf = []
for i, data in train_dataset.iterrows():
  train_dataset_tfidf.append({"qid": data["qid2"], "text": data["question2"]})

tfidf_recommender.create_search_space(train_dataset_tfidf)

# N-gram

In [46]:
class UnigramRecommender():
  def __init__(self, preprocess, number_recommendation, mu=None):
    self.preprocess = preprocess
    self.search_space = None
    self.number_recommendation = number_recommendation
    self.number_tokens = None
    self.fixed_mu = mu

  def calculate_statistics(self, dataset):
    self.cf_dictionary = dict()
    self.search_space = list()

    self.number_tokens = 0

    for data in dataset:
      data_tf_dictionary = dict()
      text = data["text"]
      qid = data["qid"]
      text = self.preprocess(text)
      tokens = text.split()

      for token in tokens:
        self.number_tokens += 1

        if token not in self.cf_dictionary:
          self.cf_dictionary[token] = 0
        self.cf_dictionary[token] += 1

        if token not in data_tf_dictionary:
          data_tf_dictionary[token] = 0
        data_tf_dictionary[token] += 1

      self.search_space.append({"tf_dictionary": data_tf_dictionary,
                                "length":len(tokens), "qid": qid, "text":text})

  def probability(self, document, word, mu):
    return (document["tf_dictionary"].get(word, 0) \
            + mu * self.cf_dictionary.get(word, 1) / self.number_tokens)\
           /(document["length"] + mu)

  def recommend(self, text, mu=None):
    if mu is None:
      mu = self.fixed_mu

    tokens = text.split()
    similar_document_list = list()

    for document in self.search_space:
      document_probability = 0

      for token in tokens:
        document_probability += math.log(self.probability(document=document,
                                         word=token, mu=mu))

      similar_document_list.append({"similarity": document_probability,
                                    "qid": document["qid"], "text":document["text"]})

    return nlargest(self.number_recommendation, similar_document_list, key=lambda x:x["similarity"])

In [47]:
class BigramRecommender():
  def __init__(self, preprocess, number_recommendation, mu=None, lambda_=None):
    self.preprocess = preprocess
    self.search_space = None
    self.number_recommendation = number_recommendation
    self.number_tokens = None
    self.fixed_mu = mu
    self.fixed_lambda = lambda_
    self.unigram_recommender = UnigramRecommender(preprocess=preprocess,
                                                  number_recommendation=number_recommendation,
                                                  mu=mu)
  
  def calculate_statistics(self, dataset):
    self.unigram_recommender.calculate_statistics(dataset)

    self.search_space = list()

    for data in dataset:
      data_tf_dictionary = dict()
      data_ptf_dictionary = dict()
      text = data["text"]
      qid = data["qid"]
      text = self.preprocess(text)
      tokens = text.split()
      tokens = ["START"] + tokens

      for token1, token2 in zip(tokens[:-1], tokens[1:]):
        if token2 not in data_tf_dictionary:
          data_tf_dictionary[token2] = 0
        data_tf_dictionary[token2] += 1

        if token2 not in data_ptf_dictionary:
          data_ptf_dictionary[token2] = dict()

        if token1 not in data_ptf_dictionary[token2]:
          data_ptf_dictionary[token2][token1] = 0
        
        data_ptf_dictionary[token2][token1] += 1

      self.search_space.append({"tf_dictionary": data_tf_dictionary,
                                "ptf_dictionary": data_ptf_dictionary,
                                "length":len(tokens), "qid": qid, "text":text})


  def probability(self, document, word1, word2, mu, lambda_):
    unigram_probability = self.unigram_recommender.probability(document=document, word=word2, mu=mu)

    if document["tf_dictionary"].get(word1) is None:
      bigram_probability = 0
    else:
      bigram_probability = document["ptf_dictionary"].get(word1).get(word2, 0) / document["tf_dictionary"].get(word1)
    
    return lambda_ * bigram_probability + (1 - lambda_) * unigram_probability

  def recommend(self, text, mu=None, lambda_=None):
    if mu is None:
      mu = self.fixed_mu

    if lambda_ is None:
      lambda_ = self.fixed_lambda

    tokens = text.split()
    tokens = ["START"] + tokens

    similar_document_list = list()

    for document in self.search_space:
      document_probability = 0

      for token1, token2 in zip(tokens[:-1], tokens[1:]):
        document_probability += math.log(self.probability(document=document,
                                         word1=token1, word2=token2,
                                         mu=mu, lambda_=lambda_))

      similar_document_list.append({"similarity": document_probability,
                                    "qid": document["qid"], "text":document["text"]})

    return nlargest(self.number_recommendation, similar_document_list, key=lambda x:x["similarity"])

In [49]:
train_dataset_ngram = list()

qid_set = set()

for i, data in train_dataset.iterrows():
  qid = data["qid2"]
  if qid not in qid_set:
    qid_set.add(qid)
    train_dataset_ngram.append({"text": data["question2"], "qid": qid})

In [48]:
unigram_recommender = UnigramRecommender(preprocess=preprocess, number_recommendation=10)

In [50]:
unigram_recommender.calculate_statistics(dataset=train_dataset_ngram)

In [51]:
bigram_recommender = BigramRecommender(preprocess=preprocess, number_recommendation=10, mu=0.5, lambda_=0.5)

In [52]:
bigram_recommender.calculate_statistics(dataset=train_dataset_ngram)

In [53]:
bigram_recommender.recommend(test_dataset[test_dataset["qid1"]==4804]["question1"].values[0])

[{'similarity': -50.09379809933493,
  'qid': 444347,
  'text': 'what are some psychological hacks for better focus'},
 {'similarity': -51.24012951991118,
  'qid': 302717,
  'text': 'what are some psychological hacks that you can do daily'},
 {'similarity': -52.202185420362255,
  'qid': 74355,
  'text': 'what are some psychological tricks to stop caring about what people think'},
 {'similarity': -52.618376516126126,
  'qid': 59404,
  'text': 'what are some good horror movies'},
 {'similarity': -52.618376516126126,
  'qid': 119683,
  'text': 'what are some good short stories'},
 {'similarity': -52.618376516126126,
  'qid': 12346,
  'text': 'what are some good anime movies'},
 {'similarity': -52.618376516126126,
  'qid': 170004,
  'text': 'which are some good anime movies'},
 {'similarity': -53.369355373850155,
  'qid': 73559,
  'text': 'what are some good smartphones under 15k'},
 {'similarity': -53.369355373850155,
  'qid': 229844,
  'text': 'what are some good methods to studying'},
 {

In [None]:
unigram_recommender.recommend(test_dataset[test_dataset["qid1"]==4804]["question1"].values[0], mu=0.5)

[{'similarity': 2.5081252293437256e-20,
  'qid': 444347,
  'text': 'what are some psychological hacks for better focus'},
 {'similarity': 7.058736605789247e-21,
  'qid': 302717,
  'text': 'what are some psychological hacks that you can do daily'},
 {'similarity': 4.089551945334351e-21,
  'qid': 74355,
  'text': 'what are some psychological tricks to stop caring about what people think'},
 {'similarity': 2.468145134018605e-21,
  'qid': 59404,
  'text': 'what are some good horror movies'},
 {'similarity': 2.468145134018605e-21,
  'qid': 119683,
  'text': 'what are some good short stories'},
 {'similarity': 2.468145134018605e-21,
  'qid': 12346,
  'text': 'what are some good anime movies'},
 {'similarity': 2.468145134018605e-21,
  'qid': 170004,
  'text': 'which are some good anime movies'},
 {'similarity': 1.0458833598847482e-21,
  'qid': 73559,
  'text': 'what are some good smartphones under 15k'},
 {'similarity': 1.0458833598847482e-21,
  'qid': 229844,
  'text': 'what are some good me

# Evaluation

In [54]:
class Evaluator():
  def __init__(self, recommender):
    self.recommender = recommender

  def evaluate_p_at_n(self, dataset, n):
    precision = 0
    
    for data in dataset:
      text = data["text"]
      positive_qids = data["positive_qids"]
    
      outputs = self.recommender.recommend(text)

      matched_number = 0

      for i in range(n):
        qid = outputs[i]["qid"]

        if qid in positive_qids:
          matched_number += 1

      precision += matched_number/n

    return precision / len(dataset)

  def evaluate_map(self, dataset):
    sum_ap = 0
    for data in dataset:    
      ap = self._evaluate_ap(data)
      sum_ap += ap

    map = sum_ap / len(dataset)
    return map

  def _evaluate_ap(self, data):
    text = data["text"]
    positive_qids = data["positive_qids"]

    outputs = self.recommender.recommend(text)

    matched_number = 0
    sum_precision = 0
    for i, output in enumerate(outputs):
      if output["qid"] in positive_qids:
        matched_number += 1

        precision = matched_number / (i+1)
        sum_precision += precision
    
    return sum_precision / len(positive_qids)

  def evaluate_mrr(self, dataset):
    sum_rr = 0
    for data in dataset:    
      rr = self._evaluate_rr(data)
      sum_rr += rr

    mrr = sum_rr / len(dataset)
    return mrr

  def _evaluate_rr(self, data):
    text = data["text"]
    positive_qids = data["positive_qids"]

    outputs = self.recommender.recommend(text)

    for i, output in enumerate(outputs):
      if output["qid"] in positive_qids:
        return 1 / (i+1)
    
    return 0
  
  def evaluation_report(self, dataset):
    print("Evaluation Summary:")
    print(f"P@5  = {round(self.evaluate_p_at_n(dataset=dataset, n=5)*100, 2)}%")
    print(f"P@10 = {round(self.evaluate_p_at_n(dataset=dataset, n=10)*100, 2)}%")
    print(f"MAP  = {round(self.evaluate_map(dataset=dataset)*100, 2)}%")
    print(f"MRR  = {round(self.evaluate_mrr(dataset=dataset)*100, 2)}%")

In [55]:
qid_set = set(test_dataset["qid1"].values)

test_dataset_evaluation = []

for qid in qid_set:

  qid_test_dataset = test_dataset[test_dataset["qid1"]==qid]

  text = qid_test_dataset.iloc[0]["question1"]
  positive_qids = set(qid_test_dataset["qid2"].values)

  test_dataset_evaluation.append({"text": text, "positive_qids": positive_qids})

In [None]:
evaluator = Evaluator(tfidf_recommender)

In [56]:
evaluator.evaluation_report(dataset=test_dataset_evaluation)

Evaluation Summary:
P@5  = 35.34
P@10 = 25.89
MAP  = 32.96
MRR  = 63.12


In [None]:
unigram_recommender = UnigramRecommender(preprocess=preprocess, number_recommendation=10, mu=0.5)
unigram_recommender.calculate_statistics(dataset=train_dataset_ngram)
evaluator = Evaluator(unigram_recommender)

In [None]:
evaluator.evaluation_report(dataset=test_dataset_evaluation)

Evaluation Summary:
P@5  = 36.58
P@10 = 25.55
MAP  = 33.45
MRR  = 62.74


In [None]:
unigram_recommender = UnigramRecommender(preprocess=preprocess, number_recommendation=10, mu=0.5)
unigram_recommender.calculate_statistics(dataset=train_dataset_ngram)
evaluator = Evaluator(unigram_recommender)

In [None]:
evaluator.evaluation_report(dataset=test_dataset_evaluation)

Evaluation Summary:
P@5  = 36.58
P@10 = 25.55
MAP  = 33.45
MRR  = 62.74


In [None]:
bigram_recommender = BigramRecommender(preprocess=preprocess, number_recommendation=10, mu=0.5, lambda_=0)
bigram_recommender.calculate_statistics(dataset=train_dataset_ngram)
evaluator = Evaluator(bigram_recommender)

In [None]:
evaluator.evaluation_report(dataset=test_dataset_evaluation)

Evaluation Summary:
P@5  = 36.16
P@10 = 26.03
MAP  = 33.72
MRR  = 62.31


In [None]:
bigram_recommender = BigramRecommender(preprocess=preprocess, number_recommendation=10, mu=0.5, lambda_=1)
bigram_recommender.calculate_statistics(dataset=train_dataset_ngram)
evaluator = Evaluator(bigram_recommender)

In [None]:
evaluator.evaluation_report(dataset=test_dataset_evaluation)

Evaluation Summary:
P@5  = 0.0
P@10 = 0.0
MAP  = 0.0
MRR  = 0.0


In [None]:
bigram_recommender = BigramRecommender(preprocess=preprocess, number_recommendation=10, mu=0.5, lambda_=0.9)
bigram_recommender.calculate_statistics(dataset=train_dataset_ngram)
evaluator = Evaluator(bigram_recommender)

In [None]:
evaluator.evaluation_report(dataset=test_dataset_evaluation)

Evaluation Summary:
P@5  = 35.34
P@10 = 25.89
MAP  = 32.96
MRR  = 63.12
