<a href="https://colab.research.google.com/github/AUT-Student/IR-HW1/blob/main/IR_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset

In [1]:
!gdown 1uJYlPDV4-V-hHQSIhlq1ruKBL6SR96Te

Downloading...
From: https://drive.google.com/uc?id=1uJYlPDV4-V-hHQSIhlq1ruKBL6SR96Te
To: /content/IR_HW1_dataset.zip
  0% 0.00/1.76M [00:00<?, ?B/s]100% 1.76M/1.76M [00:00<00:00, 139MB/s]


In [2]:
!unzip /content/IR_HW1_dataset.zip

Archive:  /content/IR_HW1_dataset.zip
   creating: Data/
  inflating: Data/test_data.csv      
  inflating: Data/train_data.csv     
  inflating: Data/valid_data.csv     


In [3]:
import pandas as pd
import numpy as np
import math

In [4]:
train_dataset = pd.read_csv("/content/Data/train_data.csv")
valid_dataset = pd.read_csv("/content/Data/valid_data.csv")
test_dataset  = pd.read_csv("/content/Data/test_data.csv")

# Preprocessing

In [5]:
import string
def remove_punctuation(text):
  output = ""
  for char in text:
    if char not in string.punctuation:
      output += char
  return output

In [6]:
def make_lower_case(text):
  return text.lower()

In [7]:
def preprocess(text):
  text = remove_punctuation(text)
  text = make_lower_case(text)
  return text

# TF-IDF

In [8]:
class TFIDF():
  def __init__(self, top_word_number, stop_word_number, preprocess):
    self.df_dictionary = dict()
    self.top_words = None
    self.document_number= None
    self.top_word_number = top_word_number
    self.stop_word_number = stop_word_number
    self.preprocess = preprocess 

  def process_corpus(self, dataset):
    self.document_number = len(dataset)
    self._calculate_df(dataset)
    self._select_top_words()
    self._calculate_idf()

  def _calculate_df(self, dataset):
    for data in dataset:
      text = self.preprocess(data)
      tokens = set(text.split())

      for token in tokens:
        if token not in self.df_dictionary:
          self.df_dictionary[token] = 0
        
        self.df_dictionary[token] += 1

  def _select_top_words(self):
    df_list = []

    for word in self.df_dictionary:
      df_list.append({"word": word, "df": self.df_dictionary[word]})

    df_list = sorted(df_list, key=lambda x:-x["df"])

    stop_words = set([item["word"] for item in df_list[:self.stop_word_number]])

    self.top_words = set([item["word"] for item in df_list[self.stop_word_number:self.stop_word_number + self.top_word_number]])
  
  def _calculate_idf(self):
    self.idf_dictionary = dict()

    for word in self.top_words:
      self.idf_dictionary[word] = math.log10(self.document_number / self.df_dictionary[word])

    self.word_id_dictionary = dict()

    for i, word in enumerate(self.top_words):
      self.word_id_dictionary[word] = i

  def vector(self, text):
    text = self.preprocess(text)
    words = text.split()

    tfidf_vector = np.zeros(self.top_word_number)

    tf_dictionary = dict()

    for word in words:
      if word not in self.top_words: continue
      if word not in tf_dictionary:
        tf_dictionary[word] = 0
      
      tf_dictionary[word] += 1
    
    for word in tf_dictionary:
      tf = 1 + math.log10(tf_dictionary[word])
      wid = self.word_id_dictionary[word]
      idf = self.idf_dictionary[word]

      tfidf_vector[wid] = tf * idf

    return tfidf_vector

In [9]:
train_dataset_text = []
for i, data in train_dataset.iterrows():
  train_dataset_text.append(data["question1"] + " " + data["question2"])

In [10]:
tfidf = TFIDF(top_word_number=2000, stop_word_number=100, preprocess=preprocess)

In [11]:
tfidf.process_corpus(train_dataset_text)

In [43]:
from heapq import nlargest

class TFIDFRecommender():
  def __init__(self, tfidf, number_recommendation):
    self.tfidf = tfidf
    self.search_space = None
    self.number_recommendation = number_recommendation

  def create_search_space(self, dataset):
    self.search_space = list()
    qid_set = set()
    for data in dataset:
      qid = data["qid"]
      text = data["text"]
      tfidf_vector = self.tfidf.vector(text)

      if qid not in qid_set:
        qid_set.add(qid)
        self.search_space.append({"vector": tfidf_vector, "qid": qid, "text":text})

  def recommend(self, text):
    input_vector = self.tfidf.vector(text)
  
    similar_item_list = list()

    for item in self.search_space:
      similarity = self.cosine_similarity(input_vector, item["vector"])
      
      similar_item_list.append({"similarity": similarity, "qid": item["qid"], "text":item["text"]})

    return nlargest(self.number_recommendation, similar_item_list, key=lambda x:x["similarity"])

  @staticmethod
  def cosine_similarity(vector1, vector2):
    return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

In [44]:
tfidf_recommender = TFIDFRecommender(tfidf=tfidf, number_recommendation=10)

In [45]:
train_dataset_text = []
for i, data in train_dataset.iterrows():
  train_dataset_text.append({"qid": data["qid2"], "text": data["question2"]})

tfidf_recommender.create_search_space(train_dataset_text)

In [87]:
test_dataset.iloc[150]["question1"]

'How can I add a question with picture on Quora?'

In [90]:
test_dataset.iloc[150]

id                                                        66278
qid1                                                      47245
qid2                                                      74521
question1       How can I add a question with picture on Quora?
question2         How do I add pictures to my answers on Quora?
is_duplicate                                                  1
Name: 150, dtype: object

In [88]:
tfidf_recommender.recommend(test_dataset.iloc[150]["question1"])



[{'similarity': 0.8396144551355388,
  'qid': 35737,
  'text': 'How can I add my profile picture on Qoura?'},
 {'similarity': 0.8396144551355388,
  'qid': 48484,
  'text': 'How do you add a profile picture on Quora?'},
 {'similarity': 0.7695866076338181,
  'qid': 53238,
  'text': 'How do I add a profile picture to my Quora account?'},
 {'similarity': 0.7435323915171036,
  'qid': 178548,
  'text': 'How do I add or change my Quora profile picture?'},
 {'similarity': 0.7091057298219843,
  'qid': 34464,
  'text': 'How do I add tags to my question in Quora?'},
 {'similarity': 0.7091057298219843,
  'qid': 92882,
  'text': 'Add questions on quora?'},
 {'similarity': 0.7091057298219843,
  'qid': 47114,
  'text': 'How do I add an image to my question?'},
 {'similarity': 0.7091057298219843,
  'qid': 192621,
  'text': 'How do I add a topic in Quora?'},
 {'similarity': 0.7091057298219843,
  'qid': 95577,
  'text': 'Can I add images to my questions on Quora?'},
 {'similarity': 0.7091057298219843,
  

In [91]:
test_dataset[test_dataset["qid1"]==47245]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
61,25341,47245,31450,How can I add a question with picture on Quora?,How do you add an image to a question or a pos...,1
150,66278,47245,74521,How can I add a question with picture on Quora?,How do I add pictures to my answers on Quora?,1
160,69375,47245,98621,How can I add a question with picture on Quora?,How do I add an image to a question on Quora?,1
240,107520,47245,14735,How can I add a question with picture on Quora?,How do I add photos to my questions in Quora?,1
368,158419,47245,165601,How can I add a question with picture on Quora?,How do I insert a picture in Quora?,1
413,177257,47245,34185,How can I add a question with picture on Quora?,How can I add photos to my question in Quora?,1
513,218278,47245,95577,How can I add a question with picture on Quora?,Can I add images to my questions on Quora?,1
615,257907,47245,31340,How can I add a question with picture on Quora?,How do I add photos to my questions on Quora?,1


# Evaluation

146

In [56]:
test_dataset[test_dataset["qid1"]==4804]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
3,2417,4804,4805,What are some good psychological hacks?,What is the coolest psychological trick?,1
133,58018,4804,101877,What are some good psychological hacks?,What are some Psychology hacks?,1
826,341733,4804,202118,What are some good psychological hacks?,What are the most amazing psychological hacks?,1


In [92]:
class Evaluator():
  def __init__(self, recommender):
    self.recommender = recommender

  def evaluate_p_at_n(self, dataset, n):
    precision = 0
    
    for data in dataset:
      text = data["text"]
      positive_qids = data["positive_qids"]
    
      outputs = self.recommender.recommend(text)

      matched_number = 0

      for i in range(n):
        qid = outputs[i]["qid"]

        if qid in positive_qids:
          matched_number += 1

      precision += matched_number/n

    return precision / len(dataset)

  def evaluate_map(self, dataset):
    sum_ap = 0
    for data in dataset:    
      ap = self._evaluate_ap(data)
      sum_ap += ap

    map = sum_ap / len(dataset)
    return map

  def _evaluate_ap(self, data):
    text = data["text"]
    positive_qids = data["positive_qids"]

    outputs = self.recommender.recommend(text)

    matched_number = 0
    sum_precision = 0
    for i, output in enumerate(outputs):
      if output["qid"] in positive_qids:
        matched_number += 1

        precision = matched_number / (i+1)
        sum_precision += precision
    
    return sum_precision / len(positive_qids)

  def evaluate_mrr(self, dataset):
    sum_rr = 0
    for data in dataset:    
      rr = self._evaluate_rr(data)
      sum_rr += rr

    mrr = sum_rr / len(dataset)
    return mrr

  def _evaluate_rr(self, data):
    text = data["text"]
    positive_qids = data["positive_qids"]

    outputs = self.recommender.recommend(text)

    for i, output in enumerate(outputs):
      if output["qid"] in positive_qids:
        return 1 / (i+1)
    
    return 0

In [62]:
qid_set = set(test_dataset["qid1"].values)

test_dataset_evaluation = []

for qid in qid_set:

  qid_test_dataset = test_dataset[test_dataset["qid1"]==qid]

  text = qid_test_dataset.iloc[0]["question1"]
  positive_qids = set(qid_test_dataset["qid2"].values)

  test_dataset_evaluation.append({"text": text, "positive_qids": positive_qids})

In [93]:
evaluator = Evaluator(tfidf_recommender)

In [65]:
evaluator.evaluate_p_at_n(dataset=test_dataset_evaluation, n=10)



0.31712328767123277

In [66]:
evaluator.evaluate_p_at_n(dataset=test_dataset_evaluation, n=5)



0.4452054794520548

In [85]:
evaluator.evaluate_map(dataset=test_dataset_evaluation)



0.3947572148015441

In [94]:
evaluator.evaluate_mrr(dataset=test_dataset_evaluation)



0.6277370080452273