<a href="https://colab.research.google.com/github/AUT-Student/IR-HW1/blob/main/IR_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset

In [1]:
!gdown 1uJYlPDV4-V-hHQSIhlq1ruKBL6SR96Te

Downloading...
From: https://drive.google.com/uc?id=1uJYlPDV4-V-hHQSIhlq1ruKBL6SR96Te
To: /content/IR_HW1_dataset.zip
  0% 0.00/1.76M [00:00<?, ?B/s]100% 1.76M/1.76M [00:00<00:00, 139MB/s]


In [2]:
!unzip /content/IR_HW1_dataset.zip

Archive:  /content/IR_HW1_dataset.zip
   creating: Data/
  inflating: Data/test_data.csv      
  inflating: Data/train_data.csv     
  inflating: Data/valid_data.csv     


In [3]:
import pandas as pd
import numpy as np
import math

In [4]:
train_dataset = pd.read_csv("/content/Data/train_data.csv")
valid_dataset = pd.read_csv("/content/Data/valid_data.csv")
test_dataset  = pd.read_csv("/content/Data/test_data.csv")

# Preprocessing

In [5]:
import string
def remove_punctuation(text):
  output = ""
  for char in text:
    if char not in string.punctuation:
      output += char
  return output

In [6]:
def make_lower_case(text):
  return text.lower()

In [7]:
def preprocess(text):
  text = remove_punctuation(text)
  text = make_lower_case(text)
  return text

# TF-IDF

In [8]:
class TFIDF():
  def __init__(self, top_word_number, stop_word_number, preprocess):
    self.df_dictionary = dict()
    self.top_words = None
    self.document_number= None
    self.top_word_number = top_word_number
    self.stop_word_number = stop_word_number
    self.preprocess = preprocess 

  def process_corpus(self, dataset):
    self.document_number = len(dataset)
    self._calculate_df(dataset)
    self._select_top_words()
    self._calculate_idf()

  def _calculate_df(self, dataset):
    for data in dataset:
      text = self.preprocess(data)
      tokens = set(text.split())

      for token in tokens:
        if token not in self.df_dictionary:
          self.df_dictionary[token] = 0
        
        self.df_dictionary[token] += 1

  def _select_top_words(self):
    df_list = []

    for word in self.df_dictionary:
      df_list.append({"word": word, "df": self.df_dictionary[word]})

    df_list = sorted(df_list, key=lambda x:-x["df"])

    stop_words = set([item["word"] for item in df_list[:self.stop_word_number]])

    self.top_words = set([item["word"] for item in df_list[self.stop_word_number:self.stop_word_number + self.top_word_number]])
  
  def _calculate_idf(self):
    self.idf_dictionary = dict()

    for word in self.top_words:
      self.idf_dictionary[word] = math.log10(self.document_number / self.df_dictionary[word])

    self.word_id_dictionary = dict()

    for i, word in enumerate(self.top_words):
      self.word_id_dictionary[word] = i

  def vector(self, text):
    text = self.preprocess(text)
    words = text.split()

    tfidf_vector = np.zeros(self.top_word_number)

    tf_dictionary = dict()

    for word in words:
      if word not in self.top_words: continue
      if word not in tf_dictionary:
        tf_dictionary[word] = 0
      
      tf_dictionary[word] += 1
    
    for word in tf_dictionary:
      tf = 1 + math.log10(tf_dictionary[word])
      wid = self.word_id_dictionary[word]
      idf = self.idf_dictionary[word]

      tfidf_vector[wid] = tf * idf

    return tfidf_vector

In [9]:
train_dataset_text = []
for i, data in train_dataset.iterrows():
  train_dataset_text.append(data["question1"] + " " + data["question2"])

In [10]:
tfidf = TFIDF(top_word_number=2000, stop_word_number=100, preprocess=preprocess)

In [11]:
tfidf.process_corpus(train_dataset_text)

In [32]:
from heapq import nlargest

class TFIDFRecommender():
  def __init__(self, tfidf, number_recommendation):
    self.tfidf = tfidf
    self.search_space = None
    self.number_recommendation = number_recommendation

  def create_search_space(self, dataset):
    self.search_space = list()
    for data in dataset:
      qid = data["qid"]
      text = data["text"]
      tfidf_vector = self.tfidf.vector(text)

      self.search_space.append({"vector": tfidf_vector, "qid": qid, "text":text})

  def recommend(self, text):
    input_vector = self.tfidf.vector(text)
  
    similar_item_list = list()

    for item in self.search_space:
      similarity = self.cosine_similarity(input_vector, item["vector"])
      
      similar_item_list.append({"similarity": similarity, "qid": item["qid"], "text":item["text"]})

    return nlargest(self.number_recommendation, similar_item_list, key=lambda x:x["similarity"])

  @staticmethod
  def cosine_similarity(vector1, vector2):
    return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

In [33]:
tfidf_recommender = TFIDFRecommender(tfidf=tfidf, number_recommendation=10)

In [34]:
train_dataset_text = []
for i, data in train_dataset.iterrows():
  train_dataset_text.append({"qid": data["qid2"], "text": data["question2"]})

tfidf_recommender.create_search_space(train_dataset_text)

In [37]:
train_dataset.iloc[100]["question1"]

'Where can I get fantastic value in Sydney for floor tiles?'

In [38]:
tfidf_recommender.recommend(train_dataset.iloc[100]["question1"])



[{'similarity': 0.7063560652001621,
  'qid': 50821,
  'text': 'Where can I get affordable package in Sydney for floor tiles?'},
 {'similarity': 0.6876236091636733,
  'qid': 79955,
  'text': 'Where can I get huge selection of floor tiles in Sydney?'},
 {'similarity': 0.6876236091636733,
  'qid': 79955,
  'text': 'Where can I get huge selection of floor tiles in Sydney?'},
 {'similarity': 0.6876236091636733,
  'qid': 79955,
  'text': 'Where can I get huge selection of floor tiles in Sydney?'},
 {'similarity': 0.6876236091636733,
  'qid': 79955,
  'text': 'Where can I get huge selection of floor tiles in Sydney?'},
 {'similarity': 0.6876236091636733,
  'qid': 79955,
  'text': 'Where can I get huge selection of floor tiles in Sydney?'},
 {'similarity': 0.6876236091636733,
  'qid': 79955,
  'text': 'Where can I get huge selection of floor tiles in Sydney?'},
 {'similarity': 0.6876236091636733,
  'qid': 79955,
  'text': 'Where can I get huge selection of floor tiles in Sydney?'},
 {'similari