<a href="https://colab.research.google.com/github/AUT-Student/IR-HW1/blob/main/IR_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the dataset archive (IR_HW1_dataset.zip) from Google Drive by file ID.
!gdown 1uJYlPDV4-V-hHQSIhlq1ruKBL6SR96Te

Downloading...
From: https://drive.google.com/uc?id=1uJYlPDV4-V-hHQSIhlq1ruKBL6SR96Te
To: /content/IR_HW1_dataset.zip
  0% 0.00/1.76M [00:00<?, ?B/s]100% 1.76M/1.76M [00:00<00:00, 154MB/s]


In [None]:
!unzip /content/IR_HW1_dataset.zip

Archive:  /content/IR_HW1_dataset.zip
   creating: Data/
  inflating: Data/test_data.csv      
  inflating: Data/train_data.csv     
  inflating: Data/valid_data.csv     


In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Read the three dataset splits extracted from the archive into DataFrames.
train_dataset, valid_dataset, test_dataset = (
  pd.read_csv(csv_path)
  for csv_path in (
    "/content/Data/train_data.csv",
    "/content/Data/valid_data.csv",
    "/content/Data/test_data.csv",
  )
)

In [3]:
import string
def remove_punctuation(text):
  """Return `text` with every character found in `string.punctuation` removed.

  Only the ASCII punctuation characters listed in `string.punctuation` are
  dropped; all other characters are kept in their original order.
  """
  return "".join(symbol for symbol in text if symbol not in string.punctuation)

In [4]:
def make_lower_case(text):
  """Return a lower-cased copy of `text` (delegates to `text.lower()`)."""
  lowered = text.lower()
  return lowered

In [5]:
def preprocess(text):
  """Normalize `text`: strip punctuation first, then lower-case the result."""
  return make_lower_case(remove_punctuation(text))

In [6]:
class TFIDF():
  """TF-IDF vectorizer over a fixed vocabulary chosen by document frequency.

  Fitting (`process_corpus`) counts in how many documents each token occurs,
  drops the `stop_word_number` most frequent tokens, keeps the next
  `top_word_number` tokens as the vocabulary and computes
  idf = log10(N / df) for each of them. `vector` then maps a text to a dense
  vector whose entries are (1 + log10(tf)) * idf.
  """

  def __init__(self, top_word_number, stop_word_number, preprocess):
    """Store the configuration; no corpus statistics exist until `process_corpus`.

    Args:
      top_word_number: maximum vocabulary size; also the length of every
        vector returned by `vector`.
      stop_word_number: number of highest-document-frequency tokens to skip.
      preprocess: callable applied to each text before whitespace tokenization.
    """
    self.df_dictionary = dict()
    self.idf_dictionary = dict()
    self.word_id_dictionary = dict()
    self.top_words = None
    self.document_number = None
    self.top_word_number = top_word_number
    self.stop_word_number = stop_word_number
    self.preprocess = preprocess

  def process_corpus(self, dataset):
    """Fit document frequencies, vocabulary and IDF weights on `dataset`.

    Args:
      dataset: sized iterable of texts (one entry per document).
    """
    self.document_number = len(dataset)
    # Start from clean counts so that fitting again (e.g. re-running the
    # notebook cell) does not accumulate document frequencies on top of the
    # previous fit while document_number is overwritten.
    self.df_dictionary = dict()
    self._calculate_df(dataset)
    self._select_top_words()
    self._calculate_idf()

  def _calculate_df(self, dataset):
    """Add, for every token, the number of documents of `dataset` containing it."""
    for data in dataset:
      # A set so that each token is counted at most once per document.
      tokens = set(self.preprocess(data).split())

      for token in tokens:
        self.df_dictionary[token] = self.df_dictionary.get(token, 0) + 1

  def _select_top_words(self):
    """Choose the vocabulary: skip the most frequent tokens, keep the next ones."""
    # Ties in df are broken alphabetically; without this the selection at the
    # cut-off boundaries would depend on set/dict iteration order, which
    # varies between interpreter runs because of string hash randomization.
    ranked_words = sorted(
      self.df_dictionary.items(), key=lambda item: (-item[1], item[0])
    )

    first = self.stop_word_number
    last = self.stop_word_number + self.top_word_number

    self.top_words = set(word for word, _ in ranked_words[first:last])

  def _calculate_idf(self):
    """Compute log10(N / df) per vocabulary word and assign vector positions."""
    # Iterate the vocabulary in a deterministic order (df descending, then
    # alphabetical) so each word gets the same vector index on every run.
    ordered_words = sorted(
      self.top_words, key=lambda word: (-self.df_dictionary[word], word)
    )

    self.idf_dictionary = dict()
    self.word_id_dictionary = dict()

    for i, word in enumerate(ordered_words):
      self.idf_dictionary[word] = math.log10(self.document_number / self.df_dictionary[word])
      self.word_id_dictionary[word] = i

  def vector(self, text):
    """Return the TF-IDF vector of `text` as a numpy array.

    The array has length `top_word_number`; positions of vocabulary words
    that do not occur in `text` (and unused positions when the vocabulary is
    smaller than `top_word_number`) stay 0. Tokens outside the vocabulary are
    ignored. Requires `process_corpus` to have been called first.
    """
    words = self.preprocess(text).split()

    tfidf_vector = np.zeros(self.top_word_number)

    # Raw term counts, restricted to vocabulary words.
    tf_dictionary = dict()
    for word in words:
      if word not in self.top_words: continue
      tf_dictionary[word] = tf_dictionary.get(word, 0) + 1

    for word, count in tf_dictionary.items():
      tf = 1 + math.log10(count)  # log-scaled term frequency
      wid = self.word_id_dictionary[word]
      idf = self.idf_dictionary[word]

      tfidf_vector[wid] = tf * idf

    return tfidf_vector

In [7]:
# Build one text per training row by joining the two questions with a space.
# Done column-wise instead of a row-by-row iterrows loop; missing questions
# (NaN) are treated as empty strings so the concatenation cannot fail on a
# float NaN value.
train_dataset_text = (
  train_dataset["question1"].fillna("") + " " + train_dataset["question2"].fillna("")
).tolist()

In [8]:
# Vocabulary settings: skip the 100 words with the highest document frequency
# (treated as stop words), then keep the next 2000 words as vector dimensions.
tfidf = TFIDF(top_word_number=2000, stop_word_number=100, preprocess=preprocess)

In [9]:
# Fit on the training texts: document frequencies, vocabulary selection and IDF weights.
tfidf.process_corpus(train_dataset_text)