In [9]:
import pickle
import numpy as np
import pandas as pd
from collections import Counter
# import ipdb
np.seterr(divide='ignore', invalid='ignore')


def compute_tf_idf(word_list, op_type, V, N, idf):
  """
  Applies the term freq operation on the word list

  Args:
      word_list (token_id, freq): 
      op_type (str): "lan"
      V (int): size of the vocab
      N (int): total no of docs
  Returns:
      final_vector : the vector representing the term
  """
  print("computing tf id")
  
  op_type = op_type.lower()
  final_vector = np.zeros(V)
  idf_vector = np.zeros(V)
  
  # First operation
  if op_type[0] not in "lan":
    return Exception("Invalid operation")
  
  else:
    for idx, freq in word_list:
      final_vector[idx] = freq
          
    if op_type[0] == "l":
      final_vector = np.log(1 + final_vector)
      
    elif op_type[0] == "a":
      final_vector = 0.5 + 0.5 * final_vector / np.max(final_vector)

  # Second operation
  if op_type[1] not in "ntp":
    return Exception("Invalid operation")
  
  else:
    # final_vector = final_vector + 1e-20
    ipdb.set_trace()
    for idx, freq in word_list:
      idf_vector[idx] = idf[idx]

    if op_type[1] == "t":
      idf_vector = np.log(N / idf_vector )
      
    elif op_type[1] == "p":
      idf_vector = np.log(N / idf_vector - 1)

  final_vector = final_vector * idf_vector
  
  # Third operation
  if op_type[2] not in "cn":
    return Exception("Invalid operation")
  
  return final_vector / np.linalg.norm(final_vector) if op_type[2] == "c" else final_vector


def transpose_inv_idx(inv_idx):
  """
  Takes a transpose of the inverted index

  Args:
      inv_idx (dict): token to (doc_id, freq) mapping

  Returns:
      new_idx(dict): doc_id to (token, freq) mapping
      mapping(dict): token to token_id mapping
      idf(dict): token to idf mapping
  """
  
  idf = dict()
  N = len(inv_idx.keys())
  new_idx = dict()
  mapper = dict()
  
  for idx, key in enumerate(sorted(inv_idx.keys())):
      mapper[key] = idx
      idf[key] = np.log(N / len(inv_idx[key]))
      for cord_id, freq in inv_idx[key]:
        if cord_id not in new_idx:
          new_idx[cord_id] = []
        new_idx[cord_id].append((idx, freq))
  
  return new_idx, mapper, idf


def get_query_postings(queries, mapper):
  """
  Get a map from query ids to tokens

  Args:
      queries (DataFrame): The queries dataframe
      mapper (dict): token to token_id mapping

  Returns:
      
      query_vector: The mapping from query_id to tokens
  """
  
  query_vector = dict()

  for idx, query in zip(queries["topic-id"], queries["query"]):
    words = query.split()
    query_vector[idx] = []
    tokens = []

    for word in words:
      if word in mapper:
        tokens.append(mapper[word]) 
            
    query_vector[idx] = list(Counter(tokens).items())
  
  return query_vector


def get_ranks(new_idx, query_vector, V, method, output_file, idf):
  """Returns a dict containing the query id vs the docs

  Args:
      new_idx (dict): doc_id to (token, freq) mapping
      query_vector (dict): query_id to tokens mapping
      V (int): vocab size
  """
  
  doc_method, query_method = method.split('.')
  
  with open(output_file, "w") as f:
    for query_id, query_token_list in query_vector.items():
      scores = []
      f.write(str(query_id) + ",")
      query_vector = compute_tf_idf(query_token_list, query_method, V, len(new_idx.keys()), idf)
      
      for doc_id, doc_token_list in new_idx.items():
        doc_vector = compute_tf_idf(doc_token_list, doc_method, V, len(new_idx.keys()), idf)
        scores.append((doc_id, np.dot(query_vector, doc_vector)))
        
      scores.sort(key=lambda x: x[1], reverse=True) 
      scores = scores[:50]
      for score in scores:
        f.write(str(score[0]) + ",")
      f.write("\n")



ModuleNotFoundError: No module named 'ipdb'

In [10]:

inv_idx_file = "model_queries_10.bin"
query_file = "./Data/queries_10.txt"
configs = {
"lnc.ltc": "Assignment2_10_ranked_list_A.csv",
"lnc.lpc": "Assignment2_10_ranked_list_B.csv",
"anc.apc": "Assignment2_10_ranked_list_C.csv"
}

# Load old inverted index
with open(inv_idx_file, 'rb') as f:
    inv_idx = pickle.load(f)

# Transpose inverted index for optimizing space
new_idx, mapper, idf = transpose_inv_idx(inv_idx)  

# Get queries to tokens mapping 
queries = pd.read_csv(query_file)
query_vector = get_query_postings(queries, mapper)

# Get the ranks and save for different configs
for config, output_file in configs.items():
    get_ranks(new_idx, query_vector, len(mapper), config, output_file, idf)

computing tf id


NameError: name 'ipdb' is not defined

In [2]:
a = 1

In [14]:
import ipdb

ModuleNotFoundError: No module named 'ipdb'

In [15]:
!pip list

Package            Version
------------------ ---------
backcall           0.2.0
click              8.1.3
colorama           0.4.5
decorator          5.1.1
importlib-metadata 4.12.0
ipdb               0.13.9
ipython            7.34.0
jedi               0.18.1
joblib             1.1.0
matplotlib-inline  0.1.6
nltk               3.7
numpy              1.21.6
pandas             1.3.5
parso              0.8.3
pickleshare        0.7.5
pip                22.2.2
prompt-toolkit     3.0.32
Pygments           2.13.0
python-dateutil    2.8.2
pytz               2022.2.1
regex              2022.8.17
setuptools         63.4.3
six                1.16.0
toml               0.10.2
tqdm               4.64.1
traitlets          5.5.0
typing_extensions  4.3.0
wcwidth            0.2.5
wheel              0.37.1
zipp               3.8.1



[notice] A new release of pip available: 22.2.2 -> 22.3.1
[notice] To update, run: E:\Books\IR\project\IR-project\Assn1\venv\Scripts\python.exe -m pip install --upgrade pip


In [13]:
!pip install ipdb




[notice] A new release of pip available: 22.2.2 -> 22.3.1
[notice] To update, run: E:\Books\IR\project\IR-project\Assn1\venv\Scripts\python.exe -m pip install --upgrade pip


In [26]:
import pickle
import glob
import json

In [3]:

inv_idx_file = "model_queries_10.bin"
# Load old inverted index
with open(inv_idx_file, 'rb') as f:
    inv_idx = pickle.load(f)

In [4]:
len(inv_idx.keys())


146080

In [40]:
docs = set()
for k in inv_idx.keys():
    for d in inv_idx[k]:
        docs.add(d[0])



In [41]:
len(docs)

37330

In [16]:
import pandas as pd

In [17]:
df = pd.read_csv('Data/id_mapping.csv')

In [20]:
df.describe()

Unnamed: 0,cord_id,paper_id
count,52471,52471
unique,52442,52471
top,hmqx925r,0001418189999fea7f7cbe3e82703d71c85a6fe5
freq,2,1


In [35]:
def build_raw_doc(data_dir, coord_file):
  """
    Returns a dictionary of raw documents 
    indexed by the cord id
  """
  # Get the cord ids
  id_maps = pd.read_csv(coord_file, index_col='paper_id')

  my_raw_documents = dict()
  counter = 0
  for filename in glob.iglob(f"{data_dir}*.json", recursive = True):
    if(counter%1000 == 0):
      print(f"Iteration {counter} started")
    counter += 1
    with open(filename, 'r') as f:
        data = json.load(f)
      
    # Get the paper id
    paper_id = data['paper_id']
    # Get the associated cord id
    cord_id = id_maps.loc[paper_id].cord_id
    # Create if doesnt exist
    if cord_id not in my_raw_documents:
      my_raw_documents[cord_id] = ""
      
    # Get all the text from the files
    # for line in data['abstract']:
    #   my_raw_documents[cord_id]+=(" "+line['text'])
      
  return my_raw_documents



In [36]:
data_dir = 'Data/_CORD-19/'
coord_file = 'Data/id_mapping.csv'

In [37]:
my_raw_documents = build_raw_doc(data_dir, coord_file)

Iteration 0 started
Iteration 1000 started
Iteration 2000 started
Iteration 3000 started
Iteration 4000 started
Iteration 5000 started
Iteration 6000 started
Iteration 7000 started
Iteration 8000 started
Iteration 9000 started
Iteration 10000 started
Iteration 11000 started
Iteration 12000 started
Iteration 13000 started
Iteration 14000 started
Iteration 15000 started
Iteration 16000 started
Iteration 17000 started
Iteration 18000 started
Iteration 19000 started
Iteration 20000 started
Iteration 21000 started
Iteration 22000 started
Iteration 23000 started
Iteration 24000 started
Iteration 25000 started
Iteration 26000 started
Iteration 27000 started
Iteration 28000 started
Iteration 29000 started
Iteration 30000 started
Iteration 31000 started
Iteration 32000 started
Iteration 33000 started
Iteration 34000 started
Iteration 35000 started
Iteration 36000 started
Iteration 37000 started
Iteration 38000 started
Iteration 39000 started
Iteration 40000 started
Iteration 41000 started
Itera

In [39]:
my_raw_documents

{'k2uydo6j': '',
 'o81b9htu': '',
 's31mbs7a': '',
 '0tqwjdhd': '',
 '9k0bsktb': '',
 '5k0ktboh': '',
 'zugd4rji': '',
 'g24zbcd5': '',
 'h7iykrrl': '',
 'vxpyony9': '',
 '5f25yqzf': '',
 'uijwq8gx': '',
 'oveybgbc': '',
 'q356npb7': '',
 '0dxl6t4p': '',
 'xcii4zlt': '',
 'podcwiyt': '',
 'o684am08': '',
 'ay4wxf88': '',
 'barmkkwx': '',
 'hj77rg85': '',
 '8cwwx7v3': '',
 'y0c39p7a': '',
 'rjh1ahi6': '',
 'gdveadvv': '',
 'dgjypnga': '',
 'bnnw8cpn': '',
 '8sgbf22u': '',
 'tldg8c94': '',
 '3mqs7mj1': '',
 'm5ho8jqp': '',
 '21e00h2d': '',
 'aq94ybqi': '',
 'p5orjg14': '',
 'ya81rc2n': '',
 '1bqpc3x3': '',
 'ky3icm1m': '',
 'mrsmwxvo': '',
 'i8ywbu6u': '',
 'zxln007a': '',
 'lndtq585': '',
 'm2xtdi6u': '',
 'd938zs2e': '',
 '38tajokl': '',
 'rac71o9c': '',
 '3877dfj9': '',
 '05q8aki0': '',
 'vgw8uz83': '',
 '54hh0ppq': '',
 'zu9a0tfq': '',
 'gblx3u2k': '',
 'r0ej6jep': '',
 'ija2pb97': '',
 'oqte7dx2': '',
 'ouafdyyc': '',
 'uhv28ji7': '',
 'fgjpuzm6': '',
 'q14gfwnt': '',
 '8bzu3jiu': '

In [42]:
for d in my_raw_documents.keys():
    if d not in docs:
        print(d)
        break

o81b9htu
