In [3]:
!pip install -q google-cloud-storage==1.43.0

[0m

In [38]:
import sys
from collections import Counter, OrderedDict
import itertools
from itertools import islice, count, groupby
import os
import re
from operator import itemgetter
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
from time import time
from timeit import timeit
from pathlib import Path
import pickle
import numpy as np
from google.cloud import storage
from math import sqrt

import hashlib
def _hash(s):
    """Return a short hexadecimal BLAKE2b fingerprint of the string ``s``.

    The string is encoded as UTF-8 and hashed with a 5-byte digest, so the
    result is always 10 hexadecimal characters long.
    """
    payload = bytes(s, 'utf8')
    hasher = hashlib.blake2b(payload, digest_size=5)
    return hasher.hexdigest()

# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be read later on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# Work from the Dataproc home directory. An absolute path keeps this cell
# re-runnable: the relative 'home/dataproc' stops resolving once the working
# directory has already been changed by a previous run.
# NOTE(review): assumes the kernel starts in '/' — confirm on the cluster.
os.chdir('/home/dataproc')

In [5]:
# Where the pickled inverted indexes live.
bucket_name = "bgu-ir-ass3-fab"
project_name = 'ir-ass3-414111'
filename_inverted_text = 'inverted_text_v1.pkl'
filename_inverted_title = 'inverted_title_v1.pkl'

# Positional argument tuples for down_and_load_pickle:
# (filename, dir, bucket_name, project_name).
down_text_args = (filename_inverted_text, 'text', bucket_name, project_name)
down_title_args = (filename_inverted_title, 'title', bucket_name, project_name)

In [6]:
def down_and_load_pickle(filename, dir, bucket_name, project_name):
    """Download a pickled object from Google Cloud Storage and unpickle it.

    Args:
        filename: Name of the pickle file inside ``postings_gcp/<dir>/``.
        dir: Sub-folder of ``postings_gcp`` that holds the file.
        bucket_name: Name of the bucket to read from.
        project_name: Project handed to ``storage.Client``.

    Returns:
        The object obtained by unpickling the downloaded bytes.
    """
    blob_path = f"postings_gcp/{dir}/{filename}"
    client = storage.Client(project=project_name)
    payload = client.bucket(bucket_name).blob(blob_path).download_as_bytes()
    # NOTE(review): unpickling can execute arbitrary code, so the bucket
    # contents must be trusted.
    return pickle.loads(payload)

In [11]:
# Download and unpickle both inverted indexes (text and title) from the bucket.
inverted_text = down_and_load_pickle(*down_text_args)
inverted_title = down_and_load_pickle(*down_title_args)

In [12]:
# Stopwords: NLTK's English list plus the extra words in corpus_stopwords.
corpus_stopwords = [
    "category", "references", "also", "external", "links", "may", "first",
    "see", "history", "people", "one", "two", "part", "thumb", "including",
    "second", "following", "many", "however", "would", "became",
]
english_stopwords = frozenset(stopwords.words('english'))
all_stopwords = english_stopwords | frozenset(corpus_stopwords)

# Tokenizer: a token starts with '#', '@' or a word character and continues
# with 2 to 24 word characters, each optionally preceded by "'" or "-".
RE_WORD = re.compile(r"""[\#\@\w](['\-]?\w){2,24}""", re.UNICODE)

In [13]:
# Tuning constants; k1 and b are read by calculate_bm25_per_tf.
k1 = 1.2
b = 0.5
k3 = 0  # not referenced by any other cell visible in this notebook

In [86]:
# testing:

def retrieve_posting_list_title(query_word: str, bucket_name: str):
    """Read the posting list of ``query_word`` from the title index.

    Delegates to ``inverted_title.read_a_posting_list`` with ``base_dir='.'``
    and returns whatever that call returns.
    """
    posting_list = inverted_title.read_a_posting_list(
        base_dir='.',
        w=query_word,
        bucket_name=bucket_name,
    )
    return posting_list

def retrieve_posting_list_text(query_word: str, bucket_name: str):
    """Read the posting list of ``query_word`` from the text index.

    Delegates to ``inverted_text.read_a_posting_list`` with ``base_dir='.'``
    and returns whatever that call returns.
    """
    return inverted_text.read_a_posting_list(
        base_dir='.',
        w=query_word,
        bucket_name=bucket_name,
    )

def calculate_bm25_per_tf(token, doc_id_tf, inverted):
    """Score one posting of ``token`` with this notebook's BM25-style formula.

    Args:
        token: Term whose entry in ``inverted.idf_bm25`` is used.
        doc_id_tf: ``(doc_id, tf)`` pair taken from the term's posting list.
        inverted: Index object exposing ``doc_len``, ``corpus_size`` and
            ``idf_bm25``.

    Returns:
        ``((k1 + 1) / (B * k1 + tf)) * ((corpus_size + 1) / idf_bm25[token])``
        with ``B = 1 - b + b * doc_len.get(doc_id)``; ``k1`` and ``b`` are the
        module-level constants.
    """
    doc_id, tf = doc_id_tf
    # NOTE(review): doc_len.get() yields None for an unknown doc_id, which
    # makes the multiplication below raise TypeError.
    length_norm = 1 - b + b * inverted.doc_len.get(doc_id)
    saturation = (k1 + 1) / (length_norm * k1 + tf)
    # NOTE(review): tf only appears in the denominator and doc_len is not
    # divided by an average length — presumably doc_len is pre-normalised;
    # confirm against the code that builds the index.
    corpus_factor = (inverted.corpus_size + 1) / inverted.idf_bm25[token]
    return saturation * corpus_factor


def reduce_by_key(postings_lists):
    """Merge several posting lists by summing the values that share a doc_id.

    Args:
        postings_lists: Iterable of iterables of ``(doc_id, value)`` pairs.
            Empty lists are simply ignored and the argument is left untouched.

    Returns:
        A list of ``(doc_id, total)`` tuples sorted by ascending ``doc_id``,
        with one entry per distinct ``doc_id``. Values are added in the order
        of the input lists.

    Unlike a sentinel-based merge, this does not treat a value of 0 as "list
    exhausted" (so zero-valued postings are kept) and it does not require the
    input lists to be sorted by ``doc_id``.
    """
    totals = {}
    for postings in postings_lists:
        for doc_id, value in postings:
            # Start from 0 so the result matches sum() over the per-list values.
            totals[doc_id] = totals.get(doc_id, 0) + value
    return sorted(totals.items(), key=lambda pair: pair[0])


def _score_postings(token_postings, inverted):
    """Convert each token's posting list into a list of ``(doc_id, score)`` pairs."""
    return [
        [
            (doc_id, calculate_bm25_per_tf(token, (doc_id, tf), inverted))
            for doc_id, tf in postings
        ]
        for token, postings in token_postings.items()
    ]


def query(query: str, bucket_name: str):
    """Rank documents for a free-text query using the title and text indexes.

    Steps:
        1. Lower-case the query, tokenize it with ``RE_WORD`` and drop every
           token found in ``all_stopwords``.
        2. Read each token's posting list from the title and the text index.
        3. Score every posting with ``calculate_bm25_per_tf`` and sum the
           scores per document, first within each index and then across both.
        4. Multiply the cubed summed score by
           ``sqrt(pagerank_normalized.get(doc_id, 0.2) + 1)``.

    Args:
        query: The query string.
        bucket_name: Bucket name forwarded to the posting-list readers.

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by descending score.

    Progress markers are printed to stdout at the start of each stage.
    """
    print('begin')
    tokens = [match.group() for match in RE_WORD.finditer(query.lower())]
    tokens = [token for token in tokens if token not in all_stopwords]

    print('pl')
    # Dict keys de-duplicate repeated query tokens.
    token_pl_title = {token: retrieve_posting_list_title(token, bucket_name) for token in tokens}
    token_pl_text = {token: retrieve_posting_list_text(token, bucket_name) for token in tokens}

    print('bm25 title')
    bm25_per_term_doc_title = _score_postings(token_pl_title, inverted_title)

    print('bm25 text')
    bm25_per_term_doc_text = _score_postings(token_pl_text, inverted_text)

    print('reduce title')
    bm25_title = reduce_by_key(bm25_per_term_doc_title)
    print('reduce text')
    bm25_text = reduce_by_key(bm25_per_term_doc_text)
    print('reduce together')
    combined = reduce_by_key([bm25_title, bm25_text])

    # Documents without a pagerank entry fall back to 0.2.
    pagerank = inverted_text.pagerank_normalized
    ranked = [
        (doc_id, sqrt(pagerank.get(doc_id, 0.2) + 1) * (score ** 3))
        for doc_id, score in combined
    ]
    return sorted(ranked, key=lambda pair: pair[1], reverse=True)



In [93]:
# Run the whole pipeline on a sample question; the cell output is the full ranked list of (doc_id, score) pairs.
query(query='Who is the founder of modern psychology?', bucket_name=bucket_name)

begin
pl
bm25 title
bm25 text
reduce title
reduce text
reduce together


[(20563369, 8.822830316357622e+16),
 (22921, 8.518656261016786e+16),
 (4300720, 8.212940842519667e+16),
 (65073296, 7.957071609973725e+16),
 (30791497, 7.404669526876718e+16),
 (17475870, 7.399167826008045e+16),
 (659401, 7.20473041953542e+16),
 (7860932, 7.168646794833302e+16),
 (29816832, 7.116760180253663e+16),
 (1687708, 7.072865541355359e+16),
 (19668784, 7.0392697144327416e+16),
 (4973071, 6.98134958862202e+16),
 (834308, 6.956391464740842e+16),
 (24889823, 6.8381517120251816e+16),
 (12234966, 6.777400343267014e+16),
 (55898994, 6.742666439430358e+16),
 (46333155, 6.726941286740804e+16),
 (1573230, 6.716677702787423e+16),
 (54962507, 6.243202273961056e+16),
 (5235217, 6.004145552977374e+16),
 (11829740, 5.799601195374428e+16),
 (226622, 5.608805460646977e+16),
 (2170251, 5.542920602443141e+16),
 (67213384, 5.3981277292424616e+16),
 (31024376, 5.332230665862094e+16),
 (2590923, 5.240473468546244e+16),
 (41316083, 5.1835712056331464e+16),
 (58458136, 5.167657906892447e+16),
 (60676

In [96]:
# Check whether a Wikipedia page title is known to Wikidata and, if it is, look up its Wikidata entity ID (QID).


import requests

def get_wikipedia_page_entity_id(title):
    """Look up the Wikidata entity ID (QID) linked to an English Wikipedia title.

    Args:
        title: Title of the English Wikipedia page to look up.

    Returns:
        The entity ID as a string (for example ``"Q42"``), or ``None`` when the
        response status is not 200, the response has no ``entities`` field, or
        Wikidata reports the title as missing.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            does not complete within the timeout.
    """
    # API endpoint for Wikidata
    wikidata_api_url = "https://www.wikidata.org/w/api.php"

    # Parameters for the API request
    params = {
        "action": "wbgetentities",
        "sites": "enwiki",
        "titles": title,
        "format": "json"
    }

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(wikidata_api_url, params=params, timeout=10)
    if response.status_code != 200:
        return None

    data = response.json()
    entities = data['entities'] if 'entities' in data else {}
    for entity_id, entity in entities.items():
        # Unknown titles still produce an entry: a placeholder keyed by a
        # negative number (e.g. "-1") that carries a "missing" field. Treating
        # it as a hit would report non-existent pages as existing.
        if 'missing' in entity or entity_id.startswith('-'):
            continue
        return entity_id
    # No real entity was found
    return None

# Example: look up one title and report whether Wikidata knows it.
title = "Gestapo"  # Replace with the title of the Wikipedia page you want to check
entity_id = get_wikipedia_page_entity_id(title)
print(
    f"The Wikipedia page '{title}' exists in Wikidata with entity ID '{entity_id}'."
    if entity_id
    else f"The Wikipedia page '{title}' does not exist in Wikidata."
)


True