# Search Engine Group 4

## Imports and data loading

In [14]:
FILEPATH = "datascience.stackexchange.com" # path to unzipped data

import pandas as pd
import numpy as np
import pickle as pkl
from math import log10,sqrt
import re
import tkinter as tk
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

def extract_data(filepath):
    return pd.read_xml(filepath, parser="etree", encoding="utf8")

[nltk_data] Downloading package stopwords to /home/himmi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# posts = extract_data(filepath=FILEPATH+"/Posts.xml") # precomputed tokenization in file posts.pkl
users = extract_data(filepath=FILEPATH+"/Users.xml") # usefull for user reputation
comments = extract_data(filepath=FILEPATH+"/Comments.xml") # usefull for user reputation and accepted answer
votes = extract_data(filepath=FILEPATH+"/Votes.xml") # usefull for post score
badges = extract_data(filepath=FILEPATH+"/Badges.xml") # usefull for user reputation
file = "posts.pkl"
with open(file, 'rb') as f:
    posts = pkl.load(f)

print(posts.head())


       Id  PostTypeId             CreationDate  Score  ViewCount  \
0  115535           1  2022-10-24T12:58:24.757      1       29.0   
1  115536           1  2022-10-24T13:45:55.820      0       30.0   
2  115537           1  2022-10-24T13:56:40.603      0       12.0   
3  115538           2  2022-10-24T14:36:39.480      1        NaN   
4  115539           1  2022-10-24T15:22:37.823      0       14.0   

                                                Body  OwnerUserId  \
0  [pa, in, advance, if, this, question, is, so, ...      30838.0   
1  [pi, just, need, to, check, my, understanding,...      87037.0   
2  [pi, am, trying, to, tune, gradient, boost, ca...      64199.0   
3  [p, it, is, correct, if, you, compare, neural,...     119140.0   
4  [pi, starting, to, study, how, to, rank, words...     141937.0   

          LastActivityDate                                              Title  \
0  2022-10-24T12:58:24.757  [information, retrieval, vs, recommendation, s...   
1  2022-10-24T

In [17]:
def get_inverted_index(posts,column): # returns inverted index
    inverted_index = {}
    for i in range(len(posts)):
        post_id = str(posts['Id'][i])
        for word in posts[column][i]:
            if word not in inverted_index:
                inverted_index[word] = {}
            if post_id not in inverted_index[word]:
                inverted_index[word][post_id] = 0
            inverted_index[word][post_id] = inverted_index[word][post_id] + 1
    return inverted_index
  
inverted_index_body = get_inverted_index(posts,"Body")
inverted_index_title = get_inverted_index(posts,"Title")

In [18]:
# remove stopwords
stop_words_nltk = stopwords.words('english') # list of stopwords
stop_words = set(stop_words_nltk + ["p"])
print(stop_words)

def remove_stopwords(inverted_index):
    for word in stop_words:
        if word in inverted_index:
            del inverted_index[word]
    return inverted_index

inverted_index_body = remove_stopwords(inverted_index_body)
inverted_index_title = remove_stopwords(inverted_index_title)

{'doing', 'where', 'this', 'again', 'other', 'didn', 'each', 'yours', 'before', 'mustn', 'about', 'if', 'be', 'nor', 'not', "isn't", 'll', 'up', 'through', 'we', 'her', "didn't", "hasn't", 're', 'is', 'don', 'same', 'all', 'isn', 'these', 'until', 'him', 'its', 'd', 'above', 'some', 've', 'do', 'who', 'into', 'my', 'does', 'have', 'aren', 'myself', 'of', 'them', 'then', 'so', 'they', "it's", 'will', 'haven', 'most', "you're", "you'd", "weren't", 'against', 'between', 'because', 'he', 'shouldn', "that'll", 'it', 'below', "haven't", 'his', 'both', 'any', 'wasn', 'for', 'no', 'doesn', 'can', 'with', 'than', 'o', 'weren', "wouldn't", 'our', 'once', 'are', "aren't", 'while', 'an', 'own', 'when', 'whom', 'very', 'themselves', 'only', 'ma', 'theirs', 'by', 'had', 'or', 'at', 'after', 'that', 'further', 'there', 't', 'yourself', 'over', 'itself', "you'll", "don't", 'she', 'having', 'am', "you've", 'being', 'to', 'why', 'during', "mightn't", 'from', "shouldn't", 'won', 'the', 'your', 'hers', 'n

In [19]:
def get_tf_idf(inverted_index, posts): # returns tf_idf
    tf_idf = {}
    for word in inverted_index:
        tf_idf[word] = {}
        for post_id in inverted_index[word]:
            tf_idf[word][post_id] = inverted_index[word][post_id] * log10(len(posts)/len(inverted_index[word]))
    return tf_idf

tf_idf_body = get_tf_idf(inverted_index_body, posts)
tf_idf_title = get_tf_idf(inverted_index_title, posts)


In [20]:
# posts that contain the word python the most in the body and title
print(sorted(tf_idf_body["python"].items(), key=lambda x: x[1], reverse=True)[0:5])
print(sorted(inverted_index_body["python"].items(), key=lambda x: x[1], reverse=True)[0:5])
print(sorted(tf_idf_title["python"].items(), key=lambda x: x[1], reverse=True)[0:5])
print(sorted(inverted_index_title["python"].items(), key=lambda x: x[1], reverse=True)[0:5])

[('116141', 21.661042056227643), ('339', 21.661042056227643), ('109823', 19.380932366098417), ('76841', 15.96076783090458), ('95007', 15.96076783090458)]
[('116141', 19), ('339', 19), ('109823', 17), ('76841', 14), ('95007', 14)]
[('29542', 3.6589177006247477), ('65063', 3.6589177006247477), ('11404', 3.6589177006247477), ('64403', 3.6589177006247477), ('29057', 3.6589177006247477)]
[('29542', 2), ('65063', 2), ('11404', 2), ('64403', 2), ('29057', 2)]


In [21]:
# stemmer
stemmer = PorterStemmer()

def stem_inverted_index(inverted_index):
    stemmed_inverted_index = {}
    for word in inverted_index:
        stemmed = stemmer.stem(word) # stem word
        if stemmed not in stemmed_inverted_index:
            stemmed_inverted_index[stemmed] = {}
        for post_id in inverted_index[word]:
            if post_id not in stemmed_inverted_index[stemmed]:
                stemmed_inverted_index[stemmed][post_id] = 0
            stemmed_inverted_index[stemmed][post_id] = stemmed_inverted_index[stemmed][post_id] + inverted_index[word][post_id]
    return stemmed_inverted_index

stemmed_inverted_index_body = stem_inverted_index(inverted_index_body)
stemmed_inverted_index_title = stem_inverted_index(inverted_index_title)


In [9]:
print(sorted(inverted_index_body["open"].items(), key=lambda x: x[1], reverse=True)[0:5])
print(sorted(stemmed_inverted_index_body["open"].items(), key=lambda x: x[1], reverse=True)[0:5])

[('115768', 13), ('60039', 11), ('119916', 11), ('20380', 9), ('117097', 8)]
[('76321', 54), ('74666', 25), ('115768', 13), ('60039', 11), ('119916', 11)]


In [23]:
from transformers import BertTokenizer, BertModel


def text_process(query):
    # tokenize query (or text) with bert-case-uncased tokenizer
    # keep only alpha words
    # remove stopwords
    # stem query
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    query = re.sub(r'[^\w\s]', '', query)
    query = tokenizer.tokenize(query)
    query = [word for word in query if word.isalpha()]
    query = [word for word in query if word not in stop_words]
    query = [stemmer.stem(word) for word in query]
    return query

query = "information retrieval"
query = text_process(query)
print(query) # ['error', 'open', 'file', 'python']

['inform', 'retriev']


## Getting relevant metadata from the dataset

In [25]:
def get_tags(post_id):
    taglist = posts[posts["Id"] == post_id]["Tags"]
    if taglist.values[0] is None: return []
    taglist = taglist.values[0].replace("<", "").replace(">", " ").split(" ")
    return [tag.replace("-", " ") for tag in taglist]]

def get_all_tags():
    all_tags = []
    for i in range(len(posts)):
        all_tags = all_tags + get_tags(posts["Id"][i])
    return list(set(all_tags))

# all_tags = get_all_tags()

SyntaxError: unmatched ']' (2066743062.py, line 5)

In [None]:
# print(all_tags)

['', 'regex', 'nosql', 'validation', 'odds', 'rocr-package', 'mlops', 'bayesian', 'rasa-nlu', 'perplexity', 'data-source', 'real-ml-usecase', 'randomized-algorithms', 'sequential-pattern-mining', 'google-prediction-api', 'feature-scaling', 'noise', 'colab', 'marketing', 'survival-analysis', 'python-polars', 'markov', 'bootstraping', 'uncertainty', 'jaccard-coefficient', 'fasttext', 'ibm-watson', 'cross-entropy', 'json', 'state-of-the-art', 'training', 'ai', 'ngrams', 'machine-translation', 'multitask-learning', 'probability-calibration', 'api', 'embeddings', 'web-scraping', 'octave', 'apache-kafka', 'exploratory-factor-analysis', 'normalization', 'mse', 'csv', 'catboost', 'stacked-lstm', 'bigdata', 'vgg16', 'anonymization', 'hyperparameter', 'caret', 'openai-gpt', 'flask', 'methodology', 'visualization', 'gradient', 'elastic-search', 'scoring', 'finite-precision', 'kernel', 'consumerweb', 'knowledge-base', 'nlp', 'bioinformatics', 'spatial-transformer', 'stacking', 'pandas', 'apache-pi

In [26]:
def get_body(posts, post_id):
    # tokenized body of the post
    body = posts[posts["Id"] == post_id]["Body"].values[0]
    return body

def get_title(posts, post_id):
    # tokenized title of the post
    title = posts[posts["Id"] == post_id]["Title"].values[0]
    return title

def get_tags_from_postid(post_id):
    taglist = posts[posts["Id"] == post_id]["Tags"]
    if taglist.values[0] is None: return []
    taglist = taglist.values[0].replace("<", "").replace(">", " ").split(" ")
    # replace "-"" with " "
    for i in range(len(taglist)):
        taglist[i] = text_process(taglist[i].replace("-", " "))
    return taglist[:-1]


# print(get_tags_from_postid(115768))

def get_reputation(post_id):
    # reputation of the user who posted the post
    reputation = users[users["Id"] == int(posts[posts["Id"] == post_id]["OwnerUserId"])]["Reputation"].values[0]
    return reputation

# print(get_reputation(posts, 115768))
def get_inverted_index_tags(posts,tag_to_processed_tag_dict):
    # Inverted index of tags
    inverted_index_tags = {}
    for i in range(len(posts)):
        post_id = posts["Id"][i]
        taglist = [tag_to_processed_tag_dict[tag] for tag in get_tags_from_postid(post_id)]
        taglist = [tag for tag in taglist if tag is not None]
        for tag in taglist:
            if tag not in inverted_index_tags:
                inverted_index_tags[tag] = {}
            if post_id not in inverted_index_tags[tag]:
                inverted_index_tags[tag][post_id] = 0
            inverted_index_tags[tag][post_id] = inverted_index_tags[tag][post_id] + 1


def get_votes(post_id):
    # number of votes of the post
    nb_votes = posts[posts["Id"] == post_id]["Score"].values[0]
    return nb_votes

def get_number_answers(post_id):
    # number of answers of the post
    nb_answers = posts[posts["ParentId"] == post_id].shape[0]
    return nb_answers

def get_badges_user(post_id):
    # number of badges of the user who posted the post
    # use the variable badges and users
    nb_badges = badges[badges["UserId"] == int(posts[posts["Id"] == post_id]["OwnerUserId"])]["Class"].shape[0]
    return nb_badges

def get_answered(post_id):
    # 1 if the post is answered, 0 otherwise
    # use the variable comments
    if comments[comments["PostId"] == int(post_id)].shape[0] > 0:
        return 1
    else:
        return 0

def get_views(post_id):
    # number of views of the post
    # use the variable posts
    nb_views = posts[posts["Id"] == post_id]["ViewCount"].values[0]
    return nb_views

# print(posts.head())

print(get_views(12761))

60888.0


In [None]:
# from tqdm.notebook import tqdm
# tag_to_processed_tag_dict ={}
# for tag in tqdm(all_tags):
#     tag_to_processed_tag_dict[tag] = text_process(tag)
    

  0%|          | 0/680 [00:00<?, ?it/s]

In [13]:
# # save tag_to_processed_tag_dict
# import pickle
# with open('tag_to_processed_tag_dict.pickle', 'wb') as handle:
#     pickle.dump(tag_to_processed_tag_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)$
# load tag_to_processed_tag_dict
import pickle
with open('tag_to_processed_tag_dict.pickle', 'rb') as handle:
    tag_to_processed_tag_dict = pickle.load(handle)

In [27]:
inverted_index_tags = get_inverted_index_tags(posts,tag_to_processed_tag_dict) 

TypeError: unhashable type: 'list'

In [None]:
import pickle
# save inverted index tags as pkl
with open('inverted_index_tags.pkl', 'wb') as f:
    pickle.dump(inverted_index_tags, f)

In [None]:
# load inverted index tags from pkl
with open('inverted_index_tags.pkl', 'rb') as f:
    inverted_index_tags = pickle.load(f)

In [None]:
print(list(inverted_index_tags.keys()))
# print(inverted_index_tags["optimization"])

['recommender-system', 'information-retrieval', 'ai', 'dropout', 'machine-learning', 'r', 'accuracy', 'ensemble-learning', 'metaheuristics', 'machine-learning-model', 'tfidf', 'ranking', 'learning-to-rank', 'classification', 'nlp', 'multiclass-classification', 'reinforcement-learning', 'policy-gradients', 'tableau', 'rstudio', 'binary-classification', 'bigdata', 'apache-hadoop', 'map-reduce', 'deep-learning', 'neural-network', 'convolutional-neural-network', 'mlp', 'python', 'time-series', 'churn', 'random-forest', 'supervised-learning', 'computer-vision', 'language-model', 'openai-gpt', 'deepmind', 'image-classification', 'transformer', 'huggingface', 'pretraining', 'data-mining', 'web-scraping', 'prediction', 'forecasting', 'pandas', 'data-cleaning', 'data', 'preprocessing', 'data-analysis', 'feature-engineering', 'feature-extraction', 'lstm', 'feature-scaling', 'loss-function', 'optimization', 'statistics', 'counts', 'pytorch', 'normalization', 'sql', 'databases', 'mysql', 'dataset'

## Search Engine (We name it : CobraSearch)

Our goal is to use a probabilistic model because it is well suited to an inverted index. The most used probabilistic model is the BM25 model, However, it supposes that all documents have the same prior relevance. In our case, we want to take into account the non textual metadata of the documents. \
An extensive bibliographic overview has led us to use the following model :
Our chosen model is based on the following paper: https://dl.acm.org/doi/10.1561/1500000019 Sections 3.6 and 3.7 \
It is derived from the BM25 model with 3 Streams : title, body and tags, but also taking into account non textual features like the number votes, comments, etc...


In [None]:
def term_frequency(post_id,inverted_index):
    # term frequency of the post
    # use the variable posts
    tf = {}
    for word in inverted_index:
        if str(post_id) in inverted_index[word]:
            tf[word] = inverted_index[word][str(post_id)]/len(posts[posts["Id"] == post_id]["Body"].values[0])
        else:
            tf[word] = 0
    return tf

def post_to_post_feature(post_id):
    post_feature = {}
    post_feature["post_id"] = post_id
    post_feature["tf_body"] = term_frequency(post_id, inverted_index_body)
    post_feature["tf_title"] = term_frequency(post_id, inverted_index_title)
    post_feature["tf_tags"] = term_frequency(post_id, inverted_index_tags)
    post_feature["reputation"] = get_reputation(post_id)
    post_feature["votes"] = get_votes(post_id)
    post_feature["number_answers"] = get_number_answers(post_id)
    post_feature["badges"] = get_badges_user(post_id)
    post_feature["answered"] = get_answered(post_id)
    post_feature["views"] = get_views(post_id)
    return post_feature



def CobraSearch(query):
    query_precessed = text_process(query)
    # hyperparameters
    k1 = 1.2
    b = 0.75
    k3 = 1000

    # Vi, score function of each feature
    lambda_reputation = 1
    w_reputation = 1
    lambda_votes = 1
    w_votes = 1
    lambda_nb_answers = 1
    w_nb_answers = 1
    lambda_badges = 1
    w_badges = 1
    w_answered = 1
    lambda_views = 1
    w_views = 1
    Vi = {}
    for post_id in posts["Id"]:
        post_features = post_to_post_feature(post_id)
        Vi[post_id] = {
            "reputation": w_reputation*log10(post_features["reputation"] + lambda_reputation),
            "votes": w_votes*log10(post_features["votes"] + lambda_votes),
            "number_answers": w_nb_answers*log10(post_features["number_answers"]+lambda_nb_answers),
            "badges":w_badges*log10(post_features["badges"]+lambda_badges),
            "answered":w_answered*post_features["answered"],
            "views":w_views*log10(post_features["badges"]+lambda_views),
        }
    
    
