# TEAM MAM Inference Notebook

## Depends

1. On the `dis1-preprocess` Kaggle input, which contains our prefitted models.
2. On our GitHub repository, which contains the source code.

## Loading Source

We first load our source code from our GitHub repository and then we install it as a library.

Note: An access token was only needed in the clone URL while our repository was private. The repository is now public, so any such token can be ignored.

In [1]:
!git clone https://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@github.com/madhueb/DIS_project1.git

Cloning into 'DIS_project1'...
remote: Enumerating objects: 1411, done.[K
remote: Counting objects: 100% (96/96), done.[K
remote: Compressing objects: 100% (83/83), done.[K
remote: Total 1411 (delta 38), reused 40 (delta 13), pack-reused 1315 (from 1)[K
Receiving objects: 100% (1411/1411), 271.61 KiB | 7.99 MiB/s, done.
Resolving deltas: 100% (900/900), done.


In [2]:
import os
os.chdir('./DIS_project1')
!pip install camel-tools
!camel_data -i disambig-mle-calima-msa-r13
!pip install -e .

Collecting camel-tools
  Downloading camel_tools-1.5.5-py3-none-any.whl.metadata (10 kB)
Collecting transformers<4.44.0,>=4.0 (from camel-tools)
  Downloading transformers-4.43.4-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting editdistance (from camel-tools)
  Downloading editdistance-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting pyrsistent (from camel-tools)
  Downloading pyrsistent-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting muddler (from camel-tools)
  Downloading muddler-0.1.3-py3-none-any.whl.metadata (7.5 kB)
Collecting camel-kenlm>=2024.5.6 (from camel-tools)
  Downloading camel-kenlm-2024.5.6.zip (556 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing buil

In [3]:
import argparse
import gc
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

## Loading the Data and Prefitted Models

In [4]:
# Run mode: True -> predict on the test queries and write a submission file,
# False -> run on the dev queries and report accuracy.
is_test = True

# Kaggle input / output locations.
query_path = '/kaggle/input/dis-project-1-document-retrieval'
tokens_path = '/kaggle/input/dis1-preprocess/doc_tokens'
models_path = '/kaggle/input/dis1-preprocess/models'
doc_ids_path = '/kaggle/input/dis1-preprocess/ids_dict.json'
out_path = '/kaggle/working/'

# Language codes handled by the pipeline.
LANGS = ["fr", "de", "it", "es", "ar", "ko", "en"]

from src.bm25_tfidf.text_tokenizer import (
    FrenchTokenizer,
    GermanTokenizer,
    ItalianTokenizer,
    SpanishTokenizer,
    ArabicTokenizer,
    KoreanTokenizer,
    EnglishTokenizer,
)

# One tokenizer instance per language code.
tokenizers = {
    "fr": FrenchTokenizer(),
    "de": GermanTokenizer(),
    "it": ItalianTokenizer(),
    "es": SpanishTokenizer(),
    "ar": ArabicTokenizer(),
    "ko": KoreanTokenizer(),
    "en": EnglishTokenizer(),
}

# Read the per-language document ids and turn each list into a NumPy array, so
# that it can later be indexed with the positions returned by the models.
with open(doc_ids_path, "r") as ids_file:
    ids_dict = json.load(ids_file)
ids_dict.update({lang: np.array(ids_dict[lang]) for lang in LANGS})

# 'bm25' or 'tfidf' stop after the corresponding single-model section;
# any other value (here 'bm25_tfidf') runs through to the ensemble.
mode = 'bm25_tfidf'

Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

Depending on the run mode (`is_test`), we load either the dev or the test queries. The models are evaluated only when the dev set is loaded.

In [5]:
# Pick the query file that matches the run mode: test.csv for a submission
# run, dev.csv for an evaluation run.
split_name = 'test' if is_test else 'dev'
queries = pd.read_csv(f'{query_path}/{split_name}.csv')

## BM25

Here we retrieve the top 10 documents for each query with the BM25 model.

In [6]:
# Load the prefitted BM25 index of every language.
# NOTE: pickle.load can execute arbitrary code; only load model files we
# produced ourselves.
bm25s = {}
for lang in LANGS:
    with open(f"{models_path}/bm25_{lang}.pkl", "rb") as model_file:
        bm25s[lang] = pickle.load(model_file)

# Every query starts with an empty result list, filled in language by language.
queries["bm25_docids"] = [[] for _ in range(len(queries))]

# The dev set additionally carries the gold document used for evaluation.
lang_columns = ["query"] if is_test else ["query", "positive_docs"]

for lang in LANGS:
    lang_mask = queries["lang"] == lang
    lang_index = queries.index[lang_mask.to_numpy()]
    queries_lang = queries.loc[lang_mask, lang_columns].reset_index(drop=True)

    tokens = tokenizers[lang].tokenize(queries_lang["query"].tolist())
    bm25_ind = bm25s[lang]
    doc_ids = []
    for tokenized_query in tokens:
        # `match` returns positions into the language's document list;
        # translate them into document ids.
        indices, _ = bm25_ind.match(tokenized_query, k=10)
        doc_ids.append(ids_dict[lang][indices].tolist())

    # Write the results back to the rows of this language only.
    queries.loc[lang_mask, "bm25_docids"] = pd.Series(doc_ids, index=lang_index)

    if not is_test:
        # Fraction of queries whose gold document is among the retrieved ones.
        acc = sum(
            1
            for i, positive_doc in enumerate(queries_lang["positive_docs"])
            if positive_doc in doc_ids[i]
        )
        print(f"Accuracy for {lang} : {acc / len(queries_lang)}")
    gc.collect()

if not is_test:
    acc = sum(
        1
        for positive_doc, retrieved in zip(queries["positive_docs"], queries["bm25_docids"])
        if positive_doc in retrieved
    )
    print(f"Accuracy for all : {acc / len(queries)}")

# In pure BM25 mode the run stops here, after writing the submission when
# running on the test set.
if mode == 'bm25':
    if is_test:
        queries.rename(columns={'bm25_docids': 'docids'}, inplace=True)
        queries = queries[["id", "docids"]]
        queries.to_csv(f"{out_path}/submission.csv", index=False)
        print('submission created')
    exit(0)

# The BM25 indexes are no longer needed; free their memory.
del bm25s
gc.collect()

200it [00:01, 102.51it/s]
200it [00:01, 164.39it/s]
200it [00:01, 169.75it/s]
200it [00:01, 149.32it/s]
200it [00:01, 165.63it/s]
800it [00:03, 228.00it/s]


0

## TF-IDF

Here we retrieve the top 10 documents for each query with the TF-IDF model.

In [7]:
# Load the prefitted TF-IDF model of every language.
# NOTE: pickle.load can execute arbitrary code; only load model files we
# produced ourselves.
tfidfs = {}
for lang in LANGS:
    with open(f"{models_path}/tfidf_{lang}.pkl", "rb") as model_file:
        tfidfs[lang] = pickle.load(model_file)

# Every query starts with an empty result list, filled in language by language.
queries["tfidf_docids"] = [[] for _ in range(len(queries))]

# The dev set additionally carries the gold document used for evaluation.
lang_columns = ["query"] if is_test else ["query", "positive_docs"]

for lang in LANGS:
    lang_mask = queries["lang"] == lang
    lang_index = queries.index[lang_mask.to_numpy()]
    queries_lang = queries.loc[lang_mask, lang_columns].reset_index(drop=True)

    tokens = tokenizers[lang].tokenize(queries_lang["query"].tolist())
    # The model returns, per query, positions into the language's document
    # list; translate them into document ids.
    top_positions = tfidfs[lang].retrieve_top_k(tokens, k=10)
    doc_ids = [ids_dict[lang][positions].tolist() for positions in top_positions]

    # Write the results back to the rows of this language only.
    queries.loc[lang_mask, "tfidf_docids"] = pd.Series(doc_ids, index=lang_index)

    if not is_test:
        # Fraction of queries whose gold document is among the retrieved ones.
        acc = sum(
            1
            for i, positive_doc in enumerate(queries_lang["positive_docs"])
            if positive_doc in doc_ids[i]
        )
        print(f"Accuracy for {lang} : {acc / len(queries_lang)}")
    gc.collect()

if not is_test:
    acc = sum(
        1
        for positive_doc, retrieved in zip(queries["positive_docs"], queries["tfidf_docids"])
        if positive_doc in retrieved
    )
    print(f"Accuracy for all : {acc / len(queries)}")

# In pure TF-IDF mode the run stops here, after writing the submission when
# running on the test set.
if mode == 'tfidf':
    if is_test:
        queries.rename(columns={'tfidf_docids': 'docids'}, inplace=True)
        queries = queries[["id", "docids"]]
        queries.to_csv(f"{out_path}/submission.csv", index=False)
        print('submission created')
    exit(0)

# The TF-IDF models are no longer needed; free their memory.
del tfidfs
gc.collect()

200it [00:02, 96.02it/s] 
200it [00:00, 6830.34it/s]
200it [00:01, 153.12it/s]
200it [00:00, 7325.08it/s]
200it [00:01, 155.45it/s]
200it [00:00, 12905.55it/s]
200it [00:01, 143.14it/s]
200it [00:00, 13488.89it/s]
200it [00:00, 22880.93it/s]
200it [00:01, 156.58it/s]
200it [00:00, 17657.63it/s]
800it [00:03, 228.50it/s]
800it [00:00, 31501.78it/s]


0

## Ensemble Model

Here we produce the final top 10 documents for each query by combining the BM25 and TF-IDF results.

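The two models are combined with a pretuned, per-language contribution `k_b`: the number of top BM25 results that are kept, the remaining slots being filled with TF-IDF results. These values can be re-tuned on the dev set by setting `is_tune = True`.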

In [8]:
k = 10  # number of documents returned per query

is_tune = False  # True: search the best BM25 share (k_b) per language on the dev set

bm25_ind_doc_ids = queries['bm25_docids'].tolist()

tfidf_doc_ids = queries['tfidf_docids'].tolist()


def _merge_rankings(bm25_ids, tfidf_ids, k_b, k):
    """Combine two ranked id lists into a single list of at most `k` unique ids.

    The first `k_b` BM25 results are kept, the remaining slots are filled with
    TF-IDF results not already selected, and any slot still free is filled
    with the BM25 results that follow position `k_b`.

    Unlike an unbounded index loop, the last step stops at the end of the BM25
    list, so a query with fewer than `k` unique candidates yields a shorter
    list instead of raising an IndexError.
    """
    merged = list(bm25_ids[:k_b])
    for candidates in (tfidf_ids, bm25_ids[k_b:]):
        for doc_id in candidates:
            if len(merged) >= k:
                return merged
            if doc_id not in merged:
                merged.append(doc_id)
    return merged


def _report_accuracy(frame):
    """Print the per-language and overall accuracy of `frame["docids"]`.

    Accuracy is the fraction of queries whose `positive_docs` entry is among
    the retrieved `docids`. Returns a dict mapping language -> accuracy.
    """
    per_lang = {}
    for lang in LANGS:
        frame_lang = frame[frame["lang"] == lang]
        hits = sum(
            positive_doc in retrieved
            for positive_doc, retrieved in zip(frame_lang["positive_docs"], frame_lang["docids"])
        )
        per_lang[lang] = hits / len(frame_lang)
        print(f"Accuracy for {lang} : {per_lang[lang]}")
    hits = sum(
        positive_doc in retrieved
        for positive_doc, retrieved in zip(frame["positive_docs"], frame["docids"])
    )
    print(f"Accuracy for all : {hits / len(frame)}")
    return per_lang


if is_tune:
    # Tuning compares the predictions with the gold documents, which are only
    # read from the dev set; fail early with a clear message otherwise.
    if is_test:
        raise ValueError("is_tune requires the dev set: set is_test = False")

    langs_kb = {lang: [] for lang in LANGS}
    for k_b in range(k + 1):
        print(f'dev on k_b {k_b}')
        doc_ids = [
            _merge_rankings(bm25_ids, tfidf_ids, k_b, k)
            for bm25_ids, tfidf_ids in zip(bm25_ind_doc_ids, tfidf_doc_ids)
        ]
        # Pass the index explicitly so that the assignment does not depend on
        # `queries` having a default RangeIndex.
        queries["docids"] = pd.Series(doc_ids, index=queries.index)

        for lang, accuracy in _report_accuracy(queries).items():
            langs_kb[lang].append(accuracy)
        gc.collect()

    # For every language, report the largest k_b reaching the best accuracy.
    for lang in LANGS:
        lang_arr = np.array(langs_kb[lang])
        max_value = np.max(lang_arr)
        ind_ = np.where(lang_arr == max_value)[0]
        print(f'kb amx for {lang}: {ind_[-1]}')

else:
    # Pretuned number of BM25 results kept per language.
    kbs = {'fr': 9,
           'de': 10,
           'it': 10,
           'es': 8,
           'ar': 9,
           'ko': 8,
           'en': 8
           }
    # Read the language column once instead of one `iloc` lookup per query.
    query_langs = queries['lang'].tolist()
    doc_ids = [
        _merge_rankings(bm25_ids, tfidf_ids, kbs[query_lang], k)
        for bm25_ids, tfidf_ids, query_lang in zip(bm25_ind_doc_ids, tfidf_doc_ids, query_langs)
    ]
    queries["docids"] = pd.Series(doc_ids, index=queries.index)
    if is_test:
        queries = queries[["id", "docids"]]
        queries.to_csv(f"{out_path}/submission.csv", index=False)
        print('submission created')
    else:
        _report_accuracy(queries)
        gc.collect()


submission created
