# Document retrieval

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import xml.etree.ElementTree as ET

import utils.manage_files

In [4]:
# Mount the drive shared folder to load pre-saved files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Download / Load datasets

Run the following cell in order to load the needed datasets from the Drive shared folder.

In [5]:
!cp /content/drive/MyDrive/NLP_project/downloads.zip .
!unzip -q downloads.zip 
!rm downloads.zip

file_path_corpus = "clean_corpus.jsonl"

### The ClueWeb12 corpus
We can download the corpus needed for the task, or simply open it if it was already loaded from Drive.

In [3]:
# Public Zenodo URL of the compressed passage corpus and the local archive name.
url_corpus = "https://zenodo.org/record/6802592/files/touche-task2-passages-version-002.jsonl.gz?download=1"
zip_path_corpus = "corpus.jsonl.gz"

# Keep the file name chosen by the Drive cell when that cell was run, otherwise
# fall back to the default name. The lookup goes through globals() because on a
# fresh kernel where the Drive cell was skipped the name does not exist at all,
# and a plain `file_path_corpus is None` check raises NameError.
file_path_corpus = globals().get("file_path_corpus")
if file_path_corpus is None:
    file_path_corpus = "corpus.jsonl"

# NOTE(review): DownloadFile presumably fetches/unpacks only when the target is
# missing (the saved output reports "already present") — confirm in utils.manage_files.
download_corpus = utils.manage_files.DownloadFile(file_path_corpus, zip_path_corpus, url_corpus)
download_corpus()

'/content/downloads/corpus.jsonl' already present


In [4]:
# Read the corpus as JSON lines (one record per line) into a DataFrame and
# preview its first rows.
corpus_df = pd.read_json(download_corpus.file_name, lines=True)
corpus_df.head()

Unnamed: 0,id,contents,chatNoirUrl
0,clueweb12-0000tw-14-21168___1,"Shuga: Love, Sex, Money MTV Shuga Home Swag Bl...",https://chatnoir.eu/cache?uuid=f338e91e-a3e9-5...
1,clueweb12-0000tw-14-21168___2,We LOVE sending #TeamShuga the exclusives. Ban...,https://chatnoir.eu/cache?uuid=f338e91e-a3e9-5...
2,clueweb12-0000tw-14-21168___3,Now take note.. because you will be seeing a w...,https://chatnoir.eu/cache?uuid=f338e91e-a3e9-5...
3,clueweb12-0000tw-22-19226___1,Sex and love: The modern matchmakers | The Eco...,https://chatnoir.eu/cache?uuid=2bf4b08d-2f65-5...
4,clueweb12-0000tw-22-19226___2,But have they? Feb 11th 2012 | from the print ...,https://chatnoir.eu/cache?uuid=2bf4b08d-2f65-5...


In [5]:
print(f"The corpus has {len(corpus_df)} elements.")

The corpus has 868655 elements.


### Topics, Quality and Relevance datasets

At this point we can download:
- the list of topics to retrieve all the titles;
- the relevance qrels file;
- the quality qrels file.

The last two files contain relevance and quality scores for a list of documents with respect to a given topic.

The structure of a .qrels file is:   
TOPIC 0 DOC_ID SCORE

We can use these data to train our document retrieval model on labeled data from past years.

#### Topics

In [None]:
# Download and parse the xml file of the topics
url_topics_50 = "https://zenodo.org/record/6873559/files/topics-task-2.zip?download=1"
zip_path_topics_50 = "topics-task-2-50.zip"
file_path_topics_50 = "topics-task-2-50"

url_topics_100 = "https://zenodo.org/record/6873565/files/topics-task-2-2021.zip?download=1"
zip_path_topics_100 = "topics-task-2-100.zip"
file_path_topics_100 = "topics-task-2-100"

download_topics_50 = utils.manage_files.DownloadFile(file_path_topics_50, zip_path_topics_50, url_topics_50)
download_topics_50()

download_topics_100 = utils.manage_files.DownloadFile(file_path_topics_100, zip_path_topics_100, url_topics_100)
download_topics_100()

# The two files are parsed with the same logic, in order: first 50 topics, then
# the last 50. One loop replaces the two copy-pasted parsing blocks.
topic_files = (
    f"{download_topics_50.file_name}/topics-task-2.xml",
    f"{download_topics_100.file_name}/topics-task2-51-100.xml",
)

topics = list()
for topics_xml in topic_files:
    mytree = ET.parse(topics_xml)
    myroot = mytree.getroot()
    # "*/title" selects the <title> children of every direct child of the root,
    # in document order (change the tag to collect another field).
    topics.extend(title.text.strip() for title in myroot.findall("*/title"))

'/content/downloads/topics-task-2-50' already present
'/content/downloads/topics-task-2-100' already present


In [None]:
# Show how many topic titles were collected from the two topic files
# (50 titles per file) followed by the titles themselves.
print(f"There are {len(topics)} topics.\n{topics}")

There are 100 topics.
['What is the difference between sex and love?', 'Which is better, a laptop or a desktop?', 'Which is better, Canon or Nikon?', 'What are the best dish detergents?', 'What are the best cities to live in?', 'What is the longest river in the U.S.?', 'Which is healthiest: coffee, green tea or black tea and why?', 'What are the advantages and disadvantages of PHP over Python and vice versa?', 'Why is Linux better than Windows?', 'How to sleep better?', 'Should I buy an LCD TV or a plasma TV?', 'Train or plane? Which is the better choice?', 'What is the highest mountain on Earth?', 'Should one prefer Chinese medicine or Western medicine?', 'What are the best washing machine brands?', 'Should I buy or rent?', 'Do you prefer cats or dogs, and why?', 'What is the better way to grill outdoors: gas or charcoal?', 'Which is better, MAC or PC?', 'What is better: to use a brush or a sponge?', 'Which is better, Linux or Microsoft?', 'Which is better, Pepsi or Coke?', 'What is b

#### Documents relevance for each topic

In [None]:
# Download the relevance qrels of the first 50 topics.
# The qrels are plain files, so no archive path is given and the URL is passed
# by keyword.
url_relevance_50 = "https://zenodo.org/record/6873567/files/touche-task2-2022-relevance.qrels?download=1"
file_path_rel_50 = "relevance-50.qrels"

download_relevance_50 = utils.manage_files.DownloadFile(file_path_rel_50, url=url_relevance_50)
download_relevance_50()

# Download the relevance qrels of the last 50 topics (51-100).
url_relevance_100 = "https://zenodo.org/record/6873565/files/touche-task2-51-100-relevance.qrels?download=1"
file_path_rel_100 = "relevance-100.qrels"

download_relevance_100 = utils.manage_files.DownloadFile(file_path_rel_100, url=url_relevance_100)
download_relevance_100()

'/content/downloads/relevance-50.qrels' already present
'/content/downloads/relevance-100.qrels' already present


In [None]:
# Each .qrels row is "TOPIC 0 DOC_ID SCORE", space separated and without header.
rel_1 = pd.read_csv(download_relevance_50.file_name, index_col=None,
                    names=["topic", "0", "doc_id", "relevance"], sep=" ")
rel_2 = pd.read_csv(download_relevance_100.file_name, index_col=None,
                    names=["topic", "0", "doc_id", "relevance"], sep=" ")

# A judgment is identified by the (topic, doc_id) pair. De-duplicating on
# 'doc_id' alone would silently discard the judgments of every document that was
# assessed for more than one topic. The constant "0" column carries no
# information and is dropped.
relevance_df = pd.concat([rel_1, rel_2], axis=0, ignore_index=True) \
                .drop_duplicates(['topic', 'doc_id']) \
                .reset_index(drop=True) \
                .drop('0', axis=1)

relevance_df.head()

Unnamed: 0,topic,doc_id,relevance
0,12,clueweb12-0002wb-18-34442___2,0
1,12,clueweb12-0004wb-69-30215___112,0
2,12,clueweb12-0004wb-78-20304___1,1
3,12,clueweb12-0004wb-78-20304___11,2
4,12,clueweb12-0008wb-62-05967___1,0


#### Documents quality for each topic

In [None]:
# Download the quality qrels of the first 50 topics.
url_quality_50 = "https://zenodo.org/record/6873567/files/touche-task2-2022-quality.qrels?download=1"
file_path_qual_50 = "quality-50.qrels"

download_quality_50 = utils.manage_files.DownloadFile(file_path_qual_50, url=url_quality_50)
download_quality_50()

# Download the quality qrels of the last 50 topics (51-100).
url_quality_100 = "https://zenodo.org/record/6873565/files/touche-task2-51-100-quality.qrels?download=1"
file_path_qual_100 = "quality-100.qrels"

download_quality_100 = utils.manage_files.DownloadFile(file_path_qual_100, url=url_quality_100)
download_quality_100()

'/content/downloads/quality-50.qrels' already present
'/content/downloads/quality-100.qrels' already present


In [None]:
# Each .qrels row is "TOPIC 0 DOC_ID SCORE", space separated and without header.
qual_1 = pd.read_csv(download_quality_50.file_name, index_col=None,
                    names=["topic", "0", "doc_id", "quality"], sep=" ")
qual_2 = pd.read_csv(download_quality_100.file_name, index_col=None,
                    names=["topic", "0", "doc_id", "quality"], sep=" ")

# A judgment is identified by the (topic, doc_id) pair. De-duplicating on
# 'doc_id' alone would silently discard the judgments of every document that was
# assessed for more than one topic. The constant "0" column carries no
# information and is dropped.
quality_df = pd.concat([qual_1, qual_2], axis=0, ignore_index=True) \
                .drop_duplicates(['topic', 'doc_id']) \
                .reset_index(drop=True) \
                .drop('0', axis=1)

quality_df.head()

Unnamed: 0,topic,doc_id,quality
0,12,clueweb12-0002wb-18-34442___2,2
1,12,clueweb12-0004wb-69-30215___112,2
2,12,clueweb12-0004wb-78-20304___1,2
3,12,clueweb12-0004wb-78-20304___11,2
4,12,clueweb12-0008wb-62-05967___1,0


#### Merge data

Now we want to merge the data in order to have the relevance and quality score in the same dataframe.

In [None]:
# Inner join on the (doc_id, topic) pair: only the rows that have both a
# relevance and a quality score for the same document and topic are kept.
det_df = relevance_df.merge(quality_df, on=['doc_id', 'topic'])
det_df.head()

Unnamed: 0,topic,doc_id,relevance,quality
0,12,clueweb12-0002wb-18-34442___2,0,2
1,12,clueweb12-0004wb-69-30215___112,0,2
2,12,clueweb12-0004wb-78-20304___1,1,2
3,12,clueweb12-0004wb-78-20304___11,2,2
4,12,clueweb12-0008wb-62-05967___1,0,0


In [None]:
def retrieve_by_topic_relevance(topic, relevance, det_df, corp_df):
    """
    Return the ChatNoir URLs of the documents judged with a given relevance for a topic.

    Args:
        topic: topic number to select in the 'topic' column of `det_df`.
        relevance: relevance score to select in the 'relevance' column of `det_df`.
        det_df: DataFrame of judgments with 'topic', 'relevance' and 'doc_id' columns.
        corp_df: corpus DataFrame with 'id' and 'chatNoirUrl' columns.

    Returns:
        A list of URLs, in the order in which the documents appear in `det_df`.
        Judged documents that are not present in the corpus are skipped.
    """
    print(f"Topic: {topic} - Relevance: {relevance}")
    selected = (det_df['topic'] == topic) & (det_df['relevance'] == relevance)
    doc_ids = det_df.loc[selected, 'doc_id']

    # One vectorized pass over the corpus instead of two full scans per document.
    # If an id is repeated in the corpus its first row is used (the previous
    # per-document `.item()` call raised ValueError in that case).
    matches = corp_df.loc[corp_df['id'].isin(doc_ids), ['id', 'chatNoirUrl']] \
                     .drop_duplicates('id')
    url_by_id = dict(zip(matches['id'], matches['chatNoirUrl']))

    return [url_by_id[doc] for doc in doc_ids if doc in url_by_id]

In [None]:
# List of topics taken from qrels files.
np.sort(det_df.topic.unique())

array([  2,   3,   8,   9,  12,  14,  17,  18,  19,  22,  23,  25,  26,
        27,  28,  30,  33,  34,  36,  37,  42,  43,  48,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100])

In this example we retrieve the documents that were judged with relevance score 2 for the topic comparing chiropractic therapy and physical therapy for back pain.

In [None]:
example_topic = 55
retrieve_by_topic_relevance(example_topic, 2, det_df, corpus_df)

Topic: 55 - Relevance: 2


['https://chatnoir.eu/cache?uuid=00c926b3-aea6-5676-a05b-631183bc16d6&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=3ba792cf-65e0-50a7-8d2a-29dfe3450844&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=0cbebae4-a185-5630-9bbf-9df00049ff6d&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=0cbebae4-a185-5630-9bbf-9df00049ff6d&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=0cbebae4-a185-5630-9bbf-9df00049ff6d&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=0cbebae4-a185-5630-9bbf-9df00049ff6d&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=d09e9ab3-a8b4-5800-b926-52ae7e625e4c&index=cw12&raw&plain']

In [None]:
topics[example_topic-1]

'What is better for back pain, chiropractic therapy or physical therapy?'

## Documents pre-processing

It is important to pre-process the documents in order to better capture the similarities between them and to remove noise.

In [None]:
!pip install contractions -q 
!pip install spacy -q

In [36]:
import contractions
import spacy

import string
# To enable progress bar in apply function
from tqdm.notebook import tqdm
tqdm.pandas()

# Load the small English spaCy pipeline. The 'ner' and 'parser' components are
# disabled: the cleaning step below only reads token lemmas and stop-word flags.
nlp = spacy.load("en_core_web_sm", disable=['ner','parser'])
# Raise the maximum text length (in characters) accepted by the pipeline so that
# long documents are not rejected.
nlp.max_length=5000000

### Clean text

In the following cells we decided to clean the documents in this way:
1. Make the documents lowercase.
2. Expand contractions.
3. Remove words with numbers inside.
4. Replace \n, characters that are not in the english alphabet and punctuation with a space.
5. Remove adjacent spaces.
6. Remove URLs and stopwords.
7. Perform lemmatization.

In [44]:
# Clean the documents performing pre-processing
def clean_documents(text, nlp):
    """
    Normalize a document: lowercase it, expand contractions, strip URLs,
    punctuation, words containing digits and non a-z characters, then drop the
    stop words and lemmatize what is left.

    Args:
        text: raw document text.
        nlp: callable (a spaCy pipeline) that turns a string into tokens exposing
            `lemma_` and `is_stop`.

    Returns:
        The lemmas of the non-stop-word tokens joined by single spaces.
    """
    clean = text.lower()
    clean = contractions.fix(clean)
    # URLs have to be removed while ':' and '/' are still in the text: once the
    # punctuation is replaced by spaces the pattern below can never match and the
    # URL fragments ("https", "www", "com", ...) would be kept as words.
    clean = re.sub(r"https?:/*\S+", "", clean)
    clean = clean.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
    # Raw strings avoid the invalid-escape warnings raised by '\w' and '\d'
    clean = re.sub(r'\w*\d\w*', '', clean)
    clean = re.sub(r'\n', ' ', clean)
    clean = re.sub(r'[^a-z]', ' ', clean)
    clean = re.sub(r' +', ' ', clean)

    clean = ' '.join(token.lemma_ for token in nlp(clean) if not token.is_stop)
    return clean

In [29]:
# Clean every document, showing a progress bar.
# NOTE(review): the raw 'contents' column is overwritten in place, so the
# original text can only be recovered by reloading the corpus.
corpus_df['contents'] = corpus_df['contents'].progress_apply(clean_documents, nlp=nlp)

  0%|          | 0/868655 [00:00<?, ?it/s]

  0%|          | 0/868655 [00:00<?, ?it/s]

  0%|          | 0/868655 [00:00<?, ?it/s]

## Indexes creation

Given that we are not able to get an API key for the ChatNoir system, we have to create our own index for ranking the documents with respect to a certain query.

We decided to use the pyserini library; since it doesn't support the creation of a dense index for a custom dataset, we also used the autofaiss library.

In [None]:
!pip install -q pyserini
!pip install -q faiss-cpu==1.7.2

In [7]:
#@title ###The class for a sparse or dense index
#@markdown In this cell there is the class to represent a dense or sparse index
#@markdown and the relative function to make a search on it given a query.
import shutil

import faiss
from pyserini.search import LuceneSearcher, TctColBertQueryEncoder

class DocumentsIndex:
    '''
        Wrapper around a pre-built index of the corpus.

        A 'sparse' index is opened with pyserini's LuceneSearcher, a 'dense'
        index is read with faiss and queried through a TctColBertQueryEncoder.
    '''
    # Archive formats (without the leading dot) that are unpacked before loading
    allowed_zip_format = ['zip', 'tar', 'tar.gz']

    def __init__(self, index_path: str, index_type: str) -> None:
        '''
            It validates the path and loads the index.

            index_path: path of the index (or of an archive that contains it).
            index_type: 'sparse' or 'dense'; any other value falls back to 'sparse'.

            Raises FileNotFoundError if nothing exists at index_path (previously
            the loading step was attempted anyway and failed with AttributeError).
        '''
        if not self.__path_analysis(index_path):
            raise FileNotFoundError(f"the index doesn't exist at the following path: {index_path}")

        if index_type not in ['dense', 'sparse']:
            print(f"WARNING: unknown index type '{index_type}', the 'sparse' type will be used.")
            index_type = 'sparse'

        self.index_path = index_path
        self.index_type = index_type
        self.index = None

        # Only for the dense index
        if self.index_type == 'dense':
            self.enc_name = "castorini/tct_colbert-v2-hnp-msmarco"
            self.encoder = None

        self.__load_index()


    def __path_analysis(self, path: str):
        '''
            It checks if the file exists and if it is a zip it unpacks it.
            It returns False if an error occurs, otherwise True.
        '''
        if not os.path.exists(path):
            print(f"ERROR: the index doesn't exist at the following path: {path}")
            return False

        # os.path.splitext returns '.zip' (with the dot) and only '.gz' for
        # 'x.tar.gz', so it never matched allowed_zip_format: compare suffixes.
        archive_suffixes = tuple(f".{fmt}" for fmt in self.allowed_zip_format)
        if path.lower().endswith(archive_suffixes):
            print("WARNING: the input path is of a compressed file, now it will be unpacked.")
            # NOTE(review): the archive is unpacked in the working directory but
            # index_path keeps pointing at the archive; the layout of the archive
            # is not known here, so the caller should pass the unpacked path.
            shutil.unpack_archive(path)

        return True


    def __load_index(self):
        '''
            It actually loads the index into a variable.
            For the dense index it also loads the query encoder.
        '''
        print(f"Loading the {self.index_type} index file ...")
        if self.index_type == "sparse":
            self.index = LuceneSearcher(self.index_path)
        else:
            self.index = faiss.read_index(self.index_path)
            print(f"Loading the encoder {self.enc_name} ...")
            self.encoder = TctColBertQueryEncoder(self.enc_name)

        print("\nThe process is finished correctly!\n")


    def search(self, query: str, k: int=10, verbose=False):
        '''
            It searches the top-k results for the query.

            For the sparse index it returns a list of (docid, score) tuples.
            For the dense index it returns a list of (index, distance) tuples
            taken from the faiss search (presumably the positions of the vectors
            in the order in which they were indexed - to be confirmed).
            If verbose is True the results are also printed.
        '''
        if self.index_type == 'sparse':
            hits = self.index.search(query, k=k)
            hits = [(hit.docid, hit.score) for hit in hits]
            if verbose:
                self.__print_results(hits)
            return hits
        else:
            query_enc = self.encoder.encode(query)
            # faiss expects a batch of queries: add the batch dimension
            query_enc = np.expand_dims(query_enc, axis=0)
            distances, indices = self.index.search(query_enc, k)
            res = list(zip(indices[0], distances[0]))
            if verbose:
                self.__print_results(res)
            return res

    def __print_results(self, results):
        '''
            It prints one line per result: rank, identifier and score.
        '''
        for i, (ind, score) in enumerate(results):
            print(f'{i+1:2} {ind:4} {score:.4f}')

In [14]:
#@title #### Create a corpus without chatNoirUrl for sparse index (execute only if you don't have pre-saved indexes)
#@markdown In this cell you save a jsonl file without the 'chatNoirUrl' attribute that is not required for creating the index.
# Save the jsonl file for creating the index.
# makedirs(exist_ok=True) replaces `!mkdir collections`, which fails when the
# directory already exists (e.g. when the cell is run a second time).
os.makedirs('collections', exist_ok=True)
corpus_df.drop('chatNoirUrl', axis=1).to_json('collections/corpus_index.jsonl', orient="records", lines=True)

Load the pre-saved indexes from the Drive folder. For the sparse index you will find a directory 'sparse_index', for the dense one you will find a file named 'knn.index'.

In [8]:
# Load the indexes from Drive
!cp /content/drive/MyDrive/NLP_project/indexes.tar.gz .
!tar -xvf indexes.tar.gz
!rm indexes.zip

In [10]:
# Retrieve the urls from the corpus given the results of the search on the index
def retrieve_urls(corpus, hits, mode="sparse"):
    """
    Map the results of a search on an index to the ChatNoir URLs of the documents.

    Args:
        corpus: corpus DataFrame with 'id' and 'chatNoirUrl' columns.
        hits: search results as (identifier, score) pairs. In 'sparse' mode the
            identifier is a document id; in any other mode it is the row
            position of the document in `corpus`.
        mode: 'sparse' to look documents up by id, anything else to look them up
            by position.

    Returns:
        The URLs in the order of `hits`: a list in 'sparse' mode, a numpy array
        otherwise (both kept as in the previous implementation).

    Raises:
        ValueError: in 'sparse' mode, if a hit id is not present in `corpus`.
    """
    if mode=='sparse':
        hit_ids = [hit[0] for hit in hits]
        # Use the `corpus` argument (the global `corpus_df` was read before) and
        # scan it once for all the hits instead of once per hit.
        matches = corpus.loc[corpus['id'].isin(hit_ids), ['id', 'chatNoirUrl']] \
                        .drop_duplicates('id')
        url_by_id = dict(zip(matches['id'], matches['chatNoirUrl']))
        missing = [doc_id for doc_id in hit_ids if doc_id not in url_by_id]
        if missing:
            raise ValueError(f"documents not found in the corpus: {missing}")
        urls = [url_by_id[doc_id] for doc_id in hit_ids]
    else:
        indices = [ind[0] for ind in hits]
        urls = corpus.iloc[indices]['chatNoirUrl'].values
    return urls

### Sparse index 

The sparse index uses the BM25 score to rank the documents with respect to a given query.

The following cell shows the command that we executed to create the index the first time.

In [None]:
#@title #### Create the sparse index
#@markdown Execute this cell to create a sparse index if not pre-saved, the .jsonl corpus in this case needs to be inside 'collections' directory. 
!python -m pyserini.index.lucene --collection JsonCollection --input collections/ --index clean_indexes/sparse_index --bm25.accurate --generator DefaultLuceneDocumentGenerator --threads 2 --storePositions --storeDocvectors --storeRaw

In order to retrieve the documents from the index we can use the LuceneSearcher from pyserini and ask for the top-k documents, considering the BM25 w.r.t. a query.

In [46]:
sparse_index = DocumentsIndex("indexes/sparse_index", "sparse")

sparse_results = sparse_index.search("Coke or Pepsi?", k=10, verbose=True)

Loading the sparse index file ...

The process is finished correctly!

 1 clueweb12-1301wb-59-03888___1 11.5156
 2 clueweb12-1412wb-02-24400___2 11.2515
 3 clueweb12-1802wb-28-00297___4 11.1894
 4 clueweb12-0304wb-70-29640___3 11.1401
 5 clueweb12-1000tw-98-11983___1 11.1026
 6 clueweb12-1800tw-33-09364___3 11.0931
 7 clueweb12-0406wb-48-23319___2 11.0624
 8 clueweb12-1308wb-96-22768___3 11.0054
 9 clueweb12-1108wb-45-26706___72 11.0006
10 clueweb12-0800tw-75-14494___6 10.9743


In [42]:
retrieve_urls(corpus_df, sparse_results, "sparse")

['https://chatnoir.eu/cache?uuid=822acef3-0d2e-58cb-a9a9-62c6a3e7b9a0&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=86a38a3b-58c0-530b-a660-56e2b4bffe2a&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=4ed41510-6cca-5c6b-a35c-d00d267a7c61&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=049caace-9ff8-5c8d-aa32-30b10e9a5889&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=51b9cf13-71b3-51c1-9847-fbddd4bc53cd&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=ce4a164e-d16b-5eb4-9316-2f0bab2b67b1&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=aacde075-d746-5462-8ca9-7a64b7135d3b&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=c4bf5091-a6db-5e5d-9ed6-43328bd6451c&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=d90b1021-15fc-5306-8804-37847ff52a08&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=d243c30b-1dd6-5f8a-bc42-501eea6a2545&index=cw12&raw&plain']

### Dense index

In [None]:
!pip install -q autofaiss
!pip install -q tqdm

First of all we need to compute an embedding vector for the content of each document, then we compute the knn index considering the inner product as score.

#### Compute the embeddings and create the index

You can compute the embeddings for each document considering a pre-trained version of ColBERT on MSMARCO, or you can load it from a jsonl file that we saved after the first computation.

In [15]:
# Compute the embedding
!python -m pyserini.encode input --corpus collections/corpus_index.jsonl --fields text --shard-id 0 --shard-num 1 output --embeddings embeddings/ encoder --encoder castorini/tct_colbert-v2-hnp-msmarco --fields text --batch 32 --fp16

Downloading: 100% 559/559 [00:00<00:00, 988kB/s]
Downloading: 100% 438M/438M [00:06<00:00, 70.0MB/s]
Downloading: 100% 334/334 [00:00<00:00, 473kB/s]
Downloading: 100% 232k/232k [00:00<00:00, 29.2MB/s]
Downloading: 100% 112/112 [00:00<00:00, 200kB/s]
868655it [00:05, 153128.27it/s]
100% 27146/27146 [56:53<00:00,  7.95it/s]


In [None]:
# Load the embeddings of the documents
!cp /content/drive/MyDrive/NLP_project/embeddings.jsonl.gz .
!gzip -d embeddings.jsonl.gz 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


After loading the file with the document embeddings, we have to convert the vectors to .npy files in order to create the faiss index with the autofaiss library. To avoid running out of RAM, we read the large embeddings file in chunks and save one numpy file per chunk; these files are then used to create the index.

In [18]:
import tqdm

def convert_to_npy(path, file_len, chunksize=75000):
    '''
        It takes as input a .jsonl file and it creates some .npy files taking
        only the 'vector' key, that is the embedding of the documents.

        path: path of the .jsonl file, one record with a 'vector' key per line.
        file_len: number of records in the file, used only for the progress bar.
        chunksize: number of records read, and saved in one .npy file, at a time.

        The files are written to 'embeddings/embeddings_<n>.npy', with n starting
        from 10; the directory is created if it doesn't exist.
    '''
    # Ceiling division: a trailing partial chunk is one more step of the progress
    # bar (the floor division reported one step less than the chunks processed)
    steps = -(-file_len // chunksize)
    # np.save does not create missing directories
    os.makedirs('embeddings', exist_ok=True)

    for i, chunk in enumerate(tqdm.tqdm(pd.read_json(path, lines=True, chunksize=chunksize), total=steps)):
        vectors = np.array(chunk['vector'].tolist())

        # Save different files to avoid RAM consumption.
        # NOTE(review): the +10 offset presumably keeps the numbers at two digits
        # so that listing the files by name preserves the chunk order - confirm.
        np.save(f'embeddings/embeddings_{i+10}.npy', vectors)
        del vectors

In [19]:
# Number of records in the embeddings file: one per corpus document (the corpus
# size printed after loading it). It only sizes the progress bar.
file_len = 868655
convert_to_npy('embeddings.jsonl', file_len)

12it [04:27, 22.31s/it]


Given the .npy files with the embeddings autofaiss automatically generates a dense index for us (~ 1 hour).

In [None]:
from autofaiss import build_index

# Build the dense index from the .npy files saved in the "embeddings" directory.
# The index is written to "indexes/knn.index" and its description to
# "indexes/dense_index_infos.json"; the last two arguments bound the size of the
# index and declare the memory available while building it.
build_index(embeddings="embeddings", index_path="indexes/knn.index",
            index_infos_path="indexes/dense_index_infos.json", max_index_memory_usage="6GB",
            current_memory_available="9GB")

Once the dense index was computed, we saved it on Drive so that we can load it and run our searches.

#### Test the dense index

To compute the document embeddings we used the '*castorini/tct_colbert-v2-hnp-msmarco*' pre-trained model, so we use the same model to encode the queries.

In [25]:
dense_index = DocumentsIndex('indexes/knn.index', 'dense')

Loading the dense index file ...
Loading the encoder castorini/tct_colbert-v2-hnp-msmarco ...

The process is finished correctly!



In [26]:
dense_query = "Coke or Pepsi?"
#dense_query = clean_documents(dense_query, nlp)
dense_results = dense_index.search(dense_query, k=10, verbose=True)

 1 163681 78.5340
 2 163014 78.4952
 3 163013 78.4652
 4 161391 78.4319
 5 163728 78.3653
 6 161807 78.3526
 7 163194 78.3127
 8 163377 78.2972
 9 160718 78.2814
10 164633 78.2748


In [41]:
retrieve_urls(corpus_df, dense_results, "dense")

array(['https://chatnoir.eu/cache?uuid=f6c6581d-d33a-5df3-b550-25e89c968fa1&index=cw12&raw&plain',
       'https://chatnoir.eu/cache?uuid=6a10412b-c6ad-5791-ad42-a7b4875ff934&index=cw12&raw&plain',
       'https://chatnoir.eu/cache?uuid=51b9cf13-71b3-51c1-9847-fbddd4bc53cd&index=cw12&raw&plain',
       'https://chatnoir.eu/cache?uuid=d90b1021-15fc-5306-8804-37847ff52a08&index=cw12&raw&plain',
       'https://chatnoir.eu/cache?uuid=86ddd859-1354-5688-baf2-3738893c3dba&index=cw12&raw&plain',
       'https://chatnoir.eu/cache?uuid=a5f4db90-c874-501a-a81c-953aae61520d&index=cw12&raw&plain',
       'https://chatnoir.eu/cache?uuid=cd76c6d7-1fda-5f84-9ce1-b416b1e14f18&index=cw12&raw&plain',
       'https://chatnoir.eu/cache?uuid=29f3b88a-00ff-568e-b6bf-c63061fc10bb&index=cw12&raw&plain',
       'https://chatnoir.eu/cache?uuid=6ee771fb-b1f9-5f9a-a1b5-bcbb3815190e&index=cw12&raw&plain',
       'https://chatnoir.eu/cache?uuid=d90b1021-15fc-5306-8804-37847ff52a08&index=cw12&raw&plain'],
      dty