In [None]:
from concurrent.futures import ThreadPoolExecutor
from pyserini.index import IndexReader
import pandas as pd
from pyserini.search import LuceneSearcher
from tqdm import tqdm

#!python -m pip install "dask[distributed]" --upgrade 

In [None]:

# Path to the index directory opened by both objects below.
index_path = 'indexes/pyserini/indexes/full_index/'

# Searcher over the index; in this cell it is only used to read the document
# count. (Spelled `searcher`, matching the name used by the later summary cell.)
searcher = LuceneSearcher(index_path)

# Reader used by get_doc_length() for docid conversion and document lookup.
index_reader = IndexReader(index_path)

# Total number of documents reported by the searcher.
total_docs = searcher.num_docs

# Define a function to get the document length
def get_doc_length(docid):
    """Return the number of whitespace-separated tokens in one document's raw text.

    Args:
        docid: Internal index docid; it is converted to the collection docid
            through the module-level ``index_reader`` before the lookup.

    Returns:
        int: ``len(raw_text.split())`` for the fetched document.
    """
    collection_docid = index_reader.convert_internal_docid_to_collection_docid(docid)
    document = index_reader.doc(str(collection_docid))
    return len(document.raw().split())

# Fan the per-document lookups out over a thread pool. executor.map() yields
# results in input order, so position i corresponds to internal docid i.
with ThreadPoolExecutor() as executor:
    length_stream = executor.map(get_doc_length, range(total_docs))
    progress = tqdm(
        length_stream,
        total=total_docs,
        desc="Calculating document statistics",
    )
    doc_lengths = list(progress)

# Put the word counts in a single-column frame and reduce it to summary statistics.
doc_lengths_df = pd.DataFrame(doc_lengths, columns=["length_words"])
summary_functions = ["min", "max", "median", "mean"]
doc_stats = doc_lengths_df.agg({"length_words": summary_functions})

In [None]:
import re
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Path to the TREC-format collection file that extract_docs() parses below
# (presumably the MS MARCO full-document dump, judging by the name — confirm).
msmarco_data = 'data/trec/fulldocs-new.trec'


def extract_docs(file):
    """Lazily parse a TREC-style collection file, yielding one dict per document.

    The parser is line oriented: a line starting with ``<DOCNO>`` sets the
    current document number, a line starting with ``<TEXT>`` opens a text
    section, and a line starting with ``</TEXT>`` closes it and emits the
    document. Lines inside the text section are stripped and joined with
    newlines; every other line is ignored.

    Args:
        file: Path of the collection file to read (decoded as UTF-8).

    Yields:
        dict: ``{'docno': str or None, 'url': str or None, 'content': str}``.
        ``url`` is the http(s) URL that opens the text section, if there is
        one, and ``content`` is the remaining text with that URL removed.
        ``docno`` is None when the document had no ``<DOCNO>`` line.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        docno = None
        content = None  # list of stripped lines while inside <TEXT>, else None
        for line in f:
            if line.startswith('<DOCNO>'):
                docno = line.replace('<DOCNO>', '').replace('</DOCNO>', '').strip()
            elif line.startswith('<TEXT>'):
                content = []
            elif line.startswith('</TEXT>'):
                text = '\n'.join(content)
                # Only a URL at the very start of the text is treated as the
                # document URL. Searching anywhere and then cutting len(url)
                # characters off the front would delete unrelated leading text
                # whenever the first URL appears later in the body.
                url_match = re.match(r'https?://\S+', text)
                if url_match:
                    url = url_match.group()
                    text = text[url_match.end():].strip()
                else:
                    url = None
                yield {'docno': docno, 'url': url, 'content': text}
                content = None
                # Reset so a later document without <DOCNO> is not attributed
                # to this one.
                docno = None
            elif content is not None:
                content.append(line.strip())

# Stream every parsed document once, recording its content length both in
# memory and in doc_lengths.csv (columns: doc_id, length).
doc_lengths = []

with open('doc_lengths.csv', 'w', newline='') as csvfile:
    fieldnames = ['doc_id', 'length']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for record in extract_docs(msmarco_data):
        # len() of the content string, i.e. a character count (not words).
        n_chars = len(record['content'])
        doc_lengths.append(n_chars)
        writer.writerow(dict(zip(fieldnames, (record['docno'], n_chars))))

# One length is appended per document, so the list size is the document count.
num_docs = len(doc_lengths)
doc_lengths = np.array(doc_lengths)


In [None]:

index_path = "indexes/pyserini/indexes/full_index"

searcher = LuceneSearcher(index_path)

# Document count as reported by the index.
total_docs = searcher.num_docs

# Reload the per-document lengths from doc_lengths.csv (written by the cell
# that parses the TREC file) so this cell runs on a fresh kernel instead of
# relying on a variable left over from an earlier session.
# NOTE(review): that CSV stores len(content), a character count, whereas the
# query lengths below count whitespace-separated tokens — confirm the summary
# is meant to mix the two units.
doc_lengths_array = pd.read_csv("doc_lengths.csv")["length"].values

# Assuming you have a file with the queries
path_to_queries = "data/proc_data/train_sample/sample_queries.tsv"

# Read the queries file. It is tab-separated; splitting on spaces would break
# every query into single words. Values are kept as plain strings so that a
# query such as "null" or "nan" is not turned into a missing value.
queries_raw = pd.read_csv(
    path_to_queries,
    sep="\t",
    header=None,
    dtype=str,
    keep_default_na=False,
)
# assumes the query text is the last tab-separated column (e.g. "qid<TAB>query"
# or a single-column file) — TODO confirm against the actual file layout
queries_df = pd.DataFrame({"query": queries_raw.iloc[:, -1]})
queries_df["query_length"] = queries_df["query"].apply(lambda x: len(x.split()))

total_queries = len(queries_df)

query_lengths = queries_df["query_length"].values

# Create a summary table
summary_table = pd.DataFrame({
    "Statistic": ["Total", "Min", "Max", "Median", "Average"],
    "Documents": [
        total_docs,
        np.min(doc_lengths_array),
        np.max(doc_lengths_array),
        np.median(doc_lengths_array),
        np.mean(doc_lengths_array),
    ],
    "Queries": [
        total_queries,
        np.min(query_lengths),
        np.max(query_lengths),
        np.median(query_lengths),
        np.mean(query_lengths),
    ],
})

print(summary_table)

# Plot document length distribution. The lengths are plotted directly:
# wrapping them in a DataFrame, transposing, and re-selecting a "doc_length"
# column yields a single NaN rather than the data.
docs = doc_lengths_array
plt.figure()
plt.hist(docs, bins=4)
plt.xlabel("Document Length")
plt.ylabel("Frequency")
plt.title("Document Length Distribution")
plt.show()

# Plot query length distribution
plt.figure()
plt.hist(query_lengths, bins=10)
plt.xlabel("Query Length")
plt.ylabel("Frequency")
plt.title("Query Length Distribution")
plt.show()

# Last expression of the cell: rendered as the cell's rich output.
summary_table
