In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta
import zipfile
import os

# Initialize Faker and pin every random source the script draws from
# (Faker, NumPy's global generator, the stdlib `random` module) to one seed.
SEED = 42
fake = Faker()
Faker.seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Parameters
# n_rows is the number of generated rows; the other values are the upper
# bounds of the 1-based id ranges sampled for sessions, users, contexts and
# authors.
n_rows = 100_000
n_sessions = 50_000
n_users = 10_000
n_contexts = 50_000
n_authors = 5_000

# Categorical options
# Value pools that the categorical columns are sampled from.
user_roles = [
    'Employee',
    'Manager',
    'Executive',
    'Admin',
    'Guest',
]
departments = [
    'HR',
    'IT',
    'Legal',
    'Engineering',
    'Sales',
    'Marketing',
]
document_types = [
    'FAQ',
    'Policy',
    'Report',
    'Email',
    'Wiki Page',
    'Meeting Notes',
]
languages = [
    'English',
    'Spanish',
    'French',
]

# Function to generate realistic text
def generate_text(min_words, max_words):
    """Return a sentence-like string built from random Faker words.

    The number of words is drawn with ``random.randint(min_words, max_words)``
    (both bounds inclusive). The words are joined with single spaces, the
    result is passed through ``str.capitalize`` and a period is appended.
    """
    word_count = random.randint(min_words, max_words)
    body = ' '.join(fake.words(nb=word_count))
    return f"{body.capitalize()}."

# Generate query/session data
# A session belongs to one user, and role/department/tenure are properties of
# the user. They are therefore drawn once per session / once per user and
# looked up by id; sampling them independently per row would give the same
# user_id contradictory profiles and spread one session over many users.
query_ids = np.arange(1, n_rows + 1)
session_ids = np.random.randint(1, n_sessions + 1, n_rows)

# Ids are 1-based, the lookup tables are 0-based, hence the "- 1".
_user_by_session = np.random.randint(1, n_users + 1, n_sessions)
user_ids = _user_by_session[session_ids - 1]

_role_by_user = np.random.choice(user_roles, n_users)
_department_by_user = np.random.choice(departments, n_users)
_tenure_by_user = np.random.randint(0, 21, n_users)  # 0..20 years inclusive

user_roles_col = _role_by_user[user_ids - 1]
user_departments = _department_by_user[user_ids - 1]
user_tenure_years = _tenure_by_user[user_ids - 1]

# Queries
def _random_question():
    """Return a Faker sentence with trailing periods stripped and '?' appended."""
    sentence = fake.sentence(nb_words=random.randint(4, 10))
    return sentence.rstrip('.') + '?'


def _random_timestamp():
    """Return a Faker datetime from the last five years as 'YYYY-MM-DD HH:MM:SS'."""
    moment = fake.date_time_between(start_date='-5y', end_date='now')
    return moment.strftime('%Y-%m-%d %H:%M:%S')


# All questions are generated before any timestamp, one of each per row.
queries = [_random_question() for _ in range(n_rows)]
# Query length is the whitespace-separated word count.
query_lengths = [len(words) for words in map(str.split, queries)]
query_timestamps = [_random_timestamp() for _ in range(n_rows)]

# Previous query in session
# For every row, store the text of the query the same session issued
# immediately before it in time ('' for the session's first query).
# Timestamps are drawn at random per row, so row order is not time order;
# walking the rows by index would let a "previous" query carry a later
# timestamp than the current one.
prev_queries = [''] * n_rows
session_to_queries = {}  # session id -> row index of that session's latest query so far

# Timestamps are 'YYYY-MM-DD HH:MM:SS' strings, so lexicographic order is
# chronological order. sorted() is stable, so ties keep their row order.
_rows_by_time = sorted(range(n_rows), key=query_timestamps.__getitem__)
for i in _rows_by_time:
    sid = session_ids[i]
    earlier_row = session_to_queries.get(sid)
    if earlier_row is not None:
        prev_queries[i] = queries[earlier_row]
    session_to_queries[sid] = i

# Context data
# Text, type, department, dates, author, version, tags and language describe
# the document itself, so they are generated once per distinct context (and
# the author role once per author) and then looked up by id. Drawing them
# independently per row would give a repeated context_id or author_id
# contradictory attributes.
context_ids = np.random.randint(1, n_contexts + 1, n_rows)
_context_rows = context_ids - 1  # ids are 1-based, lookup tables are 0-based

# --- one entry per context ---
_text_by_context = [generate_text(50, 200) for _ in range(n_contexts)]
_length_by_context = [len(text.split()) for text in _text_by_context]
_doc_type_by_context = np.random.choice(document_types, n_contexts)
_department_by_context = np.random.choice(departments, n_contexts)

_created_by_context = [
    fake.date_between(start_date='-15y', end_date='today')
    for _ in range(n_contexts)
]
# Read "today" only after the creation dates exist so none can be later than it.
_today = datetime.now().date()
# A document cannot have been updated in the future: cap the update date at today.
_updated_by_context = [
    min(created + timedelta(days=random.randint(0, 2000)), _today)
    for created in _created_by_context
]
_created_str_by_context = [d.strftime('%Y-%m-%d') for d in _created_by_context]
_updated_str_by_context = [d.strftime('%Y-%m-%d') for d in _updated_by_context]

_author_by_context = np.random.randint(1, n_authors + 1, n_contexts)
_role_by_author = np.random.choice(user_roles, n_authors)
_version_by_context = np.random.randint(1, 11, n_contexts)  # versions 1..10
_tags_by_context = [
    ','.join(fake.words(nb=random.randint(3, 5)))
    for _ in range(n_contexts)
]
_language_by_context = np.random.choice(languages, n_contexts, p=[0.9, 0.05, 0.05])

# --- per-row columns, looked up through the row's context id ---
context_texts = [_text_by_context[j] for j in _context_rows]
context_lengths = [_length_by_context[j] for j in _context_rows]
document_types_col = _doc_type_by_context[_context_rows]
document_departments = _department_by_context[_context_rows]
creation_dates = [_created_str_by_context[j] for j in _context_rows]
last_updated_dates = [_updated_str_by_context[j] for j in _context_rows]
author_ids = _author_by_context[_context_rows]
author_roles = _role_by_author[author_ids - 1]
version_numbers = _version_by_context[_context_rows]
tags_list = [_tags_by_context[j] for j in _context_rows]
languages_col = _language_by_context[_context_rows]

# Numeric features
def _rounded_uniform(low, high, decimals):
    """Draw n_rows samples with np.random.uniform(low, high) and round them."""
    return np.round(np.random.uniform(low, high, n_rows), decimals)


def _random_counts(largest):
    """Draw n_rows integers from 0 to `largest`, both ends included."""
    return np.random.randint(0, largest + 1, n_rows)


# The draws below stay in this exact order so the seeded NumPy stream yields
# the same values for every column.
cosine_similarity = _rounded_uniform(0, 1, 3)
tfidf_score = _rounded_uniform(0, 1, 3)
keyword_overlap_count = _random_counts(20)
bert_similarity = _rounded_uniform(0, 1, 3)
readability_score = _rounded_uniform(0, 100, 2)
view_count = _random_counts(1000)
edit_count = _random_counts(50)
click_count = _random_counts(100)
historical_relevance_avg = _rounded_uniform(0, 1, 3)

# 1 when the row has a previous query in its session, otherwise 0.
_has_previous = np.fromiter(
    (text != '' for text in prev_queries), dtype=bool, count=len(prev_queries)
)
is_multi_turn = _has_previous.astype(int)

noise_level = _rounded_uniform(0, 1, 3)

# 0 when user and document departments differ; otherwise a draw from [0.7, 1).
_same_department = (user_departments == document_departments).astype(int)
domain_specific_score = _same_department * _rounded_uniform(0.7, 1, 3)

# Relevance score
# Weighted linear blend of the numeric features; noise enters with a negative
# weight and the keyword overlap count is first scaled by 1/20. Terms are
# accumulated left to right in the listed order.
_relevance_terms = [
    (0.4, cosine_similarity),
    (0.3, tfidf_score),
    (0.1, keyword_overlap_count / 20),
    (0.1, bert_similarity),
    (-0.1, noise_level),
    (0.05, domain_specific_score),
    (0.05, historical_relevance_avg),
]
relevance_score = sum(weight * feature for weight, feature in _relevance_terms)
# Keep the label inside [0, 1].
relevance_score = relevance_score.clip(0, 1)

# Create DataFrame
# Keyword order is column order in the resulting frame (and in the CSV).
_relevance_rounded = np.round(relevance_score, 3)
df = pd.DataFrame(dict(
    # query / session / user
    query_id=query_ids,
    session_id=session_ids,
    user_id=user_ids,
    user_role=user_roles_col,
    user_department=user_departments,
    user_tenure_years=user_tenure_years,
    query_text=queries,
    query_length=query_lengths,
    query_timestamp=query_timestamps,
    previous_query_text=prev_queries,
    # context / document
    context_id=context_ids,
    context_text=context_texts,
    context_length=context_lengths,
    document_type=document_types_col,
    document_department=document_departments,
    creation_date=creation_dates,
    last_updated_date=last_updated_dates,
    author_id=author_ids,
    author_role=author_roles,
    version_number=version_numbers,
    tags=tags_list,
    language=languages_col,
    # numeric features
    cosine_similarity=cosine_similarity,
    tfidf_score=tfidf_score,
    keyword_overlap_count=keyword_overlap_count,
    bert_similarity=bert_similarity,
    readability_score=readability_score,
    view_count=view_count,
    edit_count=edit_count,
    click_count=click_count,
    historical_relevance_avg=historical_relevance_avg,
    is_multi_turn=is_multi_turn,
    noise_level=noise_level,
    domain_specific_score=domain_specific_score,
    # label
    relevance_score=_relevance_rounded,
))

# Save CSV and ZIP it
csv_name = "context_relevance_dataset.csv"
zip_name = "context_relevance_dataset.zip"

# Write the plain CSV (without the index column), then add that file to a
# freshly created deflate-compressed archive. The CSV itself stays on disk.
df.to_csv(csv_name, index=False)
with zipfile.ZipFile(zip_name, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write(csv_name)

print("Dataset saved to " + zip_name)

Dataset saved to context_relevance_dataset.zip
