## Preprocessing Script

In [None]:
import os
import re
import argparse
import datetime
import time


def preprocess_aol_query_log(input_dir):
    """Clean the AOL query-log ``.txt`` files found directly in ``input_dir``.

    For every input file the first line (header) is skipped and each remaining
    line is split on tabs. A line is dropped when it is empty, has fewer than
    three columns, has a non-numeric AnonID, repeats the AnonID/query pair of
    the previous valid line, or has a query made up only of special
    characters. Kept lines are written as ``AnonID<TAB>query`` (all other
    columns are discarded) to ``aol_processed/processed_files/<filename>``.

    Side outputs, all inside ``aol_processed`` (relative to the current
    working directory):
      * ``special_char_queries.txt`` - the removed special-character-only lines
      * ``malformed_id_queries.txt`` - lines with a malformed AnonID, each
        followed by the name of the file it came from
      * ``processing_stats.txt``     - the statistics that are also printed

    An exception raised while handling one file is reported and that file's
    counters are not added to the totals; processing continues with the next
    file.

    Args:
        input_dir: Directory containing the raw ``.txt`` log files.

    Returns:
        None.
    """
    start_time = time.time()
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Matches queries made only of non-word characters, underscores and
    # whitespace (this includes the empty query).
    special_chars_only_pattern = re.compile(r'^[_\W\s]*$')

    def percent(part, whole):
        # An empty input (no files / no lines) must not raise ZeroDivisionError.
        return part / whole * 100 if whole else 0.0

    total_processed_lines = 0
    total_duplicates_removed = 0
    total_special_chars_removed = 0
    total_malformed_ids_removed = 0
    total_malformed_lines_skipped = 0
    total_empty_lines_skipped = 0
    total_kept = 0
    total_files = 0

    # Sets to track userIDs before and after filtering
    total_userids = set()
    remaining_userids = set()

    # Create output directory and sub-directory for processed files
    output_dir = "aol_processed"
    processed_dir = os.path.join(output_dir, "processed_files")
    os.makedirs(processed_dir, exist_ok=True)

    # Create filepaths for output files
    special_char_output_file = os.path.join(
        output_dir, "special_char_queries.txt")
    malformed_id_output_file = os.path.join(
        output_dir, "malformed_id_queries.txt")
    stats_output_file = os.path.join(output_dir, "processing_stats.txt")

    # Open miscellaneous output files for writing
    with open(special_char_output_file, 'w', encoding='utf-8') as special_char_file, \
            open(malformed_id_output_file, 'w', encoding='utf-8') as malformed_id_file, \
            open(stats_output_file, 'w', encoding='utf-8') as stats_file:

        print(f"AOL Query Log Processing - Started at {timestamp}\n")
        stats_file.write(
            f"AOL Query Log Processing - Started at {timestamp}\n")

        # Sorted so that the processing order (and the stats file) is
        # reproducible; os.listdir() returns entries in arbitrary order.
        for filename in sorted(os.listdir(input_dir)):
            if not filename.endswith('.txt'):
                continue

            total_files += 1

            # Create pathnames for input file and output file
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(processed_dir, filename)

            print(f"Processing file {filename}")
            stats_file.write(f"Processing file {filename}\n")

            file_processed_lines = 0
            file_duplicates_removed = 0
            file_special_chars_removed = 0
            file_malformed_ids_removed = 0
            file_malformed_lines_skipped = 0
            file_empty_lines_skipped = 0
            file_kept = 0

            # Open input file and create output file
            try:
                with open(input_path, 'r', encoding='utf-8') as infile, \
                        open(output_path, 'w', encoding='utf-8') as outfile:

                    # Skip the header; the default keeps a completely empty
                    # file from raising StopIteration.
                    next(infile, None)

                    prev_anon_id = None
                    prev_query = None

                    for line in infile:
                        file_processed_lines += 1

                        line = line.strip()
                        if not line:
                            file_empty_lines_skipped += 1
                            continue  # Skip empty lines

                        parts = line.split('\t')
                        if len(parts) < 3:
                            file_malformed_lines_skipped += 1
                            continue  # Skip malformed lines

                        anon_id = parts[0].strip()
                        query = parts[1].strip()

                        if not anon_id.isdigit():
                            file_malformed_ids_removed += 1
                            malformed_id_file.write(
                                f"{line}\t{filename}\n")
                            # Skip malformed anonIDs
                            continue

                        total_userids.add(anon_id)

                        # Only consecutive repeats by the same user count as
                        # duplicates.
                        is_duplicate = (
                            anon_id == prev_anon_id and query == prev_query)

                        is_special_chars_only = bool(
                            special_chars_only_pattern.match(query))

                        if is_duplicate:
                            file_duplicates_removed += 1
                        elif is_special_chars_only:
                            file_special_chars_removed += 1
                            special_char_file.write(line + '\n')
                        else:
                            # Only keep a query if it is unique and does not consist solely of special characters.
                            # Since some rows have 3 columns of data and others 5, the ClickURL and ItemRank
                            # columns are dropped so that pandas can build a dataframe from the output
                            # (they are not used downstream anyway).
                            outfile.write(anon_id + "\t" + query + '\n')
                            remaining_userids.add(anon_id)
                            file_kept += 1

                        prev_anon_id = anon_id
                        prev_query = query

                    total_processed_lines += file_processed_lines
                    total_duplicates_removed += file_duplicates_removed
                    total_special_chars_removed += file_special_chars_removed
                    total_malformed_ids_removed += file_malformed_ids_removed
                    total_malformed_lines_skipped += file_malformed_lines_skipped
                    total_empty_lines_skipped += file_empty_lines_skipped
                    total_kept += file_kept

                    # "Remaining" is the number of lines actually written, so
                    # skipped empty/malformed lines are excluded as well.
                    file_stats = [
                        f"  - Processed: {file_processed_lines:,} queries",
                        f"  - Skipped {file_empty_lines_skipped:,} empty lines",
                        f"  - Skipped {file_malformed_lines_skipped:,} malformed lines",
                        f"  - Removed {file_malformed_ids_removed:,} queries with malformed IDs",
                        f"  - Removed {file_duplicates_removed:,} duplicate queries",
                        f"  - Removed {file_special_chars_removed:,} special-character-only queries",
                        f"  - Remaining: {file_kept:,} queries\n"
                    ]

                    # Print and write the file stats
                    for stat in file_stats:
                        print(stat)
                        stats_file.write(stat + "\n")

            except Exception as e:
                error_msg = f"Error processing {filename}: {str(e)}"
                print(error_msg)
                stats_file.write(error_msg + "\n")

        # Lines that are well-formed, have a valid ID and are not duplicates
        # (special-character-only queries still included).
        valid_non_duplicates = total_kept + total_special_chars_removed

        summary_stats = [
            "\n" + "="*50,
            "PROCESSING COMPLETE",
            "="*50,
            f"Processed {total_files} files with {total_processed_lines:,} total queries",
            f"Skipped {total_empty_lines_skipped:,} empty lines",
            f"Skipped {total_malformed_lines_skipped:,} malformed lines",
            f"Removed {total_malformed_ids_removed:,} queries with malformed IDs",
            f"Removed {total_duplicates_removed:,} duplicate queries ({percent(total_duplicates_removed, total_processed_lines):.2f}%)",
            f"Removed {total_special_chars_removed:,} special-char queries ({percent(total_special_chars_removed, total_processed_lines):.2f}%)",
            f"Remaining non-duplicate, valid ID queries: {valid_non_duplicates:,} ({percent(valid_non_duplicates, total_processed_lines):.2f}%)",
            f"Remaining non-duplicate, valid ID, non-special-char queries: {total_kept:,} ({percent(total_kept, total_processed_lines):.2f}%)",
            f"Removed userIDs: {len(total_userids) - len(remaining_userids)}",
            f"Remaining UserIDs after processing: {len(remaining_userids)}",
            "="*50,
        ]

        for stat in summary_stats:
            print(stat)
            stats_file.write(stat + "\n")

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"\nProcessing completed at {datetime.datetime.now()}\n")
        print(f"Elapsed time: {elapsed_time} seconds")
        stats_file.write(
            f"\nProcessing completed at {datetime.datetime.now()}\n")
        stats_file.write(f"Elapsed time: {elapsed_time} seconds")


if __name__ == "__main__":
    # Command-line entry point: the single positional argument is the folder
    # that holds the raw AOL query log files.
    parser = argparse.ArgumentParser(description="Preprocess AOL query log files")
    parser.add_argument(
        "input_dir",
        help="Path to input directory containing AOL query log files",
    )
    args = parser.parse_args()

    preprocess_aol_query_log(args.input_dir)


## Vocabulary Creation ##

In [None]:
import os
import json
import pandas as pd
from collections import Counter
import kenlm

# === Paths ===
# Every artefact of this step lives under the output folder written by the
# preprocessing script; BASE_PATH ends with '/' so the names are appended directly.
BASE_PATH = "C:/Users/enesi/Desktop/DSV/DVK-Uppsats/aol_processed/"
INPUT_DIR = f"{BASE_PATH}processed_files/"
TOKENIZED_FILE_PATH = f"{BASE_PATH}queries_tokenized.txt"
VOCAB_DICT_PATH = f"{BASE_PATH}vocab_dict.json"
VOCAB_STATS_PATH = f"{BASE_PATH}vocab_stats.json"


def tokenize_query(query):
    """Return the whitespace-separated tokens of ``query``.

    Every '.' becomes a token of its own, so "a.b" yields ["a", ".", "b"].
    """
    # Re-joining the dot-separated pieces with ' . ' pads each dot with
    # spaces; the whitespace split then isolates it as a separate token.
    dot_padded = ' . '.join(query.split('.'))
    return dot_padded.split()

def tokenize_and_create_vocab(input_dir, vocab_size=45000):
    """Tokenize all preprocessed query files and build a frequency vocabulary.

    Each ``.txt`` file in ``input_dir`` is read as two tab-separated columns
    (userID, query). Every query is tokenized with ``tokenize_query``; the
    tokens are written, space-joined and one query per line, to
    ``TOKENIZED_FILE_PATH`` and counted. The vocabulary consists of the
    special token ``<OOV>`` followed by the most frequent tokens, capped at
    ``vocab_size`` entries in total. The token-to-index mapping is saved to
    ``VOCAB_DICT_PATH`` and summary statistics to ``VOCAB_STATS_PATH``.

    Args:
        input_dir: Directory containing the preprocessed ``.txt`` files.
        vocab_size: Maximum vocabulary size, special tokens included.

    Returns:
        None.
    """
    # Local import: only needed for the quoting constant passed to read_csv.
    import csv

    word_counts = Counter()

    with open(TOKENIZED_FILE_PATH, "w", encoding="utf-8") as outfile:
        print("Starting tokenization and vocabulary creation...")

        # Sorted so that the tokenized file (and frequency tie-breaking) is
        # reproducible; os.listdir() returns entries in arbitrary order.
        for filename in sorted(os.listdir(input_dir)):
            if not filename.endswith(".txt"):
                continue

            file_path = os.path.join(input_dir, filename)
            print(f"Processing: {filename}")

            chunks = pd.read_csv(
                file_path,
                sep='\t',
                names=['userID', 'query'],
                # Keep every field as text: with the defaults, queries such as
                # "nan", "null" or "NA" become float NaN and crash the tokenizer.
                dtype=str,
                na_filter=False,
                # Queries are raw text; a '"' inside a query must not be
                # interpreted as a CSV quote character.
                quoting=csv.QUOTE_NONE,
                chunksize=100000,
            )
            for chunk in chunks:
                for query in chunk['query']:
                    if not isinstance(query, str):
                        continue  # defensive: ignore anything that is not text
                    tokens = tokenize_query(query)
                    if tokens:
                        word_counts.update(tokens)
                        outfile.write(" ".join(tokens) + "\n")

    total_tokens_counted = sum(word_counts.values())
    special_tokens = ['<OOV>']
    regular_slots = max(vocab_size - len(special_tokens), 0)
    # Fetch a few extra candidates so that a literal special token occurring in
    # the data can be dropped without shrinking the vocabulary; otherwise the
    # duplicate key would corrupt the index mapping below.
    candidates = word_counts.most_common(regular_slots + len(special_tokens))
    most_common_words = [
        word for word, _ in candidates if word not in special_tokens
    ][:regular_slots]

    vocabulary = special_tokens + most_common_words
    vocab_dict = {word: idx for idx, word in enumerate(vocabulary)}
    actual_vocab_size = len(vocab_dict)
    covered_tokens_count = sum(word_counts[word] for word in most_common_words)
    coverage_percentage = (covered_tokens_count / total_tokens_counted) * 100 if total_tokens_counted > 0 else 0

    vocab_stats = {
        "Requested_Vocabulary_Size": vocab_size,
        "Actual_Vocabulary_Size": actual_vocab_size,
        "Total_Tokens_Found": total_tokens_counted,
        "Total_Unique_Tokens_Found": len(word_counts),
        "Coverage_Percentage_Of_Top_Tokens": round(coverage_percentage, 2),
        "Special_Tokens": special_tokens
    }

    with open(VOCAB_DICT_PATH, 'w', encoding='utf-8') as f:
        json.dump(vocab_dict, f)

    with open(VOCAB_STATS_PATH, 'w', encoding='utf-8') as f:
        json.dump(vocab_stats, f)

    print("✅ Tokenization complete. Saved to:", TOKENIZED_FILE_PATH)
    print("✅ Vocabulary saved to:", VOCAB_DICT_PATH)
    print("Vocabulary Stats:")
    print(json.dumps(vocab_stats, indent=4))



def get_vocabulary(query_file, vocab_size=45000):
    """Count whitespace-separated words in ``query_file`` and keep the top ones.

    Args:
        query_file: Path to a UTF-8 text file with one query per line.
        vocab_size: Number of most frequent words to keep; ``None`` keeps all.

    Returns:
        A ``(vocab_dict, vocab_stats)`` tuple: ``vocab_dict`` maps each kept
        word to its count (most frequent first) and ``vocab_stats`` holds
        'total_words', 'unique_words' and 'vocab_size'.
    """
    with open(query_file, 'r', encoding='utf-8') as handle:
        # str.split() without arguments already ignores surrounding whitespace.
        frequencies = Counter(token for row in handle for token in row.split())

    kept = dict(frequencies.most_common(vocab_size))

    summary = {
        'total_words': sum(frequencies.values()),
        'unique_words': len(frequencies),
        'vocab_size': len(kept),
    }
    return kept, summary

# Run the tokenization / vocabulary step over the preprocessed query files.
vocab_size = 45000
tokenize_and_create_vocab(input_dir=INPUT_DIR, vocab_size=vocab_size)


## Evaluation of n-gram model

In [None]:
import kenlm
from create_vocab_query import get_query_vocabulary as get_vocabulary


def query_level_next_prediction(model_path, eval_file, top_k=5, num_examples=10):
    """Print next-query predictions of a KenLM model and their average MRR.

    The non-empty lines of ``eval_file`` are treated as a stream of queries.
    For each position the two preceding queries form the context; every
    candidate from the vocabulary is appended to that context and scored with
    the model, and the ``top_k`` best-scoring candidates are the predictions.
    The reciprocal rank of the true next query among those predictions (0 if
    it is absent) is recorded per example and averaged at the end.

    Args:
        model_path: Path of the KenLM model file to load.
        eval_file: UTF-8 text file with one query per line.
        top_k: Number of best-scoring candidates kept per example.
        num_examples: Maximum number of examples to evaluate. ``None`` or a
            non-positive value evaluates every available example.

    Returns:
        None. All results are printed.
    """
    model = kenlm.Model(model_path)

    # Load query stream
    with open(eval_file, 'r', encoding='utf-8') as f:
        queries = [line.strip() for line in f if line.strip()]

    # Each example needs two context queries plus the query to predict.
    available = max(len(queries) - 2, 0)
    if num_examples is None or num_examples <= 0:
        planned = available
    else:
        planned = min(int(num_examples), available)

    print(f"🔍 Running next-query prediction on {planned} examples...\n")

    if planned == 0:
        # Avoids dividing by zero when averaging an empty score list.
        print("Not enough queries to evaluate: at least 3 non-empty lines are required.")
        return

    vocab_dict = get_vocabulary(eval_file, vocab_size=None)
    # NOTE(review): the imported helper is not visible here; if it returns a
    # (vocab_dict, stats) tuple, use the dict part - confirm against
    # create_vocab_query.
    if isinstance(vocab_dict, tuple):
        vocab_dict = vocab_dict[0]
    vocabulary = list(vocab_dict.keys())

    mrr_scores = []

    for i in range(planned):
        context = f"{queries[i]} {queries[i+1]}"  # Trigram context: 2 previous queries
        true_query = queries[i+2]

        # Score all candidate queries
        scores = {
            q: model.score(f"{context} {q}", bos=False, eos=False)
            for q in vocabulary
        }

        sorted_preds = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        top_preds = [q for q, _ in sorted_preds[:top_k]]

        # Reciprocal rank of the true query within the top-k predictions
        if true_query in top_preds:
            mrr = 1.0 / (top_preds.index(true_query) + 1)
        else:
            mrr = 0.0
        mrr_scores.append(mrr)

        print(f"🔹 Example {i+1}")
        print(f"Context     : [{queries[i]}] → [{queries[i+1]}]")
        print(f"True query  : {true_query}")
        print(f"Top-{top_k} : {top_preds}")
        print(f"MRR         : {mrr:.4f}")
        print("-" * 40)

    avg_mrr = sum(mrr_scores) / len(mrr_scores)
    print(f"\n📊 Average MRR over {len(mrr_scores)} examples: {avg_mrr:.4f}")

if __name__ == "__main__":
    # Hard-coded locations of the 3-gram query model and the evaluation stream.
    eval_file = "C:/Users/enesi/Desktop/DSV/DVK-Uppsats/data/ngram_eval.txt"
    model_path = "C:/Users/enesi/Desktop/DSV/DVK-Uppsats/3gram_query.binary"
    query_level_next_prediction(model_path=model_path, eval_file=eval_file)


## Resource Logger and Measuring Performance

In [None]:
import time
import numpy as np
import matplotlib.pyplot as plt
import psutil
import kenlm
from pynvml import *
from ngram_mrr_evaluation import get_rank_of_true_word
from ngram_mrr_evaluation import load_validation_data
from create_vocabulary import get_vocabulary

# Track resource usage at each step
def track_resources(cpu_interval=1):
    """Take one snapshot of CPU, memory, disk and (if available) GPU usage.

    Args:
        cpu_interval: Passed to ``psutil.cpu_percent`` as ``interval``. With
            the default of 1 the call blocks for one second while measuring;
            pass a smaller value to shorten the wait (see the psutil
            documentation for the meaning of ``None``/0).

    Returns:
        A dict with the keys 'cpu', 'memory_used', 'memory_percent',
        'disk_used', 'disk_percent' and 'gpu'. 'gpu' is a dict with
        'gpu_utilization', 'gpu_memory_used' and 'gpu_memory_percent' for
        device 0, or ``None`` when the NVML queries fail.
    """
    cpu_usage = psutil.cpu_percent(interval=cpu_interval)
    memory_info = psutil.virtual_memory()
    disk_info = psutil.disk_usage('/')

    gpu_info = None
    try:
        nvmlInit()
        try:
            handle = nvmlDeviceGetHandleByIndex(0)  # GPU 0
            gpu_util = nvmlDeviceGetUtilizationRates(handle)
            gpu_memory = nvmlDeviceGetMemoryInfo(handle)
            gpu_info = {
                'gpu_utilization': gpu_util.gpu,
                'gpu_memory_used': gpu_memory.used,
                'gpu_memory_percent': gpu_memory.used / gpu_memory.total * 100
            }
        finally:
            # Always release NVML once it was initialised, even if a query failed.
            nvmlShutdown()
    except Exception:
        # GPU statistics are best-effort. Unlike a bare ``except``, this does
        # not swallow KeyboardInterrupt / SystemExit.
        gpu_info = None

    return {
        'cpu': cpu_usage,
        'memory_used': memory_info.used,
        'memory_percent': memory_info.percent,
        'disk_used': disk_info.used,
        'disk_percent': disk_info.percent,
        'gpu': gpu_info
    }

# Function to plot the resource usage as a table
def plot_resources_as_table(cpu_data, memory_data, disk_data, gpu_data):
    """Show the recorded resource usage as a colour-coded matplotlib table.

    One row per resource (CPU, Memory, Disk and, when present, GPU) and one
    column per recorded step, preceded by a column holding the resource name.

    Args:
        cpu_data: Sequence of CPU usage percentages.
        memory_data: Dict whose 'memory_percent' entry is a sequence of values.
        disk_data: Dict whose 'disk_percent' entry is a sequence of values.
        gpu_data: Sequence of GPU samples; each sample is either a number or a
            dict with a 'gpu_utilization' entry. An empty sequence omits the
            GPU row.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    def gpu_value(sample):
        # Accept both plain numbers and per-sample dicts carrying a
        # 'gpu_utilization' entry.
        if isinstance(sample, dict):
            return sample['gpu_utilization']
        return sample

    _fig, ax = plt.subplots(figsize=(8, 4))
    ax.axis('off')

    # (row label, values, row colour)
    rows = [
        ('CPU', cpu_data, 'lightblue'),
        ('Memory', memory_data['memory_percent'], 'lightgreen'),
        ('Disk', disk_data['disk_percent'], 'lightcoral'),
    ]
    if gpu_data:
        rows.append(('GPU', [gpu_value(s) for s in gpu_data], 'lightsalmon'))

    # All rows must have the same number of cells; pad shorter series.
    n_steps = max(len(values) for _, values, _ in rows)
    table_data = [
        [label] + [f'{x:.2f}' for x in values] + [''] * (n_steps - len(values))
        for label, values, _ in rows
    ]

    # The first column holds the resource name, so it needs a header as well;
    # without it the header row is one entry shorter than the data rows.
    col_labels = ['Resource'] + [f'Time {i+1}' for i in range(n_steps)]

    table = ax.table(cellText=table_data, colLabels=col_labels, loc='center', cellLoc='center')

    # Cells are keyed by (row, column); row 0 is the header, so data row r of
    # `rows` is table row r + 1. Colour every cell of a row, not just one cell.
    for (row_idx, _col_idx), cell in table.get_celld().items():
        if row_idx >= 1:
            cell.set_facecolor(rows[row_idx - 1][2])

    plt.tight_layout()
    plt.show()

def evaluate_kenlm_model(models, query_file, n=5, sample_size=1000, resource_sample_every=1):
    """Evaluate several n-gram models and plot their scores and resource usage.

    For every model each validation example is ranked with
    ``get_rank_of_true_word``; the reciprocal rank and a top-k hit flag are
    collected and averaged per model. System resources are sampled while
    evaluating and shown as a table after the metric plot.

    Args:
        models: Dict mapping the n-gram order to a loaded model.
        query_file: Path of the tokenized query file; used both for the
            validation data and for the vocabulary.
        n: Forwarded to ``load_validation_data``.
        sample_size: Forwarded to ``load_validation_data``.
        resource_sample_every: Take a resource snapshot every this many
            examples. The default of 1 samples on every example; larger values
            reduce the time spent in ``track_resources``.

    Returns:
        None. Results are printed and plotted.
    """
    validation_data = load_validation_data(query_file, n=n, sample_size=sample_size)

    vocab_dict, _vocab_stats = get_vocabulary(query_file=query_file, vocab_size=45000)
    vocabulary = list(vocab_dict.keys())

    total_examples = len(validation_data)
    print(f"Evaluating {total_examples} examples...")

    mrr_scores = {n_gram: [] for n_gram in models.keys()}
    accuracy_scores = {n_gram: [] for n_gram in models.keys()}

    start_time = time.time()

    cpu_data = []
    memory_data = {'memory_percent': []}
    disk_data = {'disk_percent': []}
    gpu_data = []

    # Guard against a zero/negative stride (would break the modulo below).
    sample_step = max(1, int(resource_sample_every))

    # Evaluate for each N-gram model (2-gram, 3-gram, 4-gram, 5-gram)
    for n_gram, model in models.items():
        print(f"Evaluating {n_gram}-gram model...")

        for i, (input_text, true_word) in enumerate(validation_data):
            # Track resources during evaluation
            if i % sample_step == 0:
                resources = track_resources()
                cpu_data.append(resources['cpu'])
                memory_data['memory_percent'].append(resources['memory_percent'])
                disk_data['disk_percent'].append(resources['disk_percent'])
                if resources['gpu'] is not None:
                    # Store the numeric utilisation: the table formats every
                    # entry with ':.2f', which fails on the whole GPU dict.
                    gpu_data.append(resources['gpu']['gpu_utilization'])

            # NOTE(review): presumably returns (rank or None, top-k candidates) -
            # confirm against ngram_mrr_evaluation.
            rank, top_k = get_rank_of_true_word(model, input_text, true_word, vocabulary)

            if rank is None:
                mrr_scores[n_gram].append(0.0)
                accuracy_scores[n_gram].append(0)
            else:
                mrr_scores[n_gram].append(1.0 / rank)
                accuracy_scores[n_gram].append(1 if true_word in top_k else 0)

            if (i + 1) % 100 == 0:
                elapsed = time.time() - start_time
                print(f"... Evaluated {i+1}/{total_examples} examples in {elapsed:.2f}s")

    # Calculate average MRR and accuracy for each model
    mrr_averages = {n_gram: np.mean(mrr_scores[n_gram]) for n_gram in mrr_scores}
    accuracy_averages = {n_gram: np.mean(accuracy_scores[n_gram]) for n_gram in accuracy_scores}

    print("Mean Reciprocal Rank (MRR):")
    for n_gram in mrr_averages:
        print(f"{n_gram}-gram: {mrr_averages[n_gram]:.4f}")

    print("\nAccuracy:")
    for n_gram in accuracy_averages:
        print(f"{n_gram}-gram: {accuracy_averages[n_gram]:.4f}")

    plot_metrics(mrr_averages, accuracy_averages)
    plot_resources_as_table(cpu_data, memory_data, disk_data, gpu_data)

# Function to plot MRR and accuracy
def plot_metrics(mrr_averages, accuracy_averages):
    """Draw MRR and accuracy as paired bars, one pair per n-gram model.

    Both arguments are dicts keyed by n-gram order; the keys of
    ``mrr_averages`` provide the x-axis labels.
    """
    orders = list(mrr_averages)
    width = 0.35
    positions = np.arange(len(orders))

    _, axes = plt.subplots(figsize=(8, 5))

    # MRR bars sit on the tick position, accuracy bars directly to the right.
    axes.bar(positions, list(mrr_averages.values()), width,
             label='MRR', color='lightblue')
    axes.bar(positions + width, list(accuracy_averages.values()), width,
             label='Accuracy', color='lightgreen')

    axes.set_xlabel('N-gram Model')
    axes.set_ylabel('Score')
    axes.set_title('Evaluation of N-gram Models (MRR and Accuracy)')

    # Centre each tick between the two bars of its pair.
    axes.set_xticks(positions + width / 2)
    axes.set_xticklabels([f'{n}-gram' for n in orders])
    axes.legend()

    plt.tight_layout()
    plt.show()

# Load one KenLM model per n-gram order (2 to 5), keyed by that order.
models = {
    order: kenlm.Model(arpa_path)
    for order, arpa_path in (
        (2, "queries_tokenized_2gram.arpa"),
        (3, "queries_tokenized_3gram.arpa"),
        (4, "queries_tokenized_4gram.arpa"),
        (5, "queries_tokenized_5gram.arpa"),
    )
}

# Evaluate all models on the tokenized query file.
query_file = "C:/Users/enesi/Desktop/DSV/DVK-Uppsats/aol_processed/queries_tokenized.txt"
evaluate_kenlm_model(models, query_file)
