In [3]:
!pip install https://github.com/kpu/kenlm/archive/master.zip

Collecting https://github.com/kpu/kenlm/archive/master.zip
  Using cached https://github.com/kpu/kenlm/archive/master.zip
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
!apt-get install -y build-essential cmake libboost-all-dev zlib1g-dev libbz2-dev liblzma-dev
!git clone https://github.com/kpu/kenlm.git
%cd kenlm
!mkdir build && cd build && cmake .. && make -j4

In [4]:
# Mount Google Drive at /content/drive; the data paths configured in this
# notebook all point below that mount point.
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
# @title === Imports & Configuration ===

import os
import re
import numpy as np
import pandas as pd
import argparse
import csv
from datetime import datetime
import time
import random
from collections import Counter
import kenlm
import psutil
from google.colab import drive
import json
import matplotlib.pyplot as plt
import threading
import subprocess




# === Colab Google Drive Paths ===
# Every location is derived from BASE_PATH so the notebook only needs the Drive mount.
BASE_PATH = "/content/drive/MyDrive/dvk-uppsats/"
INPUT_DIR = BASE_PATH + "aol_processed/processed_files/"
TRAIN_FILE_PATH = BASE_PATH + "data/ngram_train.txt"
EVAL_FILE_PATH = BASE_PATH + "data/ngram_eval.txt"
# KenLM binary to evaluate; change this line when evaluating a different model.
# NOTE(review): this previously pointed at a Windows-local path, which cannot exist on a
# Colab runtime. The location below assumes the model sits in the project root on Drive,
# mirroring the local layout - confirm the actual upload location.
MODEL_PATH = BASE_PATH + "3gram_query.binary"
VOCAB_DICT_PATH = BASE_PATH + "data/query_vocab_dict.json"
VOCAB_STATS_PATH = BASE_PATH + "data/query_vocab_stats.json"


In [None]:
# @title === Preprocessing Script ===
def preprocess_aol_query_log(input_dir, output_dir="aol_processed"):
    """Clean every ``.txt`` query-log file found in ``input_dir``.

    Each input file is expected to be tab-separated with a header line and at
    least three columns per row; the first column is used as the anonymous
    user ID and the second as the query. For every row the function

    * skips empty lines and rows with fewer than three columns,
    * drops rows whose ID is not purely numeric (logged to
      ``malformed_id_queries.txt``),
    * drops a row when it repeats the ID and query of the previous row that
      passed the ID check,
    * drops queries consisting only of special characters (logged to
      ``special_char_queries.txt``),
    * writes the remaining rows as ``<id>\\t<query>`` to
      ``<output_dir>/processed_files/<filename>``.

    Per-file and overall statistics are printed and written to
    ``<output_dir>/processing_stats.txt``.

    Args:
        input_dir: Directory containing the raw ``.txt`` log files.
        output_dir: Directory that receives the processed files and the
            report files. Defaults to ``"aol_processed"`` relative to the
            current working directory.

    Returns:
        None. All results are written to disk.
    """
    start_time = time.time()
    # `datetime` is the class (imported via `from datetime import datetime`),
    # so `now()` is called on it directly.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Matches queries made up solely of underscores, non-word characters and
    # whitespace (this includes the empty string).
    special_chars_only_pattern = re.compile(r'^[_\W\s]*$')

    def pct(part, whole):
        """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
        return part / whole * 100 if whole else 0.0

    total_processed_lines = 0
    total_duplicates_removed = 0
    total_special_chars_removed = 0
    total_malformed_ids_removed = 0
    total_malformed_lines_skipped = 0
    total_empty_lines_skipped = 0
    total_kept = 0
    total_files = 0

    # Sets to track userIDs
    total_userids = set()
    remaining_userids = set()

    # Create output directory and sub-directory for processed files
    processed_dir = os.path.join(output_dir, "processed_files")
    os.makedirs(processed_dir, exist_ok=True)

    # Create filepaths for output files
    special_char_output_file = os.path.join(
        output_dir, "special_char_queries.txt")
    malformed_id_output_file = os.path.join(
        output_dir, "malformed_id_queries.txt")
    stats_output_file = os.path.join(output_dir, "processing_stats.txt")

    # Open miscellaneous output files for writing
    with open(special_char_output_file, 'w', encoding='utf-8') as special_char_file, \
            open(malformed_id_output_file, 'w', encoding='utf-8') as malformed_id_file, \
            open(stats_output_file, 'w', encoding='utf-8') as stats_file:

        print(f"AOL Query Log Processing - Started at {timestamp}\n")
        stats_file.write(
            f"AOL Query Log Processing - Started at {timestamp}\n")

        # Sorted so that the processing order (and the report) is deterministic.
        for filename in sorted(os.listdir(input_dir)):
            if not filename.endswith('.txt'):
                continue

            total_files += 1

            # Create pathnames for input file and output file
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(processed_dir, filename)

            print(f"Processing file {filename}")
            stats_file.write(f"Processing file {filename}\n")

            file_processed_lines = 0
            file_duplicates_removed = 0
            file_special_chars_removed = 0
            file_malformed_ids_removed = 0
            file_malformed_lines_skipped = 0
            file_empty_lines_skipped = 0
            file_kept = 0

            # Open input file and create output file
            try:
                with open(input_path, 'r', encoding='utf-8') as infile, \
                        open(output_path, 'w', encoding='utf-8') as outfile:

                    # Skip the header; the default keeps an empty file from
                    # raising StopIteration.
                    next(infile, None)

                    prev_anon_id = None
                    prev_query = None

                    for line in infile:
                        file_processed_lines += 1

                        line = line.strip()
                        if not line:
                            file_empty_lines_skipped += 1
                            continue  # Skip empty lines

                        parts = line.split('\t')
                        if len(parts) < 3:
                            file_malformed_lines_skipped += 1
                            continue  # Skip malformed lines

                        anon_id = parts[0].strip()
                        query = parts[1].strip()

                        if not anon_id.isdigit():
                            file_malformed_ids_removed += 1
                            malformed_id_file.write(
                                f"{line}\t{filename}\n")
                            # Skip malformed anonIDs
                            continue

                        total_userids.add(anon_id)

                        is_duplicate = (
                            anon_id == prev_anon_id and query == prev_query)

                        is_special_chars_only = bool(
                            special_chars_only_pattern.match(query))

                        if is_duplicate:
                            file_duplicates_removed += 1
                        elif is_special_chars_only:
                            file_special_chars_removed += 1
                            special_char_file.write(line + '\n')
                        else:
                            # Only keep a query if it is unique and not made up
                            # solely of special characters. Rows carry either 3
                            # or 5 columns, so only the ID and the query are
                            # written to give every output row the same shape.
                            outfile.write(anon_id + "\t" + query + '\n')
                            remaining_userids.add(anon_id)
                            file_kept += 1

                        prev_anon_id = anon_id
                        prev_query = query

                    total_processed_lines += file_processed_lines
                    total_duplicates_removed += file_duplicates_removed
                    total_special_chars_removed += file_special_chars_removed
                    total_malformed_ids_removed += file_malformed_ids_removed
                    total_malformed_lines_skipped += file_malformed_lines_skipped
                    total_empty_lines_skipped += file_empty_lines_skipped
                    total_kept += file_kept

                    # "Remaining" is the number of rows actually written, so
                    # skipped empty/malformed lines are not counted as kept.
                    file_stats = [
                        f"  - Processed: {file_processed_lines:,} queries",
                        f"  - Skipped {file_empty_lines_skipped:,} empty lines",
                        f"  - Skipped {file_malformed_lines_skipped:,} malformed lines",
                        f"  - Removed {file_malformed_ids_removed:,} queries with malformed IDs",
                        f"  - Removed {file_duplicates_removed:,} duplicate queries",
                        f"  - Removed {file_special_chars_removed:,} special-character-only queries",
                        f"  - Remaining: {file_kept:,} queries\n"
                    ]

                    # Print and write the file stats
                    for stat in file_stats:
                        print(stat)
                        stats_file.write(stat + "\n")

            except Exception as e:
                # A failing file is reported and skipped; its counters are not
                # added to the totals.
                error_msg = f"Error processing {filename}: {str(e)}"
                print(error_msg)
                stats_file.write(error_msg + "\n")

        remaining = total_kept
        # Rows that passed the ID and duplicate checks, before the
        # special-character filter.
        non_duplicate_valid = total_kept + total_special_chars_removed

        summary_stats = [
            "\n" + "="*50,
            "PROCESSING COMPLETE",
            "="*50,
            f"Processed {total_files} files with {total_processed_lines:,} total queries",
            f"Skipped {total_empty_lines_skipped:,} empty lines",
            f"Skipped {total_malformed_lines_skipped:,} malformed lines",
            f"Removed {total_malformed_ids_removed:,} queries with malformed IDs",
            f"Removed {total_duplicates_removed:,} duplicate queries ({pct(total_duplicates_removed, total_processed_lines):.2f}%)",
            f"Removed {total_special_chars_removed:,} special-char queries ({pct(total_special_chars_removed, total_processed_lines):.2f}%)",
            f"Remaining non-duplicate, valid ID queries: {non_duplicate_valid:,} ({pct(non_duplicate_valid, total_processed_lines):.2f}%)",
            f"Remaining non-duplicate, valid ID, non-special-char queries: {remaining:,} ({pct(remaining, total_processed_lines):.2f}%)",
            f"Removed userIDs: {len(total_userids) - len(remaining_userids)}",
            f"Remaining UserIDs after processing: {len(remaining_userids)}",
            "="*50,
        ]

        for stat in summary_stats:
            print(stat)
            stats_file.write(stat + "\n")

        end_time = time.time()
        elapsed_time = end_time - start_time
        finished_at = datetime.now()
        print(f"\nProcessing completed at {finished_at}\n")
        print(f"Elapsed time: {elapsed_time} seconds")
        stats_file.write(
            f"\nProcessing completed at {finished_at}\n")
        stats_file.write(f"Elapsed time: {elapsed_time} seconds")


if __name__ == "__main__":
    # Command-line entry point: the single positional argument names the
    # directory holding the raw query-log files.
    # NOTE(review): inside a notebook kernel `parse_args()` reads the kernel's
    # own argv rather than user input - confirm this cell is only meant to be
    # run as a script.
    cli = argparse.ArgumentParser(description="Preprocess AOL query log files")
    cli.add_argument(
        "input_dir",
        help="Path to input directory containing AOL query log files",
    )

    preprocess_aol_query_log(cli.parse_args().input_dir)


## Vocabulary Creation ##

In [None]:

def clean_query(query):
    """Normalise a query: trim surrounding whitespace and lower-case it.

    Returns None for anything that is not a string.
    """
    return query.strip().lower() if isinstance(query, str) else None

def create_query_vocab(input_dir, vocab_size=None):
    """Build a query-level vocabulary and a train/eval split from processed logs.

    Reads every ``.txt`` file in ``input_dir`` (tab-separated, query in the
    second column), normalises each query with ``clean_query`` and counts it.
    The first 80% of the collected queries (in read order) are written to
    ``TRAIN_FILE_PATH`` and the rest to ``EVAL_FILE_PATH``. The vocabulary maps
    each query to an index and is saved, together with summary statistics, to
    ``VOCAB_DICT_PATH`` and ``VOCAB_STATS_PATH``.

    Args:
        input_dir: Directory containing the processed log files.
        vocab_size: Keep only this many most frequent queries; a falsy value
            keeps every query.

    Returns:
        Tuple ``(vocab_dict, vocab_stats)``.
    """
    query_counts = Counter()
    all_queries = []

    print("🔁 Starting query-level vocabulary creation...")

    log_files = (name for name in os.listdir(input_dir) if name.endswith(".txt"))
    for filename in log_files:
        print(f"📄 Processing: {filename}")

        with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    fields = line.strip().split('\t')
                    # Rows without a query column are ignored.
                    query = clean_query(fields[1]) if len(fields) >= 2 else None
                    if query:
                        query_counts[query] += 1
                        all_queries.append(query)
                except Exception as e:
                    print(f"⚠️ Error processing line: {line} - {e}")
                    continue

    # The split is positional: the seed is set but the queries are not shuffled.
    random.seed(42)
    split_index = int(0.8 * len(all_queries))
    splits = (
        (TRAIN_FILE_PATH, all_queries[:split_index]),
        (EVAL_FILE_PATH, all_queries[split_index:]),
    )
    for split_path, split_queries in splits:
        with open(split_path, "w", encoding="utf-8") as split_file:
            split_file.write("\n".join(split_queries))

    print(f"✅ Train set saved to: {TRAIN_FILE_PATH}")
    print(f"✅ Eval set saved to: {EVAL_FILE_PATH}")

    # === Build Vocabulary ===
    if vocab_size:
        ranked_queries = query_counts.most_common(vocab_size)
    else:
        ranked_queries = query_counts.items()
    vocab_dict = {}
    for position, (query, _) in enumerate(ranked_queries):
        vocab_dict[query] = position

    total_queries = sum(query_counts.values())
    covered = sum(query_counts[q] for q in vocab_dict)
    if total_queries > 0:
        coverage = (covered / total_queries) * 100
    else:
        coverage = 0

    vocab_stats = {
        "Requested_Vocabulary_Size": vocab_size if vocab_size else "Full",
        "Actual_Vocabulary_Size": len(vocab_dict),
        "Total_Queries_Found": total_queries,
        "Total_Unique_Queries_Found": len(query_counts),
        "Coverage_Percentage_Of_Top_Queries": round(coverage, 2),
    }

    for json_path, payload in ((VOCAB_DICT_PATH, vocab_dict), (VOCAB_STATS_PATH, vocab_stats)):
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f)

    print("✅ Query-level vocabulary saved to:", VOCAB_DICT_PATH)
    print("📊 Vocabulary Stats:")
    print(json.dumps(vocab_stats, indent=4))

    return vocab_dict, vocab_stats

def get_query_vocabulary(query_file, vocab_size=None):
    """Map every distinct query in ``query_file`` to an integer index.

    Each non-blank line is stripped, lower-cased and treated as one token.
    With a truthy ``vocab_size`` only that many most frequent queries are kept,
    indexed by descending frequency; otherwise all queries are indexed in the
    order they first appear.
    """
    with open(query_file, "r", encoding="utf-8") as f:
        normalized = (raw.strip().lower() for raw in f)
        counts = Counter(query for query in normalized if query)

    if vocab_size:
        ranked = counts.most_common(vocab_size)
    else:
        ranked = counts.items()

    vocabulary = {}
    for position, (query, _) in enumerate(ranked):
        vocabulary[query] = position
    return vocabulary


if __name__ == "__main__":
    # Build the vocabulary and the train/eval split from the processed log files in INPUT_DIR.
    create_query_vocab(INPUT_DIR)

## Evaluation of n-gram model

In [None]:
# @title === N-Gram Model MRR & Accuracy Evaluation ===

def query_level_next_prediction(model_path, eval_file, top_k=5, num_examples=10):
    """Evaluate next-query prediction of a KenLM model with MRR@top_k.

    For every position in the evaluation stream the two preceding queries form
    the context; every distinct query of the evaluation file is scored as a
    continuation, and the reciprocal rank of the true next query within the
    ``top_k`` best candidates is recorded (0 when it is not among them).

    Args:
        model_path: Path of the KenLM model file to load.
        eval_file: Text file with one query per line.
        top_k: Number of best-scoring candidates considered per example.
        num_examples: Stop after this many examples. A value that is never
            reached (e.g. 0) evaluates the whole file.

    Returns:
        The average MRR as a float, or None when the file holds fewer than
        three queries and nothing could be evaluated.
    """
    model = kenlm.Model(model_path)

    # Load query stream. Queries are lower-cased because the candidate
    # vocabulary from get_query_vocabulary is lower-cased too; otherwise a
    # mixed-case true query could never match a candidate.
    with open(eval_file, 'r', encoding='utf-8') as f:
        queries = [line.strip().lower() for line in f if line.strip()]

    print(f"🔍 Running next-query prediction on {num_examples} examples...\n")

    vocabulary = list(get_query_vocabulary(eval_file, vocab_size=None))

    mrr_scores = []

    for i in range(len(queries) - 2):
        previous_query, current_query = queries[i], queries[i+1]
        context = f"{previous_query} {current_query}"  # Context: 2 previous queries
        true_query = queries[i+2]

        # Score all candidate queries.
        # NOTE(review): the scored string is context and candidate joined by
        # spaces; confirm this matches how the model was trained on multi-word
        # queries.
        scores = {
            q: model.score(f"{context} {q}", bos=False, eos=False)
            for q in vocabulary
        }

        sorted_preds = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        top_preds = [q for q, _ in sorted_preds[:top_k]]

        # Reciprocal rank of the true query within the top-k list.
        if true_query in top_preds:
            mrr = 1.0 / (top_preds.index(true_query) + 1)
        else:
            mrr = 0.0
        mrr_scores.append(mrr)

        print(f"🔹 Example {i+1}")
        print(f"Context     : [{previous_query}] → [{current_query}]")
        print(f"True query  : {true_query}")
        print(f"Top-{top_k} : {top_preds}")
        print(f"MRR         : {mrr:.4f}")
        print("-" * 40)

        if i + 1 == num_examples:
            break

    # Fewer than three queries yield no example; avoid dividing by zero.
    if not mrr_scores:
        print("\n📊 No examples evaluated: the evaluation file needs at least three queries.")
        return None

    avg_mrr = sum(mrr_scores) / len(mrr_scores)
    print(f"\n📊 Average MRR over {len(mrr_scores)} examples: {avg_mrr:.4f}")
    return avg_mrr

if __name__ == "__main__":
    # Evaluate the configured model on the evaluation split. The configuration
    # constants are used because no lower-case `model_path` / `eval_file`
    # variables are defined in this notebook.
    query_level_next_prediction(MODEL_PATH, EVAL_FILE_PATH)


## Resource Logger and Measuring Performance

In [None]:

class ResourceLogger:
    """Sample CPU, memory, disk and GPU usage on a background thread.

    Samples are collected in memory roughly every ``interval`` seconds between
    ``start()`` and ``stop()``; ``stop()`` writes them as a JSON list to
    ``output_path``.

    Args:
        output_path: Destination JSON file. Its directory is created if
            needed and the file is initialised with an empty list.
        interval: Target number of seconds between two samples.
    """

    def __init__(self, output_path, interval=5):
        self.output_path = os.path.abspath(output_path)
        self.output_dir = os.path.dirname(self.output_path)
        os.makedirs(self.output_dir, exist_ok=True)

        self.interval = interval
        self.stop_event = threading.Event()
        self.logs = []
        self.thread = None
        self.start_time = None
        # Set to False once nvidia-smi turns out to be missing, so the failure
        # is reported a single time instead of on every sample.
        self._gpu_query_enabled = True

        with open(self.output_path, 'w') as f:
            f.write('[]')

    def _collect_gpu_resources(self):
        """Return one dict per GPU reported by nvidia-smi, or [] if unavailable."""
        if not getattr(self, '_gpu_query_enabled', True):
            return []

        gpu_resources = []
        try:
            command = [
                "nvidia-smi",
                "--query-gpu=index,name,utilization.gpu,memory.total,memory.used,memory.free",
                "--format=csv,noheader,nounits"
            ]
            result = subprocess.run(
                command,
                capture_output=True,
                text=True,
                check=True,
                encoding='utf-8'
            )
            # One comma-separated line per GPU, in the order of --query-gpu.
            for line in result.stdout.strip().split('\n'):
                if not line:
                    continue
                idx_str, name, util_str, mem_total_str, mem_used_str, mem_free_str = line.split(',')
                gpu_resources.append({
                    'gpu_id': int(idx_str.strip()),
                    'gpu_name': name.strip(),
                    'gpu_load': float(util_str.strip()),
                    'gpu_memory_total': int(mem_total_str.strip()),
                    'gpu_memory_used': int(mem_used_str.strip()),
                    'gpu_memory_free': int(mem_free_str.strip())
                })
        except FileNotFoundError:
            # The executable does not exist, so retrying cannot succeed.
            self._gpu_query_enabled = False
            print("GPU tracking disabled: nvidia-smi not found.")
        except Exception as e:
            print(f"GPU tracking error: {e}")
        return gpu_resources

    def _collect_resources(self):
        """Take one sample of all tracked resources and return it as a dict."""
        elapsed_seconds = time.time() - self.start_time

        # interval=1 makes psutil block for one second to measure CPU usage.
        cpu_percent = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage('/')

        gpu_resources = self._collect_gpu_resources()

        return {
            'elapsed_seconds': elapsed_seconds,
            'cpu_percent': cpu_percent,
            'memory': {
                'total': memory.total,
                'available': memory.available,
                'used': memory.used,
                'percent': memory.percent
            },
            'disk': {
                'total': disk.total,
                'used': disk.used,
                'free': disk.free,
                'percent': disk.percent
            },
            'gpus': gpu_resources
        }

    def _logging_thread(self):
        """Thread body: sample until the stop event is set."""
        while not self.stop_event.is_set():
            next_log_time = time.time() + self.interval
            try:
                resource_entry = self._collect_resources()
                self.logs.append(resource_entry)
                print(f"Logged resource entry at elapsed time: {resource_entry['elapsed_seconds']:.2f}s")
            except Exception as e:
                print(f"Error during resource collection: {e}")
            # Sleep only for what is left of the interval; waiting on the event
            # lets stop() interrupt the pause.
            wait_time = max(0, next_log_time - time.time())
            self.stop_event.wait(timeout=wait_time)

    def start(self):
        """Start sampling in a daemon thread; does nothing if already running."""
        if self.thread is not None and self.thread.is_alive():
            print("Logger already running.")
            return
        print(f"Starting resource logger (interval: {self.interval}s) at {self.output_path}")
        self.stop_event.clear()
        self.logs = []
        self.start_time = time.time()
        self.thread = threading.Thread(target=self._logging_thread, daemon=True)
        self.thread.start()

    def stop(self):
        """Stop the sampling thread and write the collected samples to disk."""
        if self.thread is None or not self.thread.is_alive():
            print("Logger thread not running.")
            return
        print("Stopping resource logger...")
        self.stop_event.set()
        self.thread.join(timeout=5.0)
        if self.thread.is_alive():
            print("Warning: Logger thread did not stop gracefully.")
        self._save_logs()
        self.thread = None

    def _save_logs(self):
        """Overwrite the output file with the samples collected so far."""
        with open(self.output_path, 'w') as f:
            json.dump(self.logs, f, indent=2)
        print(f"Resource logs saved to {self.output_path} ({len(self.logs)} entries)")


def extract_resource_data(input_file, output_file):
    """Flatten a JSON resource log into a CSV of utilisation percentages.

    ``input_file`` is a JSON list of entries with ``elapsed_seconds``,
    ``cpu_percent``, ``gpus``, ``disk`` and ``memory`` keys. One CSV row is
    written per entry; the GPU load is taken from the first GPU, or 0 when the
    entry lists none.
    """
    columns = ['elapsed_minutes', 'cpu_percent', 'gpu_load', 'disk_percent', 'memory_percent']

    with open(input_file, 'r') as source:
        entries = json.load(source)

    def as_row(entry):
        # Seconds are converted to minutes for the time column.
        return {
            'elapsed_minutes': entry["elapsed_seconds"] / 60.0,
            'cpu_percent': entry["cpu_percent"],
            'gpu_load': entry["gpus"][0]["gpu_load"] if entry["gpus"] else 0,
            'disk_percent': entry["disk"]["percent"],
            'memory_percent': entry["memory"]["percent"]
        }

    rows = [as_row(entry) for entry in entries]

    with open(output_file, 'w', newline='') as target:
        table = csv.DictWriter(target, fieldnames=columns)
        table.writeheader()
        table.writerows(rows)


def plot_resource_utilization(data_file, output_name, title):
    """Plot CPU, GPU and disk utilisation over time and save the figure as PDF.

    Args:
        data_file: CSV with ``elapsed_minutes``, ``cpu_percent``, ``gpu_load``
            and ``disk_percent`` columns.
        output_name: Path of the PDF file to write.
        title: Title shown above the plot.
    """
    df = pd.read_csv(data_file)

    # (CSV column, legend label, line colour, marker)
    series_specs = (
        ('cpu_percent', 'CPU Utilization (%)', 'blue', 'o'),
        ('gpu_load', 'GPU Utilization (%)', 'red', 's'),
        ('disk_percent', 'Disk Utilization (%)', 'green', '^'),
    )

    plt.figure(figsize=(8, 5))
    for column, legend_label, line_color, point_marker in series_specs:
        plt.plot(df['elapsed_minutes'], df[column], label=legend_label,
                 color=line_color, marker=point_marker, linewidth=2)

    plt.xlabel('Elapsed Time (minutes)')
    plt.ylabel('Utilization Percentage (%)')
    plt.title(title)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    plt.ylim(0, 100)
    plt.tight_layout()

    # Save before showing the figure.
    plt.savefig(output_name, format='pdf')
    print(f"Plot saved as {output_name}")
    plt.show()


In [1]:
!pip install https://github.com/kpu/kenlm/archive/master.zip


Collecting https://github.com/kpu/kenlm/archive/master.zip
  Downloading https://github.com/kpu/kenlm/archive/master.zip
[2K     [32m\[0m [32m553.6 kB[0m [31m1.5 MB/s[0m [33m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: kenlm
  Building wheel for kenlm (pyproject.toml) ... [?25l[?25hdone
  Created wheel for kenlm: filename=kenlm-0.2.0-cp311-cp311-linux_x86_64.whl size=3185035 sha256=4c0f5b6d2c88ff6308b3162fa09c20e52fac9856baf64e2214321c26a4104d88
  Stored in directory: /tmp/pip-ephem-wheel-cache-38yqfiig/wheels/4e/ca/6a/e5da175b1396483f6f410cdb4cfe8bc8fa5e12088e91d60413
Successfully built kenlm
Installing collected packages: kenlm
Successfully installed kenlm-0.2.0


In [2]:
!apt-get install -y build-essential cmake libboost-all-dev zlib1g-dev libbz2-dev liblzma-dev
!git clone https://github.com/kpu/kenlm.git
%cd kenlm
!mkdir build && cd build && cmake .. && make -j4


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
libbz2-dev is already the newest version (1.0.8-5build1).
libbz2-dev set to manually installed.
liblzma-dev is already the newest version (5.2.5-2ubuntu1).
liblzma-dev set to manually installed.
libboost-all-dev is already the newest version (1.74.0.3ubuntu7).
cmake is already the newest version (3.22.1-1ubuntu1.22.04.2).
zlib1g-dev is already the newest version (1:1.2.11.dfsg-2ubuntu9.2).
zlib1g-dev set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
Cloning into 'kenlm'...
remote: Enumerating objects: 14185, done.[K
remote: Counting objects: 100% (136/136), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 14185 (delta 92), reused 64 (delta 61), pack-reused 14049 (from 4)[K
Receiving objects: 100% (14185/14185), 5.98 MiB | 9.43 MiB/s, done.
Resolving deltas: 100% 