In [9]:
# 1.1
# Load and inspect datasets

import re
import json
from collections import Counter
from pathlib import Path
import pandas as pd

# Locations of the two raw corpora (uploaded files)
sherlock_path, linux_path = (
    Path("sherlock_dataset1.txt"),
    Path("linux_dataset2.txt"),
)

# Load both corpora as UTF-8 text; errors="ignore" drops bytes that fail to decode
sherlock_text, linux_text = (
    corpus_path.read_text(encoding="utf-8", errors="ignore")
    for corpus_path in (sherlock_path, linux_path)
)

# Report the size of each corpus in characters
print(f"Sherlock dataset length: {len(sherlock_text):,} characters")
print(f"Linux dataset length: {len(linux_text):,} characters")


Sherlock dataset length: 581,423 characters
Linux dataset length: 6,206,995 characters


In [3]:
# Preprocessing functions for natural (Category I) and code (Category II) datasets

def preprocess_natural(text: str):
    """Tokenize natural-language text into lowercase words and "." tokens.

    Every character other than an ASCII letter, digit, space or dot is
    replaced by a space, the text is lowercased, and each dot is padded with
    spaces so that it becomes a token of its own.

    Args:
        text: Raw text to tokenize.

    Returns:
        A list of non-empty string tokens, in order of appearance.
    """
    cleaned = re.sub(r"[^a-zA-Z0-9 \.]", " ", text).lower()
    # The substitution above leaves the plain space as the only whitespace
    # character, so a bare split() collapses runs and discards empty tokens.
    return cleaned.replace(".", " . ").split()

def preprocess_code(text: str):
    """Tokenize source code on whitespace, marking line breaks with "<NL>".

    Each line is split on whitespace (leading and trailing whitespace is
    ignored). A "<NL>" token is placed between consecutive lines, so blank
    lines show up as consecutive "<NL>" tokens; no "<NL>" follows the final
    line.

    Args:
        text: Raw source text to tokenize.

    Returns:
        A list of string tokens, in order of appearance.
    """
    tokens = []
    for position, line in enumerate(text.splitlines()):
        # Separator goes *before* every line except the first, which leaves
        # the token list without a trailing "<NL>".
        if position > 0:
            tokens.append("<NL>")
        tokens += line.split()
    return tokens

# Tokenize each corpus with the preprocessor matching its category
linux_tokens = preprocess_code(linux_text)
sherlock_tokens = preprocess_natural(sherlock_text)

# Report how many tokens each corpus produced
print(f"Sherlock tokens: {len(sherlock_tokens):,}")
print(f"Linux tokens: {len(linux_tokens):,}")


Sherlock tokens: 115,579
Linux tokens: 1,001,103


In [4]:
from collections import Counter

def build_vocab(tokens):
    """Build token<->index mappings and frequency counts for a token list.

    Indices are assigned in sorted order of the distinct tokens, starting
    at 0.

    Args:
        tokens: Iterable of hashable, mutually sortable tokens.

    Returns:
        A tuple ``(vocab, inv_vocab, counts)`` where ``vocab`` maps token to
        index, ``inv_vocab`` maps index to token, and ``counts`` is a
        ``Counter`` of token frequencies.
    """
    counts = Counter(tokens)
    # Number the distinct tokens in sorted order, then invert that mapping
    inv_vocab = dict(enumerate(sorted(counts)))
    vocab = {token: index for index, token in inv_vocab.items()}
    return vocab, inv_vocab, counts

# One vocabulary (plus inverse mapping and frequency counts) per corpus
l_vocab, l_inv_vocab, l_counts = build_vocab(linux_tokens)
s_vocab, s_inv_vocab, s_counts = build_vocab(sherlock_tokens)

# Report the number of distinct tokens in each corpus
print("Sherlock vocab size:", len(s_vocab))
print("Linux vocab size:", len(l_vocab))


Sherlock vocab size: 8151
Linux vocab size: 113645


In [5]:
def top_bottom_counts(counts, k=10):
    """Return the ``k`` most frequent and ``k`` least frequent tokens.

    Args:
        counts: ``Counter`` mapping token to frequency.
        k: Number of entries to return from each end.

    Returns:
        A tuple ``(most_common, least_common)`` of lists of
        ``(token, count)`` pairs. ``most_common`` comes from
        ``Counter.most_common``; ``least_common`` is ordered by ascending
        count, with ties broken by ascending token.
    """
    def rarity_key(entry):
        token, frequency = entry
        return frequency, token

    ascending = sorted(counts.items(), key=rarity_key)
    return counts.most_common(k), ascending[:k]

# Ten most and ten least frequent tokens for each corpus
s_most, s_least = top_bottom_counts(s_counts)
l_most, l_least = top_bottom_counts(l_counts)

# Print every ranking as a (token, count) table under its heading.
# Headings previously misspelled "Dataset" as "Datset".
for heading, ranked in (
    ("Sherlock Dataset Most Common Tokens:", s_most),
    ("Sherlock Dataset Least Common Tokens:", s_least),
    ("Linux Dataset Most Common Tokens:", l_most),
    ("Linux Dataset Least Common Tokens:", l_least),
):
    print(heading)
    print(pd.DataFrame(ranked, columns=["token", "count"]).head(10))

Sherlock Datset Most Common Tokens:
  token  count
0     .   6431
1   the   5822
2   and   3085
3     i   3038
4    to   2826
5    of   2781
6     a   2700
7    in   1826
8  that   1767
9    it   1749
Sherlock Datset Least Common Tokens:
  token  count
0  10th      1
1    12      1
2  12th      1
3   140      1
4   150      1
5  1500      1
6  1661      1
7   16a      1
8    17      1
9  1846      1
Linux Datset Most Common Tokens:
    token   count
0    <NL>  241464
1       *   33504
2       =   28003
3       {   18915
4      if   17702
5       }   16965
6     the   16080
7      */   13445
8      /*   12190
9  struct   10997
Linux Datset Least Common Tokens:
                          token  count
0          !!(attr->sched_flags      1
1             !!(caps.magic_etc      1
2                !!(file->flags      1
3                      !!(flags      1
4             !!(func_flags.val      1
5                !!(iter->flags      1
6  !!(kprobe_gone(&tk->rp.kp));      1
7          !!(me->mm

In [6]:
def make_xy(tokens, vocab, context_len=5):
    """Build sliding-window (context, next-token) pairs as vocabulary indices.

    For every position ``i`` with a full window available, the context is the
    indices of ``tokens[i:i + context_len]`` and the target is the index of
    ``tokens[i + context_len]``.

    Args:
        tokens: Sequence of tokens.
        vocab: Mapping from token to integer index; must contain every token
            that falls inside a window (a missing token raises ``KeyError``).
        context_len: Number of tokens in each context window.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a list of index lists and ``y`` is
        the list of target indices. Both are empty when ``tokens`` has no
        more than ``context_len`` items.
    """
    window_starts = range(len(tokens) - context_len)
    X = [
        [vocab[token] for token in tokens[start:start + context_len]]
        for start in window_starts
    ]
    y = [vocab[tokens[start + context_len]] for start in window_starts]
    return X, y

# Number of preceding tokens used as context for each next-token target
CONTEXT_LEN = 5

# Sliding-window training pairs for each corpus
lX, ly = make_xy(linux_tokens, l_vocab, CONTEXT_LEN)
sX, sy = make_xy(sherlock_tokens, s_vocab, CONTEXT_LEN)

# Report how many (context, target) pairs were generated
print("Sherlock training pairs:", len(sX))
print("Linux training pairs:", len(lX))


Sherlock training pairs: 115574
Linux training pairs: 1001098


In [7]:
import pandas as pd

def preview_pairs(inv_vocab, X, y, n=5):
    """Decode the first ``n`` (context, target) pairs into a readable table.

    Args:
        inv_vocab: Mapping from integer index to token.
        X: Sequence of context windows, each a sequence of token indices.
        y: Sequence of target token indices, aligned with ``X``.
        n: Maximum number of pairs to show. Values larger than the number of
            available pairs are clamped rather than raising ``IndexError``;
            negative values yield an empty table.

    Returns:
        A ``pandas.DataFrame`` with columns ``"context"`` (space-joined
        context tokens) and ``"target"`` (the next token). The columns are
        present even when no rows are produced.
    """
    # Never index past the end of either sequence
    limit = max(0, min(n, len(X), len(y)))
    examples = [
        {
            "context": " ".join(inv_vocab[idx] for idx in X[row]),
            "target": inv_vocab[y[row]],
        }
        for row in range(limit)
    ]
    # Explicit columns keep the schema stable for an empty preview
    return pd.DataFrame(examples, columns=["context", "target"])

# Show the first decoded (context -> target) pairs of each corpus
print("Sherlock Sample Pairs:")
sherlock_preview = preview_pairs(s_inv_vocab, sX, sy, n=10)
display(sherlock_preview)

print("Linux Sample Pairs:")
linux_preview = preview_pairs(l_inv_vocab, lX, ly, n=5)
display(linux_preview)


Sherlock Sample Pairs:


Unnamed: 0,context,target
0,the project gutenberg ebook of,the
1,project gutenberg ebook of the,adventures
2,gutenberg ebook of the adventures,of
3,ebook of the adventures of,sherlock
4,of the adventures of sherlock,holmes
5,the adventures of sherlock holmes,by
6,adventures of sherlock holmes by,arthur
7,of sherlock holmes by arthur,conan
8,sherlock holmes by arthur conan,doyle
9,holmes by arthur conan doyle,this


Linux Sample Pairs:


Unnamed: 0,context,target
0,/* <NL> * linux/kernel/irq/autoprobe.c <NL>,*
1,<NL> * linux/kernel/irq/autoprobe.c <NL> *,<NL>
2,* linux/kernel/irq/autoprobe.c <NL> * <NL>,*
3,linux/kernel/irq/autoprobe.c <NL> * <NL> *,Copyright
4,<NL> * <NL> * Copyright,(C)


In [10]:
import json

def save_json(obj, path):
    """Serialize ``obj`` as JSON into the file at ``path``.

    The file is opened for writing as UTF-8 text, so existing content is
    replaced.

    Args:
        obj: JSON-serializable object.
        path: Destination file path.
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        json.dump(obj, fp=out_file)

# Save everything: output filename -> object, written in the order listed.
# NOTE: JSON object keys are always strings, so the integer keys of the
# inverse vocabularies are stored as strings in the *_inv_vocab.json files.
artifacts = {
    "sherlock_tokens.json": sherlock_tokens,
    "sherlock_vocab.json": s_vocab,
    "sherlock_inv_vocab.json": s_inv_vocab,
    "sherlock_X.json": sX,
    "sherlock_y.json": sy,
    "linux_tokens.json": linux_tokens,
    "linux_vocab.json": l_vocab,
    "linux_inv_vocab.json": l_inv_vocab,
    "linux_X.json": lX,
    "linux_y.json": ly,
}
for filename, payload in artifacts.items():
    save_json(payload, filename)

print("Saved preprocessed data and vocab files locally.")

Saved preprocessed data and vocab files locally.
