In [None]:
import json

def load_ndjson(path):
    """Parse a newline-delimited JSON file into a list of records.

    Every line is stripped of surrounding whitespace. Lines that are empty
    after stripping are skipped; each remaining line is decoded with
    ``json.loads``. The file is opened as UTF-8 text.

    Args:
        path: Location of the file to read.

    Returns:
        A list holding one decoded object per non-blank line, in file order.
    """
    with open(path, encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]

# Read every split from disk; the training set is stored as four shards.
train_1, train_2, train_3, train_4 = (
    load_ndjson(shard_path)
    for shard_path in ("train00.json", "train01.json", "train02.json", "train03.json")
)
dev_data = load_ndjson("valid.json")
test_data = load_ndjson("test.json")

# Concatenate the shards, in order, into a single training list.
train_data = [*train_1, *train_2, *train_3, *train_4]
print(f"Train sentences: {len(train_data)}  | Dev: {len(dev_data)} | Test: {len(test_data)}")

Train sentences: 59924  | Dev: 8528 | Test: 8262


In [None]:
# Optional: quick alignment check
def check_alignment(data, name):
    """Print how many entries have a different number of tokens and tags.

    Args:
        data: Iterable of entries indexable by the keys ``"tokens"`` and
            ``"tags"``.
        name: Label for the split; used as the prefix of the printed line.

    Returns:
        None. The count is only printed.
    """
    bad = 0
    for entry in data:
        if len(entry["tokens"]) != len(entry["tags"]):
            bad += 1
    print(f"{name}: {bad} misaligned sentences")

# Run the alignment report once per split, in train / dev / test order.
for split_name, split_data in (
    ("train", train_data),
    ("dev", dev_data),
    ("test", test_data),
):
    check_alignment(split_data, split_name)


train: 0 misaligned sentences
dev: 0 misaligned sentences
test: 0 misaligned sentences


# Step 2: Load Tag Dictionary and Convert Sentences


In [None]:
# Read the tag-string -> index dictionary from disk, then invert it so that
# label indices can be mapped back to their tag strings.
with open("label.json", encoding="utf-8") as label_file:
    tag2idx = json.load(label_file)

idx2tag = dict((index, tag) for tag, index in tag2idx.items())

# Convert dataset entries into (word, tag_str) pairs per sentence
def to_pairs(data, tag_lookup=None):
    """Convert dataset entries into per-sentence lists of ``(token, tag)`` pairs.

    Args:
        data: Iterable of entries indexable by ``"tokens"`` and ``"tags"``.
            Each tag is passed through ``int()`` before being looked up.
        tag_lookup: Optional mapping from integer tag id to tag string.
            When omitted, the notebook-level ``idx2tag`` mapping is used, so
            existing one-argument calls behave as before.

    Returns:
        A list with one entry per sentence; each entry is a list of
        ``(token, tag_string)`` tuples. ``zip`` stops at the shorter of the
        two sequences, so a misaligned entry is truncated rather than
        rejected.

    Raises:
        KeyError: If a tag id is not present in the mapping.
    """
    # Resolve the mapping once; the global is only consulted as a fallback.
    lookup = idx2tag if tag_lookup is None else tag_lookup

    sents = []
    for entry in data:
        tag_strings = [lookup[int(tag_id)] for tag_id in entry["tags"]]
        sents.append(list(zip(entry["tokens"], tag_strings)))
    return sents

# Convert each split into lists of (token, tag string) pairs.
train_sents, dev_sents, test_sents = (
    to_pairs(split) for split in (train_data, dev_data, test_data)
)
print("Sample sentence:", train_sents[0])


✅ Sample sentence: [('John', 'B-PERSON'), ('lives', 'O'), ('in', 'O'), ('New', 'B-GPE'), ('York', 'I-GPE')]


#  Step 3: Feature Extraction Function


In [None]:
import re
import string

# Character classes (ASCII uppercase letters / decimal digits) kept as sets.
UP = {*string.ascii_uppercase}
DIG = {*string.digits}

def shape(word):
    """Collapse a token into its character-class pattern (e.g. ``"Xxdd"``).

    Uppercase characters become ``"X"``, digits become ``"d"``, lowercase
    characters become ``"x"``; anything else (punctuation, symbols, ...) is
    copied through unchanged.

    Classification uses the ``str`` predicates rather than ASCII lookup
    tables, so accented and other non-ASCII letters are abstracted as well
    instead of leaking the raw character into the shape (previously
    ``"Zürich"`` produced ``"Xüxxxx"``). This also matches the
    ``isupper``/``isdigit``-style checks used for the other token features.

    Args:
        word: Token text.

    Returns:
        A string of the same length as ``word`` describing its shape; the
        empty string for an empty token.
    """
    pattern = []
    for ch in word:
        if ch.isupper():
            pattern.append("X")
        elif ch.isdigit():
            pattern.append("d")
        elif ch.islower():
            pattern.append("x")
        else:
            # Not a cased letter or digit: keep the character itself.
            pattern.append(ch)
    return "".join(pattern)

def extract_features(sent, i, prev_tag):
    """Build the feature dictionary for token ``i`` of ``sent``.

    Args:
        sent: Sequence whose items expose the token text at index 0
            (for example ``(token, tag)`` pairs).
        i: Index of the token to describe.
        prev_tag: Tag assigned to the preceding token; stored verbatim under
            ``"prev_tag"``.

    Returns:
        A dict containing a constant bias, the lowercased current, previous
        and next words (``"<START>"`` / ``"<END>"`` at the sentence edges),
        the previous tag, casing / digit / punctuation flags, the word shape,
        and lowercased prefixes and suffixes of length 1 to 3.
    """
    w = sent[i][0]
    w_low = w.lower()
    prev_w = sent[i - 1][0].lower() if i > 0 else "<START>"
    next_w = sent[i + 1][0].lower() if i < len(sent) - 1 else "<END>"

    feats = {
        "bias": 1.0,
        "w": w_low,
        "pw": prev_w,
        "nw": next_w,
        "prev_tag": prev_tag,
        "is_title": w.istitle(),
        "is_upper": w.isupper(),
        "is_lower": w.islower(),
        "has_digit": any(ch.isdigit() for ch in w),
        # all() is True for an empty iterable, so an empty token would be
        # reported as punctuation; require at least one character.
        "is_punct": bool(w) and all(ch in string.punctuation for ch in w),
        "shape": shape(w),
        "pref1": w_low[:1],
        "pref2": w_low[:2],
        "pref3": w_low[:3],
        "suf1": w_low[-1:],
        "suf2": w_low[-2:],
        "suf3": w_low[-3:],
    }
    return feats

# Step 4: Prepare the Training Data

In [None]:
# Flatten the training sentences into parallel lists of per-token feature
# dicts and gold tags. The gold tag of the previous token is fed in as
# `prev_tag`; "O" is used before the first token of every sentence.
X_train_feats, y_train_labels = [], []

for sent in train_sents:
    prev_tag = "O"
    for i, pair in enumerate(sent):
        tag = pair[1]
        X_train_feats.append(extract_features(sent, i, prev_tag))
        y_train_labels.append(tag)
        prev_tag = tag

print("✅ Sample features:", X_train_feats[0])
print("✅ Sample label:", y_train_labels[0])

✅ Sample features: {'bias': 1.0, 'w': 'john', 'pw': '<START>', 'nw': 'lives', 'prev_tag': 'O', 'is_title': True, 'is_upper': False, 'is_lower': False, 'has_digit': False, 'is_punct': False, 'shape': 'Xxxx', 'pref1': 'j', 'pref2': 'jo', 'pref3': 'joh', 'suf1': 'n', 'suf2': 'hn', 'suf3': 'ohn'}
✅ Sample label: B-PERSON


# Step 5: Train the MEMM (Logistic Regression)

In [None]:

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Local classifier of the MEMM: vectorize the feature dicts into a sparse
# matrix, then fit a logistic regression on (features, tag) pairs.
memm = Pipeline(steps=[
    ("vec", DictVectorizer(sparse=True)),
    ("clf", LogisticRegression(n_jobs=-1, max_iter=300)),
])

memm.fit(X_train_feats, y_train_labels)
print("✅ MEMM training complete.")

✅ MEMM training complete.


# Predict Tags for a Sentence


In [None]:
def memm_predict(sent, model=None):
    """Greedily tag a sentence from left to right.

    Each token's features are built with the tag predicted for the previous
    token (``"O"`` before the first token), and the single best prediction is
    kept at every step.

    Args:
        sent: Sentence in the form expected by ``extract_features`` (items
            whose first element is the token text).
        model: Object with a ``predict`` method accepting a list of feature
            dicts. Optional, so this definition accepts the same two-argument
            call as the later ``memm_predict(sent, model)`` in this notebook;
            when omitted the notebook-level ``memm`` pipeline is used, as
            before.

    Returns:
        A list with one predicted tag per token in ``sent``.
    """
    if model is None:
        model = memm  # fall back to the notebook-level pipeline

    tags = []
    prev_tag = "O"
    for i in range(len(sent)):
        feats = extract_features(sent, i, prev_tag)
        prev_tag = model.predict([feats])[0]
        tags.append(prev_tag)
    return tags

# Evaluate on the validation set


In [None]:
from sklearn.metrics import classification_report

# Decode every validation sentence and pool the token-level gold and
# predicted tags for a single report.
y_true, y_pred = [], []

for sent in dev_sents:
    y_true += [gold for _word, gold in sent]
    y_pred += memm_predict(sent)

print("✅ valid set evaluation:")
print(classification_report(y_true, y_pred, digits=3))

✅ valid set evaluation:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

   B-CARDINAL      0.000     0.000     0.000       937
       B-DATE      0.000     0.000     0.000      1507
      B-EVENT      0.000     0.000     0.000       143
        B-FAC      0.000     0.000     0.000       115
        B-GPE      0.207     0.025     0.045      2268
   B-LANGUAGE      0.000     0.000     0.000        33
        B-LAW      0.000     0.000     0.000        40
        B-LOC      0.000     0.000     0.000       204
      B-MONEY      0.000     0.000     0.000       271
       B-NORP      0.000     0.000     0.000       847
    B-ORDINAL      0.000     0.000     0.000       232
        B-ORG      0.444     0.002     0.005      1740
    B-PERCENT      0.000     0.000     0.000       177
     B-PERSON      0.108     0.303     0.160      2020
    B-PRODUCT      0.000     0.000     0.000        72
   B-QUANTITY      0.000     0.000     0.000       100
       B-TIME      0.000     0.000     0.000       214
B-WORK_OF

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Find the best parameters for the model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Pipeline whose classifier hyper-parameters are searched below.
pipeline = Pipeline(steps=[
    ("vec", DictVectorizer(sparse=True)),
    ("clf", LogisticRegression(solver="liblinear", max_iter=300)),
])

# Candidate values, addressed as <step name>__<parameter name>.
param_grid = {
    "clf__C": [0.1, 0.5, 1.0, 2.0, 5.0],
    "clf__penalty": ["l2"],
    "clf__class_weight": [None, "balanced"],
}

# Exhaustive search over the grid with 3-fold cross-validation.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train_feats, y_train_labels)

print("✅ Best parameters:", grid.best_params_)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
✅ Best parameters: {'clf__C': 0.1, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2'}




In [None]:


# Flatten the validation sentences into per-token feature dicts and gold
# tags, using the gold previous tag ("O" at the start of each sentence).
X_dev_feats, y_dev_labels = [], []

for sent in dev_sents:
    prev_tag = "O"
    for i, pair in enumerate(sent):
        tag = pair[1]
        X_dev_feats.append(extract_features(sent, i, prev_tag))
        y_dev_labels.append(tag)
        prev_tag = tag

# We prepared the validation set; now retrain the model with the new parameters on train + validation

In [None]:
# Pool the training and validation examples for the final fit.
X_all_feats = X_train_feats + X_dev_feats
y_all_labels = y_train_labels + y_dev_labels

# Use the configuration selected by the grid search. Previously the original
# `memm` pipeline was refit here with its initial hyper-parameters, so the
# tuning result was never applied. With GridSearchCV's default refit=True,
# `best_estimator_` is the winning pipeline; fitting it again retrains that
# same object on train + valid.
memm = grid.best_estimator_
memm.fit(X_all_feats, y_all_labels)
print(" Retrained MEMM on train + valid.")

✅ Retrained MEMM on train + valid.


In [None]:
from sklearn.metrics import classification_report

def memm_predict(sent, model):
    """Greedily tag ``sent`` from left to right with ``model``.

    The tag predicted for one token is passed as ``prev_tag`` when building
    the features of the next token; ``"O"`` is used before the first token.

    Args:
        sent: Sentence in the form expected by ``extract_features``.
        model: Object whose ``predict`` method accepts a list of feature
            dicts.

    Returns:
        A list with one predicted tag per token in ``sent``.
    """
    decoded = []
    for position in range(len(sent)):
        history = decoded[-1] if decoded else "O"
        token_feats = extract_features(sent, position, history)
        decoded.append(model.predict([token_feats])[0])
    return decoded

# Decode every test sentence with the retrained model and pool the
# token-level gold and predicted tags for the final report.
y_true, y_pred = [], []

for sent in test_sents:
    gold_tags = [pair_tag for _word, pair_tag in sent]
    y_true += gold_tags
    y_pred += memm_predict(sent, memm)

print("Final test evaluation:")
print(classification_report(y_true, y_pred, digits=3))

Final test evaluation:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

   B-CARDINAL      0.593     0.686     0.636       935
       B-DATE      0.710     0.679     0.694      1602
      B-EVENT      0.222     0.032     0.056        63
        B-FAC      0.667     0.059     0.109       135
        B-GPE      0.786     0.861     0.822      2240
   B-LANGUAGE      0.625     0.227     0.333        22
        B-LAW      0.667     0.050     0.093        40
        B-LOC      0.559     0.318     0.406       179
      B-MONEY      0.623     0.543     0.580       313
       B-NORP      0.740     0.835     0.784       841
    B-ORDINAL      0.574     0.815     0.674       195
        B-ORG      0.582     0.456     0.512      1795
    B-PERCENT      0.800     0.711     0.753       349
     B-PERSON      0.707     0.718     0.713      1988
    B-PRODUCT      0.583     0.184     0.280        76
   B-QUANTITY      0.320     0.152     0.206       105
       B-TIME      0.661     0.340     0.449       212
B-WORK_OF

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
