In [None]:
!python3 -m pip install pymongo[srv]
!pip3 install dnspython

Collecting dnspython
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython
Successfully installed dnspython-2.7.0


In [None]:
pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn_crfsuite-0.5.0


In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn_crfsuite import CRF
from sklearn.model_selection import GridSearchCV

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer

# Download required NLTK data.
# NOTE(review): 'averaged_perceptron_tagger_eng' is presumably the model used
# by the nltk.pos_tag calls made during feature extraction, and 'punkt' is
# presumably the tokenizer data for word_tokenize (imported in this cell but
# not called in the visible cells) -- confirm both against the installed NLTK
# version.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')



In [None]:
# Load the raw CSVs. `data` keeps the untouched training frame; `df` is the
# working copy whose list-like columns are parsed below.
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
df = data.copy()

# The list columns are stored as the string representation of a Python list,
# so parse them back into real lists.
df['Sentence'] = df['Sentence'].apply(literal_eval)
df['NER Tag'] = df['NER Tag'].apply(literal_eval)

test['Sentence'] = test['Sentence'].apply(literal_eval)

# Collect every distinct tag. The loop variable is called `sentence_tags` so it
# does not shadow the `tags` result being built.
tags = {tag for sentence_tags in df['NER Tag'] for tag in sentence_tags}

# Print in sorted order: a raw set prints in hash order, which changes between
# kernel restarts and makes the cell output irreproducible.
print("Unique tags:", sorted(tags))

Unique tags: {'B-org', 'I-per', 'I-tim', 'I-eve', 'B-geo', 'B-gpe', 'I-org', 'I-geo', 'I-art', 'B-art', 'B-eve', 'I-gpe', 'I-nat', 'B-nat', 'B-per', 'O', 'B-tim'}


In [None]:

# Feature extraction for the CRF model

# Building the features for the full training set takes time, wait for a few minutes
def word2features(sent, i, pos_tags=None, stemmer=None):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of token strings making up one sentence.
        i: Index of the token to describe.
        pos_tags: Optional result of ``nltk.pos_tag(sent)`` (one
            ``(token, tag)`` pair per token). When omitted the sentence is
            tagged here, which is what a plain ``word2features(sent, i)``
            call has always done.
        stemmer: Optional object exposing ``stem(str) -> str``. When omitted a
            new ``PorterStemmer`` is created.

    Returns:
        dict mapping feature names to values: lower-cased word and stem,
        casing/digit flags, length, fixed-width prefixes and suffixes, the
        POS tag, a character-shape string, and the word/POS of the previous
        and next token (``'<START>'`` / ``'<END>'`` at sentence boundaries).
    """
    # Tagging and stemmer construction are per-sentence work; callers that
    # process a whole sentence should pass them in so they run only once.
    if pos_tags is None:
        pos_tags = nltk.pos_tag(sent)
    if stemmer is None:
        stemmer = PorterStemmer()

    word = sent[i]
    lowered = word.lower()
    word_pos = pos_tags[i][1]
    has_prev = i > 0
    has_next = i < len(sent) - 1

    features = {
        # Word features
        'word': lowered,
        'stem': stemmer.stem(lowered),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.length': len(word),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[:3]': word[:3],
        'word[:2]': word[:2],

        # POS features
        'pos': word_pos,
        'pos[:2]': word_pos[:2],

        # Shape features: upper -> 'X', lower -> 'x', digit -> 'd', and any
        # other character is kept as-is.
        'word.shape': ''.join(['X' if c.isupper() else 'x' if c.islower()
                               else 'd' if c.isdigit() else c for c in word]),

        # Context features
        'pos-1': pos_tags[i - 1][1] if has_prev else '<START>',
        'pos+1': pos_tags[i + 1][1] if has_next else '<END>',
        'word-1': sent[i - 1].lower() if has_prev else '<START>',
        'word+1': sent[i + 1].lower() if has_next else '<END>',
    }

    # Prefix/suffix patterns, emitted only when the word is long enough to
    # have a prefix/suffix of that width.
    for n in range(1, 4):
        if len(word) >= n:
            features[f'prefix_{n}'] = word[:n]
            features[f'suffix_{n}'] = word[-n:]

    return features


def sent2features(sent):
    """Return the list of per-token feature dicts for one sentence.

    The sentence is POS-tagged once and a single stemmer is shared across all
    of its tokens, instead of repeating both for every token.
    """
    if len(sent) == 0:
        # Nothing to tag; an empty sentence yields no feature dicts.
        return []
    pos_tags = nltk.pos_tag(sent)
    stemmer = PorterStemmer()
    return [word2features(sent, i, pos_tags, stemmer) for i in range(len(sent))]

# Turn every training sentence into its list of per-token feature dicts; the
# labels are the parsed tag lists, kept as the DataFrame column.
X_train = list(map(sent2features, df['Sentence']))
y_train = df['NER Tag']




In [None]:
# Hyper-parameter grid: only c1 and c2 vary (two values each), every other
# setting is pinned to a single value.
params = dict(
    algorithm=['lbfgs'],
    c1=[0.1, 1.0],
    c2=[0.1, 1.0],
    max_iterations=[100],
    all_possible_transitions=[True],
)

# 2-fold cross-validated search over the grid, using all available cores.
crf = CRF()
gs = GridSearchCV(estimator=crf, param_grid=params, cv=2, verbose=1, n_jobs=-1)
gs.fit(X_train, y_train)

# best_params_ only exists once the search has been fitted.
print("Best parameters:", gs.best_params_)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
Best parameters: {'algorithm': 'lbfgs', 'all_possible_transitions': True, 'c1': 0.1, 'c2': 1.0, 'max_iterations': 100}


In [None]:
import joblib

# Persist the best estimator found by the grid search so it can be reloaded
# later without retraining.
model_filename = 'crf_model.joblib'
best_crf = gs.best_estimator_
joblib.dump(best_crf, filename=model_filename)
print(f"Model saved to {model_filename}")

Model saved to crf_model.joblib


In [None]:
import joblib

# Reload the persisted model from disk.
# NOTE: joblib files are pickle-based; only load files you created or trust.
loaded_model = joblib.load('crf_model.joblib')

# Featurise the test sentences with the same extractor used for training.
X_test = list(map(sent2features, test['Sentence']))

# Predict one tag sequence per test sentence.
y_pred = loaded_model.predict(X_test)

# Build the submission: each predicted sequence is stored as its string form,
# alongside the matching test id.
submission = pd.DataFrame({
    'id': test['id'],
    'NER Tag': list(map(str, y_pred)),
})
submission.to_csv('submission.csv', index=False)