## Named Entity Recognition with Conditional Random Fields

#### Install Required Libraries

In [82]:
!pip install -q spacy
!python -m spacy download ru_core_news_sm
!pip install -q sklearn-crfsuite

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting ru-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl (15.3 MB)
                                              0.0/15.3 MB ? eta -:--:--
                                              0.0/15.3 MB ? eta -:--:--
                                             0.0/15.3 MB 320.0 kB/s eta 0:00:48
                                             0.0/15.3 MB 262.6 kB/s eta 0:00:58
                                             0.0/15.3 MB 262.6 kB/s eta 0:00:58
                                             0.1/15.3 MB 252.2 kB/s eta 0:01:01
                                             0.1/15.3 MB 327.7 kB/s eta 0:00:47
                                             0.1/15.3 MB 344.8 kB/s eta 0:00:44
                                             0.1/15.3 MB 343.4 kB/s eta 0:00:45
                                            

#### Load Libraries and SpaCy Model

In [83]:
import json
import warnings
import zipfile

import sklearn_crfsuite
import spacy

In [84]:
# Load the Russian spaCy pipeline fetched by the `spacy download` command in
# the install cell. The module-level `nlp` object is called by
# process_training_data and process_test_data to turn raw text into tokens.
nlp = spacy.load('ru_core_news_sm')

#### Declare Functions

In [85]:
def load_data(file_path):
    """
    Load JSONL data from a file.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising a JSON decoding error.

    Parameters:
    file_path (str): The path to the UTF-8 encoded JSONL file to load.

    Returns:
    list: The parsed JSON values, one per non-blank line, in file order.

    Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            # json.loads('') raises, so tolerate empty separator lines.
            if not stripped:
                continue
            records.append(json.loads(stripped))
    return records


In [86]:
def process_training_data(data):
    """
    Tokenize labelled documents and attach an entity label to every token.

    Parameters:
    data (list): A list of dictionaries. Each one provides the raw text under
        'sentences' and, optionally, a list of (start, end, label) entity
        spans under 'ners'.

    Returns:
    list: One entry per document; each entry is a list of (token, label)
        tuples, where the label is 'O' for tokens outside every span.
    """
    documents = []

    for record in data:
        doc = nlp(record['sentences'])
        tags = ['O'] * len(doc)

        # Spans are applied in their listed order, so when spans overlap the
        # later one overwrites the label written by the earlier one.
        for span_start, span_end, tag in record.get('ners', []):
            for tok in doc:
                tok_end = tok.idx + len(tok.text)
                # Only tokens lying completely inside the span are labelled.
                if span_start <= tok.idx and tok_end <= span_end:
                    tags[tok.i] = tag

        documents.append(list(zip(doc, tags)))

    return documents


In [87]:
def process_test_data(data):
    """
    Process test data, preparing for feature extraction without labels.

    The text is read from the misspelled key 'senences' when a record has it
    (the key this function has always used); records that use the correctly
    spelled 'sentences' key are accepted as well.

    Parameters:
    data (list): A list of dictionaries, each with an 'id' and the raw text
        under 'senences' or 'sentences'.

    Returns:
    list: One entry per document; each entry is a list of
        (token, 'O', document_id) triples. The 'O' is a placeholder label.

    Raises:
    KeyError: If a record has no 'id', or neither text key.
    """
    processed_data = []
    for item in data:
        # Prefer the misspelled key for backward compatibility, then fall
        # back to the correct spelling instead of failing outright.
        text = item['senences'] if 'senences' in item else item['sentences']
        doc = nlp(text)
        document_id = item['id']
        tokens_labels = [(token, 'O', document_id) for token in doc]
        processed_data.append(tokens_labels)
    return processed_data


In [88]:
def extract_features(doc):
    """
    Build the feature dictionaries for every token of one document.

    Parameters:
    doc (list): The tokens of the document, in order.

    Returns:
    list: One feature dictionary per token, in the same order as `doc`.
    """
    feature_rows = []
    for position, tok in enumerate(doc):
        # The position in the sequence is forwarded as the token index.
        feature_rows.append(word2features(tok, position))
    return feature_rows


In [89]:
def word2features(token, i):
    """
    Generate CRF features for a token and its immediate neighbours.

    Parameters:
    token (spacy.Token): The token to generate features for. Only its `text`
        and `doc` attributes are read.
    i (int): Position of `token` inside `token.doc`; used to look up the
        previous/next tokens and to set the 'BOS'/'EOS' flags.

    Returns:
    dict: A dictionary containing the features for the token. Context
        features prefixed with '-1:' are present only when a previous token
        exists, and those prefixed with '+1:' only when a next token exists.
    """
    text = token.text
    doc = token.doc
    last_index = len(doc) - 1

    # Features for the token itself
    features = {
        'bias': 1.0,
        'word.lower()': text.lower(),
        'word[-3:]': text[-3:],
        'word[-2:]': text[-2:],
        'word.isupper()': text.isupper(),
        'word.istitle()': text.istitle(),
        'word.isdigit()': text.isdigit(),
        'BOS': i == 0,
        'EOS': i == last_index,
    }

    # Features of the previous token
    if i > 0:
        prev_text = doc[i - 1].text
        features.update({
            '-1:word.lower()': prev_text.lower(),
            '-1:word.istitle()': prev_text.istitle(),
            '-1:word.isupper()': prev_text.isupper(),
        })

    # Features of the next token
    if i < last_index:
        next_text = doc[i + 1].text
        features.update({
            '+1:word.lower()': next_text.lower(),
            '+1:word.istitle()': next_text.istitle(),
            # Key carries the "()" suffix like every other predicate feature
            # (it used to be '+1:word.isupper'); models must be retrained
            # with this function for the feature name to match.
            '+1:word.isupper()': next_text.isupper(),
        })
    return features


In [90]:
def save_predictions_to_jsonl(data, predictions, filename='test.jsonl'):
    """
    Write predicted entity spans to a JSONL file, one document per line.

    Parameters:
    data (list): Per-document lists of (token, label, document_id) triples.
    predictions (list): Per-document lists of predicted labels, aligned with
        the tokens in `data`.
    filename (str): The name of the file to save the predictions to.

    Returns:
    None
    """
    records = []
    for entry, labels in zip(data, predictions):
        # The document id is taken from the first token triple of the entry.
        record = {'id': entry[0][2], 'ners': []}
        for (tok, _, _doc_id), tag in zip(entry, labels):
            if tag == 'O':
                continue
            # Every labelled token becomes its own span; the end offset is
            # the index of the token's last character.
            first_char = tok.idx
            last_char = first_char + len(tok.text) - 1
            record['ners'].append([first_char, last_char, tag])
        records.append(record)

    # All records are built before the file is opened for writing.
    with open(filename, 'w', encoding='utf-8') as out:
        for record in records:
            out.write(json.dumps(record, ensure_ascii=False))
            out.write('\n')


In [91]:
def zip_file(input_filename, output_filename='test.zip'):
    """
    Create a deflate-compressed zip archive containing a single file.

    Parameters:
    input_filename (str): The name of the file to zip.
    output_filename (str): The name of the output zip file.

    Returns:
    None
    """
    archive = zipfile.ZipFile(
        output_filename,
        mode='w',
        compression=zipfile.ZIP_DEFLATED,
    )
    # The context manager closes the archive once the file has been added.
    with archive:
        archive.write(input_filename)


#### Train and Predict with CRF

In [92]:
# Read the labelled corpus and tokenize it into (token, label) pairs
train_data = process_training_data(load_data('../data/train.jsonl'))
# Feature dictionaries (X) and gold labels (y), one sequence per document
X_train = [extract_features([token for token, _ in doc]) for doc in train_data]
y_train = [[label for _, label in doc] for doc in train_data]

In [93]:
# Train CRF model
crf = sklearn_crfsuite.CRF(algorithm='l2sgd')
try:
    crf.fit(X_train, y_train)
except AttributeError as exc:
    # NOTE(review): presumably a workaround for an sklearn-crfsuite /
    # scikit-learn incompatibility where fit() raises AttributeError even
    # though training has finished -- confirm against the installed versions.
    # The error is still tolerated so the notebook keeps running, but it is
    # reported instead of being swallowed silently.
    warnings.warn("crf.fit raised AttributeError and it was ignored: %r" % (exc,))

In [94]:
# Read the unlabelled corpus and tokenize it into (token, 'O', id) triples
test_data = process_test_data(load_data('../data/test.jsonl'))
# Feature dictionaries for every test document
X_test = [extract_features([token for token, _, _ in doc]) for doc in test_data]

In [95]:
# Make predictions
# Tag the test feature sequences with the CRF trained above. `y_pred` is
# later passed to save_predictions_to_jsonl, which pairs it with `test_data`
# document by document.
y_pred = crf.predict(X_test)

In [96]:
# Write the predictions as JSONL, then pack that file into a zip archive
save_predictions_to_jsonl(test_data, y_pred, filename='test.jsonl')
zip_file('test.jsonl', output_filename='test.zip')