In [1]:
!pip install numpy gensim scikit-learn



In [2]:
def transform_dataset(page_dataset, for_inference):
    labeled_text_dataset = []
    for page in page_dataset:
        page_words = page["representativeData"]["page_data_words"]
        
        geo_dictionary = {}
        if not for_inference:
            page_answers = page.get("answers")
            for page_answer in page_answers[0]["answer"]:
                geo_label = page_answer["id"]
                for geo_part in page_answer["data"]:
                    for index in range(geo_part["start"], geo_part["end"]):
                        geo_dictionary[index] = geo_label
        
        labeled_text = []
        for word_index, word in enumerate(page_words):
            word_label = "0" if for_inference else geo_dictionary.get(word_index, "O")
            labeled_text.append((word, word_label))
        
        if not for_inference:
            labeled_text_dataset.append(labeled_text)
        else:
            labeled_text_dataset.append((page["taskId"], labeled_text))
    
    return labeled_text_dataset

In [3]:
import json

def get_labeled_dataset(dataset_path, for_inference=False):
    with open(dataset_path) as json_dataset:
        dataset = json.load(json_dataset)
        
    labeled_dataset = transform_dataset(dataset["data"]["results"], for_inference)
    return labeled_dataset

In [4]:
def get_validation_result(X_validation, y_pred):
    validation_result = []
    
    for ((task_id, labeled_text), predictions) in zip(X_validation, y_pred):
        answers = {}
        current_label = None
        start_index = None
        
        for current_index, label in enumerate(predictions):
            if label == current_label:
                continue
            else:
                if current_label is not None and current_label != "O":
                    if current_label not in answers:
                        answers[current_label] = []
                    answers[current_label].append({"start": start_index, "end": current_index})
                
                if label != "0":
                    current_label = label
                    start_index = current_index
                else:
                    current_label = None
    
        if current_label is not None and current_label != "O":
            if current_label not in answers:
                answers[current_label] = []
            answers[current_label].append({"start": start_index, "end": len(predictions)})
        
        validation_answers = []
        for label, segments in answers.items():
            validation_answers.append({"id": label, "data": segments})
        
        validation_result.append({
            "taskId": task_id,
            "answer": validation_answers
        })
        
    return validation_result

In [5]:
train_dataset = get_labeled_dataset("datasets/train_geo_extractor.json")

In [6]:
import os
from gensim.models import FastText

model = FastText(sentences=[word for text in train_dataset for word, _ in text], vector_size=30, window=3,
                 min_count=1, workers=os.cpu_count(), sg=1)

In [7]:
import numpy as np

X_train = []
y_train = []

for text in train_dataset:
    for word, label in text:
        word_vector = model.wv[word]
        X_train.append(word_vector)
        y_train.append(label)
    
X_train = np.array(X_train)

In [8]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

In [9]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_jobs=-1)
rfc.fit(X_train, y_train)

RandomForestClassifier(n_jobs=-1)

In [10]:
test_dataset = get_labeled_dataset("datasets/test_geo_extractor.json")

In [11]:
X_test = []
y_test = []

for text in test_dataset:
    for word, label in text:
        word_vector = model.wv[word]
        X_test.append(word_vector)
        y_test.append(label)
    
X_test = np.array(X_test)

In [12]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, matthews_corrcoef

y_pred = rfc.predict(X_test)
y_pred = encoder.inverse_transform(y_pred)

print(classification_report(y_test, y_pred))
print(f"Matthews Correlation Coefficient: {matthews_corrcoef(y_test, y_pred)}")

                   precision    recall  f1-score   support

                O       0.97      0.99      0.98     62822
     central_city       0.42      0.26      0.32       184
      geo_address       0.55      0.14      0.23      1040
     geo_building       0.76      0.47      0.58       453
         geo_city       0.81      0.64      0.71      1433
     geo_district       0.88      0.67      0.76       387
geo_microdistrict       0.63      0.44      0.52       382
       geo_region       0.98      0.99      0.98      1733
geo_region_oblast       0.70      0.76      0.73       297
       geo_street       0.57      0.50      0.53      1059

         accuracy                           0.95     69790
        macro avg       0.73      0.59      0.63     69790
     weighted avg       0.95      0.95      0.95     69790

Matthews Correlation Coefficient: 0.7375178034790815


In [13]:
validation_dataset = get_labeled_dataset("datasets/val_no_answer_geo_extractor.json", for_inference=True)

X_validation = [(task_id, [model.wv[word] for word, _ in text]) for task_id, text in validation_dataset]

X_validation_vectors = [word_vector for _, word_vectors in X_validation for word_vector in word_vectors]
X_validation_vectors = np.array(X_validation_vectors)

y_pred = rfc.predict(X_validation_vectors)
y_pred = encoder.inverse_transform(y_pred)

In [14]:
lengths = [len(text) for _, text in X_validation]

start = 0

y_pred_validation = []

for length in lengths:
    end = start + length
    y_pred_validation.append(y_pred[start:end])
    start = end

In [15]:
validation_result = get_validation_result(X_validation, y_pred_validation)

with open("rfc_validation_result.json", "w", encoding="utf-8") as file:
    json.dump(validation_result, file, ensure_ascii=False, indent=4)

print("Validation result has been saved!")

Validation result has been saved!
