In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import tqdm

import re
import json

In [5]:
def get_filename(filepath):
    """Return the name of a CSV file without its directory or ``.csv`` suffix.

    Parameters
    ----------
    filepath : str
        Path to a CSV file, e.g. ``"/some/dir/data_origin.csv"``.

    Returns
    -------
    str
        The file name stem, e.g. ``"data_origin"``.

    Notes
    -----
    The narrow pattern (lower-case letters and underscores preceded by a
    ``/``) is tried first so that paths it already handled keep producing the
    same result.  Paths it cannot match (digits, upper-case letters, hyphens,
    no leading directory, ...) previously crashed with ``AttributeError`` on
    ``None``; they now fall back to the last path component with a trailing
    ``.csv`` removed.
    """
    m = re.search(r'/([a-z_]*)\.csv', filepath)
    if m is not None:
        return m.group(1)

    # Fallback: take the last path component (either separator style) and
    # drop the extension if it is present.
    basename = re.split(r'[\\/]', filepath)[-1]
    if basename.endswith(".csv"):
        basename = basename[:-len(".csv")]
    return basename

def get_classifier_score(file_path = "../deeppavlov_ner_3.6/data/ner_overlap_result.csv",
                         model = lambda: KNeighborsClassifier(3),
                         x_columns = ["tfidf_distance", "laser_distance", "use_distance", "fasttext_distance", "bert_distance", "ner_custom_overlap", "ner_overlap"],
                         y_column = "score"):
    """Score a classifier on every contiguous run of feature columns.

    For each pair ``i < j`` the columns ``x_columns[i:j]`` are used as the
    feature matrix, so single columns and every contiguous window up to the
    full list are evaluated (non-contiguous combinations are not).

    Parameters
    ----------
    file_path : str
        CSV file containing the feature columns and the label column.
    model : callable
        Zero-argument factory returning a fresh, unfitted estimator exposing
        ``fit(X, y)`` and ``score(X, y)``.  It is called once per subset.
    x_columns : list of str
        Ordered feature column names to build the windows from.
    y_column : str
        Name of the label column.

    Returns
    -------
    (list of list of str, list)
        The evaluated column subsets and, in the same order, the value
        returned by ``score`` on the held-out part for each subset.
    """
    result_data = pd.read_csv(file_path)
    columns = list(x_columns)
    result_x_columns = []
    result_scores = []

    # The labels do not depend on the feature subset, so build them once.
    # They are kept 1-D: scikit-learn classifiers expect shape (n_samples,)
    # and emit a DataConversionWarning for a column vector.
    y = np.asarray(result_data[y_column]).ravel()

    for i in tqdm.tqdm(range(len(columns))):
        # j starts at i + 1, so the slice always holds at least one column.
        for j in range(i + 1, len(columns) + 1):
            current_x_columns = columns[i:j]

            x = np.asarray(result_data[current_x_columns]).reshape(-1, len(current_x_columns))

            # test_size=.7 holds out 70% of the rows for scoring; the fixed
            # random_state makes every subset see the same split.
            X_train, X_test, y_train, y_test = \
                    train_test_split(x, y, test_size=.7, random_state=42)

            classifier = model()
            classifier.fit(X_train, y_train)

            result_scores.append(classifier.score(X_test, y_test))
            result_x_columns.append(current_x_columns)

    return result_x_columns, result_scores

def train(file_path="./data/result.csv", classifier=None):
    """Fit a classifier on the ``distance`` -> ``score`` data of a CSV file.

    Parameters
    ----------
    file_path : str
        CSV file with a ``distance`` column (the single feature) and a
        ``score`` column (the label).
    classifier : estimator, optional
        Object exposing ``fit(X, y)``.  When omitted, a new
        ``KNeighborsClassifier(3)`` is created for this call.  The default is
        built per call rather than in the signature so that separate calls do
        not refit and return one shared estimator instance.

    Returns
    -------
    estimator
        The same ``classifier`` object, after fitting.
    """
    if classifier is None:
        classifier = KNeighborsClassifier(3)

    result_data = pd.read_csv(file_path)

    # One feature -> column matrix of shape (n_samples, 1).
    x = np.asarray(result_data["distance"].tolist()).reshape(-1, 1)
    # Labels stay 1-D; scikit-learn classifiers warn on a column vector.
    y = np.asarray(result_data["score"].tolist()).ravel()

    classifier.fit(x, y)
    return classifier

def run_for_all(output="/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/classification/scores.json",
                files=None):
    """Score every input CSV with ``get_classifier_score`` and save the results as JSON.

    Parameters
    ----------
    output : str
        Path of the JSON file to write (overwritten if it exists).
    files : iterable of str, optional
        CSV files to evaluate.  When omitted, the four normalized data sets
        listed below are used, which is what this function always processed
        before the parameter existed.

    The written document has the form
    ``{"items": [{"data_type": ..., "x_columns": [...], "scores": [...]}, ...]}``
    with one item per input file, where ``data_type`` is the result of
    ``get_filename`` for that file.
    """
    if files is None:
        files = ["/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/normalized/data_origin.csv",
                 "/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/normalized/data_preprocess.csv",
                 "/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/normalized/data_semi_preprocc_with_stopwords.csv",
                 "/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/normalized/data_semi_preprocess_without_stopwords.csv"]

    json_to_save = {"items": []}
    for csv_path in files:
        result_x_columns, result_scores = get_classifier_score(file_path=csv_path)
        json_to_save["items"].append({
            "data_type": get_filename(csv_path),
            "x_columns": result_x_columns,
            "scores": result_scores,
        })

    with open(output, 'w') as outfile:
        json.dump(json_to_save, outfile)


In [6]:
# NOTE: despite its name, `output_dir` holds the path of the JSON *file* that
# gets written, not a directory.
output_dir="/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/classification/scores.json"
# Call run_for_all with the path above as its `output` argument.
run_for_all(output=output_dir)

100%|██████████| 7/7 [00:00<00:00, 31.05it/s]
100%|██████████| 7/7 [00:00<00:00, 38.43it/s]
100%|██████████| 7/7 [00:00<00:00, 38.64it/s]
100%|██████████| 7/7 [00:00<00:00, 37.93it/s]
