In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import tqdm

import re
import json

In [26]:
def get_filename(filepath):
    """Return the CSV base name (without extension) contained in ``filepath``.

    The name is taken from the leftmost ``/<name>.csv`` occurrence, where
    ``<name>`` consists only of lowercase ASCII letters and underscores.

    Args:
        filepath: Path string containing a ``/<name>.csv`` component.

    Returns:
        The matched ``<name>`` as a string.

    Raises:
        ValueError: If ``filepath`` has no such component, e.g. the name
            contains digits or uppercase letters, or is not preceded by ``/``.
    """
    m = re.search(r'/([a-z_]*)\.csv', filepath)
    if m is None:
        # Without this guard the failure surfaces as the opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(
            "cannot extract a file name from {!r}: expected a "
            "'/<lowercase_or_underscore>.csv' component".format(filepath)
        )
    return m.group(1)

def get_classifier_score(result_data,
                         model = lambda: KNeighborsClassifier(3),
                         x_columns = ["tfidf_distance", "laser_distance", "use_distance", "fasttext_distance", "bert_distance", "ner_custom_overlap", "ner_overlap"],
                         y_column = "score"):
    """Score a classifier on a family of feature subsets of ``x_columns``.

    For every start position ``i`` the following subsets are evaluated, in
    this order:

    1. every pair ``[x_columns[i], x_columns[j]]`` with ``j > i``;
    2. every contiguous run ``x_columns[i:j]`` with ``j > i`` (this includes
       the single column ``[x_columns[i]]`` and the run up to the last column).

    Each subset is handed to ``calculate_scores``, which skips subsets that
    were already recorded (e.g. the adjacent pair is also a run of length 2).

    Args:
        result_data: Table holding the feature and target columns.
        model: Zero-argument callable returning a new, unfitted estimator.
        x_columns: Ordered list of candidate feature column names. It is only
            read and sliced here, never modified.
        y_column: Name of the target column.

    Returns:
        Tuple ``(result_x_columns, result_scores)``: two parallel lists with
        the evaluated column subsets and their scores.
    """
    result_x_columns = []
    result_scores = []
    n_columns = len(x_columns)

    for i in tqdm.tqdm(range(n_columns)):

        # Pairs of column i with each later column. j starts at i + 1, so the
        # former `i == j` guard could never fire and has been dropped.
        for j in range(i + 1, n_columns):
            current_x_columns = [x_columns[i], x_columns[j]]
            calculate_scores(result_data, current_x_columns, y_column, model, result_scores, result_x_columns)

        # Contiguous runs starting at column i; j == i + 1 yields the single
        # column, so no special case is needed for it.
        for j in range(i + 1, n_columns + 1):
            current_x_columns = x_columns[i:j]
            calculate_scores(result_data, current_x_columns, y_column, model, result_scores, result_x_columns)

    return result_x_columns, result_scores

def calculate_scores(result_data, current_x_columns, y_column, model, result_scores, result_x_columns):
    """Fit a fresh model on one feature subset and record its held-out score.

    Nothing happens when ``current_x_columns`` is already present in
    ``result_x_columns``, so each feature subset is scored at most once.

    Args:
        result_data: Table that can be indexed by a list of column names and
            by a single column name.
        current_x_columns: List of feature column names to train on.
        y_column: Name of the target column.
        model: Zero-argument callable returning a new estimator exposing
            ``fit(X, y)`` and ``score(X, y)``.
        result_scores: List that receives the score (appended in place).
        result_x_columns: List that receives ``current_x_columns`` (appended
            in place); also used for the duplicate check.

    Returns:
        None. Results are reported through the two list arguments.
    """
    if current_x_columns in result_x_columns:
        return

    x = np.asarray(result_data[current_x_columns]).reshape(-1, len(current_x_columns))
    # Keep the target 1-D: scikit-learn classifiers expect shape (n_samples,)
    # and emit a DataConversionWarning on every fit for an (n_samples, 1)
    # column vector.
    y = np.asarray(result_data[y_column]).ravel()

    # NOTE(review): test_size=.7 holds out 70% of the rows for scoring and
    # fits on the remaining 30% -- confirm this split is intended.
    X_train, X_test, y_train, y_test = \
            train_test_split(x, y, test_size=.7, random_state=42)

    classifier = model()
    classifier.fit(X_train, y_train)

    result_scores.append(classifier.score(X_test, y_test))

    result_x_columns.append(current_x_columns)

def train(file_path="./data/result.csv", classifier = None):
    """Fit a classifier on the ``distance`` -> ``score`` columns of a CSV file.

    Args:
        file_path: Path of a CSV file with ``distance`` and ``score`` columns.
        classifier: Estimator exposing ``fit(X, y)``. When omitted, a new
            ``KNeighborsClassifier(3)`` is created for this call. (A default
            instance built in the signature would be shared and re-fitted by
            every call that relies on the default.)

    Returns:
        The fitted ``classifier``.
    """
    if classifier is None:
        classifier = KNeighborsClassifier(3)

    result_data =  pd.read_csv(file_path)

    # Single feature -> one column per sample.
    x = np.asarray(result_data["distance"].tolist()).reshape(-1, 1)
    # Target stays 1-D, the shape scikit-learn classifiers expect; a column
    # vector triggers a DataConversionWarning.
    y = np.asarray(result_data["score"].tolist())

    classifier.fit(x, y)
    return classifier

def run_for_all_by_files(output="/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/classification/scores.json",
                         file_paths=None):
    """Score the feature subsets of each input CSV separately and dump them as JSON.

    For every file, ``get_classifier_score`` is run with its default feature
    columns, and one entry ``{"data_type", "x_columns", "scores"}`` is added
    under the ``"items"`` key of the saved JSON document. ``data_type`` is the
    name returned by ``get_filename`` for that file.

    Args:
        output: Path of the JSON file to write (overwritten if it exists).
        file_paths: Optional iterable of CSV paths to process. Defaults to the
            four normalized data sets listed below.

    Returns:
        None. The result is written to ``output``.
    """
    if file_paths is None:
        # NOTE(review): "preprocc" in the third name is spelled differently
        # from the other "preprocess" files -- confirm it matches the file on disk.
        file_paths = ["/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/normalized/data_origin.csv",
                      "/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/normalized/data_preprocess.csv",
                      "/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/normalized/data_semi_preprocc_with_stopwords.csv",
                      "/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/normalized/data_semi_preprocess_without_stopwords.csv"]

    json_to_save = {"items": []}
    for file_path in file_paths:
        result_data =  pd.read_csv(file_path)
        result_x_columns, result_scores = get_classifier_score(result_data)
        stored_structure = {"data_type": get_filename(file_path), "x_columns": result_x_columns, "scores": result_scores}
        json_to_save["items"].append(stored_structure)
    with open(output, 'w') as outfile:
        json.dump(json_to_save, outfile)


def run_for_all_together(output="/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/classification/scores.json",
                         files=None):
    """Score feature subsets drawn from all input CSVs combined and dump them as JSON.

    The first file provides the base table (including the ``score`` target).
    The seven feature columns of every file are added to it under the name
    ``<file name>/<column>``, where ``<file name>`` comes from ``get_filename``.
    ``get_classifier_score`` is then run over all of these prefixed columns
    and a single ``{"x_columns", "scores"}`` entry is saved under ``"items"``.

    Args:
        output: Path of the JSON file to write (overwritten if it exists).
        files: Optional list of CSV paths to combine. Defaults to the four
            normalized data sets listed below.

    Returns:
        None. The result is written to ``output``.

    Raises:
        ValueError: If ``files`` is given but empty.
    """
    if files is None:
        # NOTE(review): "preprocc" in the third name is spelled differently
        # from the other "preprocess" files -- confirm it matches the file on disk.
        files = ["/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/normalized/data_origin.csv",
                 "/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/normalized/data_preprocess.csv",
                 "/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/normalized/data_semi_preprocc_with_stopwords.csv",
                 "/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/normalized/data_semi_preprocess_without_stopwords.csv"]
    if not files:
        raise ValueError("files must contain at least one CSV path")

    json_to_save = {"items": []}

    result_data =  pd.read_csv(files[0])
    origin_x_columns = ["tfidf_distance", "laser_distance", "use_distance", "fasttext_distance", "bert_distance", "ner_custom_overlap", "ner_overlap"]
    new_x_columns = []
    for position, file in enumerate(files):
        feature_prefix = get_filename(file)
        # The first file is already loaded as the base table; reuse it rather
        # than parsing the same CSV a second time. Only the original
        # (unprefixed) columns are read from it, so the prefixed columns added
        # below do not interfere.
        data = result_data if position == 0 else pd.read_csv(file)
        for x_column in origin_x_columns:
            new_column = feature_prefix+"/"+x_column
            # Column assignment aligns on the row index, so rows absent from
            # `data` end up as NaN here.
            result_data[new_column] = data[x_column]
            new_x_columns.append(new_column)

    # dropna() inspects every column of the table, i.e. also the columns of
    # the first file that are not used as features.
    # NOTE(review): confirm that dropping rows for NaNs in unused columns is intended.
    result_data = result_data.dropna()

    result_x_columns, result_scores = get_classifier_score(result_data, x_columns=new_x_columns)
    stored_structure = {"x_columns": result_x_columns, "scores": result_scores}
    json_to_save["items"].append(stored_structure)
    with open(output, 'w') as outfile:
        json.dump(json_to_save, outfile)


In [27]:
# Score the feature columns of all input CSVs combined and write the result
# to its own JSON file instead of the function's default output path.
# NOTE: despite its name, `output_dir` holds the path of the output *file*,
# not a directory.
output_dir="/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/output/classification/scores_together.json"
run_for_all_together(output=output_dir)

100%|██████████| 28/28 [00:03<00:00,  7.78it/s]
