In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score
import json
import os
import random

In [2]:
def set_seed(seed):
    """Seed both global random number generators used in this notebook.

    Args:
        seed: Value handed unchanged to ``random.seed`` and ``np.random.seed``.
    """
    # Python's generator first, then NumPy's legacy global generator.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

In [3]:
def load_embedding(em_path):
    """Load the embedding matrix and its vocabulary from a directory.

    Two files are read from ``em_path`` with ``np.load``:
    ``char2vec_file.mat.npy`` (the embedding matrix) and ``word2id.npy``
    (the vocabulary, converted with ``.tolist()``).

    Args:
        em_path: Directory containing the two ``.npy`` files.

    Returns:
        A 3-tuple ``(char2vec_mat, word2id, id2word)``:
        the loaded matrix, the vocabulary as returned by ``.tolist()``, and a
        dict mapping each enumeration position in that vocabulary to its entry.
    """
    vectors = np.load(os.path.join(em_path, 'char2vec_file.mat.npy'))
    vocab = np.load(os.path.join(em_path, 'word2id.npy')).tolist()

    # Position -> entry, i.e. the inverse view of the vocabulary sequence.
    position_to_word = dict(enumerate(vocab))

    return vectors, vocab, position_to_word

In [4]:
def prepare_data(data_path, model_path, type, split):
    """Read one dataset split and encode its labels as integer class ids.

    The file ``<data_path>/<split>.json`` is parsed as UTF-8 JSON and iterated
    as a sequence of records. Each record must provide ``record['text']`` and
    ``record['label'][type]``.

    Args:
        data_path: Directory holding the ``<split>.json`` files.
        model_path: Unused. Kept so existing positional/keyword callers keep
            working; the embedding that used to be loaded from here was never
            used by this function, so it is no longer read.
        type: Which label to extract, ``'intent'`` or ``'element'``. (The name
            shadows the builtin but is part of the public signature.)
        split: File stem of the split to load, e.g. ``'train'`` or ``'test'``.

    Returns:
        ``(texts, labels)`` where ``texts`` is the list of ``record['text']``
        values and ``labels`` is the parallel list of integer ids. An id is
        the position of the label string in the label list for ``type``.

    Raises:
        ValueError: If ``type`` is neither ``'intent'`` nor ``'element'``.
        KeyError: If a record lacks a required key or carries a label string
            that is not in the known label list.
    """
    label_sets = {
        'intent': ['creation', "modification", "deletion", "retrieval"],
        'element': ['ceiling', 'column', 'door', 'floor', 'ramp', 'roof',
                    'stair', 'wall', 'window'],
    }

    # Validate once, before touching the filesystem.
    if type not in tuple(label_sets):
        raise ValueError("type should be either 'intent' or 'element'")

    label2idx = {tag: idx for idx, tag in enumerate(label_sets[type])}

    with open(os.path.join(data_path, split + ".json"), 'r', encoding='utf-8') as f:
        data = json.load(f)

    texts = []
    label_names = []
    for record in data:
        texts.append(record['text'])
        # `type` is already known to be a valid key name, so it can index the
        # record's label dict directly.
        label_names.append(record['label'][type])

    labels = [label2idx[name] for name in label_names]

    return texts, labels
    

In [5]:
def sample_data(X_train, Y_train, sample_ratio, seed):
    """Draw a seeded random subset of paired training examples.

    Args:
        X_train: Indexable collection of inputs (must support ``len``).
        Y_train: Indexable collection of targets aligned with ``X_train``.
        sample_ratio: Fraction of examples to keep. When it is not greater
            than 0 the inputs are returned untouched.
        seed: Seed applied to the global ``random`` module before sampling.

    Returns:
        ``(X_train, Y_train)``. When sampling happens these are two new lists
        holding the items at the same randomly chosen indices, in the order
        the indices were drawn; ``int(len(X_train) * sample_ratio)`` items are
        kept.
    """
    if not sample_ratio > 0:
        return X_train, Y_train

    # Note: this reseeds the module-level generator, not a private one.
    random.seed(seed)
    total = len(X_train)
    chosen = random.sample(range(total), int(total * sample_ratio))

    return [X_train[i] for i in chosen], [Y_train[i] for i in chosen]

In [6]:
def convert_to_array(texts, labels, em_path):
    """Turn texts into fixed-size feature vectors by averaging token embeddings.

    Each text is split on single spaces; every token is replaced by its row of
    the embedding matrix (or by the ``'<UNK>'`` row when the token is not in
    the vocabulary), and the rows are averaged into one vector per text.

    Args:
        texts: Iterable of strings.
        labels: Sequence of labels aligned with ``texts``; converted with
            ``np.array`` and otherwise left untouched.
        em_path: Directory passed to ``load_embedding``.

    Returns:
        ``(X, Y)``: ``X`` is ``np.array`` of the per-text mean vectors and
        ``Y`` is ``np.array(labels)``.

    Raises:
        ValueError: If a token is out of vocabulary and the vocabulary has no
            ``'<UNK>'`` entry.
    """
    vec_mat, word2id, _ = load_embedding(em_path)

    # Build the word -> row lookup once instead of scanning the vocabulary
    # list for every token. `setdefault` keeps the FIRST position of a
    # duplicated word, which is what `list.index` returned.
    row_of = {}
    for position, vocab_word in enumerate(word2id):
        row_of.setdefault(vocab_word, position)
    unk_row = row_of.get('<UNK>')

    X = []
    for text in texts:
        rows = []
        for word in text.split(' '):
            row = row_of.get(word, unk_row)
            if row is None:
                # Only reached for an unknown token when '<UNK>' is absent.
                raise ValueError("'<UNK>' is not in the vocabulary")
            rows.append(row)
        # Average in float64 regardless of the matrix dtype, matching the
        # previous np.zeros-backed accumulation buffer.
        token_vectors = np.asarray(vec_mat[rows], dtype=np.float64)
        X.append(token_vectors.mean(axis=0))

    X = np.array(X)
    Y = np.array(labels)

    return X, Y

In [7]:
# Intent task: read the train/test splits, then embed both with the
# pretrained vectors found under `em_path`.
data_path, em_path = '../dataset/data/user', './pretrained/en'

train_texts, train_labels = prepare_data(data_path, em_path, type='intent', split='train')
test_texts, test_labels = prepare_data(data_path, em_path, type='intent', split='test')

# `X_train_all` / `Y_train_all` hold the full training set before any sampling.
X_train_all, Y_train_all = convert_to_array(texts=train_texts, labels=train_labels, em_path=em_path)
X_test, Y_test = convert_to_array(texts=test_texts, labels=test_labels, em_path=em_path)

In [10]:
# Train a linear SVM on progressively smaller samples of the training set
# currently in scope and report test accuracy for every
# (sample_ratio, seed) combination; ratios vary slowest.
for sample_ratio, seed in (
    (ratio, run_seed)
    for ratio in [1, 0.8, 0.6, 0.4, 0.2, 0.1, 0.05, 0.02, 0.01]
    for run_seed in range(1, 6)
):
    X_train, Y_train = sample_data(X_train_all, Y_train_all, sample_ratio, seed)

    # `fit` returns the estimator, so construction and training can be chained.
    clf = svm.SVC(kernel='linear', C=1.0, random_state=seed).fit(X_train, Y_train)

    Y_pred = clf.predict(X_test)
    acc = accuracy_score(Y_test, Y_pred)
    print(f"sample_ratio: {sample_ratio}, seed: {seed}, accuracy: {acc}")

sample_ratio: 1, seed: 1, accuracy: 0.9962962962962963
sample_ratio: 1, seed: 2, accuracy: 0.9962962962962963
sample_ratio: 1, seed: 3, accuracy: 0.9962962962962963
sample_ratio: 1, seed: 4, accuracy: 0.9962962962962963
sample_ratio: 1, seed: 5, accuracy: 0.9962962962962963
sample_ratio: 0.8, seed: 1, accuracy: 0.9944444444444445
sample_ratio: 0.8, seed: 2, accuracy: 0.9981481481481481
sample_ratio: 0.8, seed: 3, accuracy: 1.0
sample_ratio: 0.8, seed: 4, accuracy: 0.9981481481481481
sample_ratio: 0.8, seed: 5, accuracy: 0.9962962962962963
sample_ratio: 0.6, seed: 1, accuracy: 0.9925925925925926
sample_ratio: 0.6, seed: 2, accuracy: 1.0
sample_ratio: 0.6, seed: 3, accuracy: 0.9981481481481481
sample_ratio: 0.6, seed: 4, accuracy: 0.9981481481481481
sample_ratio: 0.6, seed: 5, accuracy: 0.9925925925925926
sample_ratio: 0.4, seed: 1, accuracy: 0.9925925925925926
sample_ratio: 0.4, seed: 2, accuracy: 0.9962962962962963
sample_ratio: 0.4, seed: 3, accuracy: 0.9981481481481481
sample_ratio: 

In [11]:
# Element task: read the train/test splits, then embed both with the
# pretrained vectors found under `em_path`. This rebinds the same variable
# names the previous task used.
data_path, em_path = '../dataset/data/user', './pretrained/en'

train_texts, train_labels = prepare_data(data_path, em_path, type='element', split='train')
test_texts, test_labels = prepare_data(data_path, em_path, type='element', split='test')

# `X_train_all` / `Y_train_all` hold the full training set before any sampling.
X_train_all, Y_train_all = convert_to_array(texts=train_texts, labels=train_labels, em_path=em_path)
X_test, Y_test = convert_to_array(texts=test_texts, labels=test_labels, em_path=em_path)

In [12]:
# Same sweep as before, now over whatever `X_train_all` / `Y_train_all` /
# `X_test` / `Y_test` are bound to at this point: one linear SVM per
# (sample_ratio, seed) combination, ratios varying slowest.
for sample_ratio, seed in (
    (ratio, run_seed)
    for ratio in [1, 0.8, 0.6, 0.4, 0.2, 0.1, 0.05, 0.02, 0.01]
    for run_seed in range(1, 6)
):
    X_train, Y_train = sample_data(X_train_all, Y_train_all, sample_ratio, seed)

    # `fit` returns the estimator, so construction and training can be chained.
    clf = svm.SVC(kernel='linear', C=1.0, random_state=seed).fit(X_train, Y_train)

    Y_pred = clf.predict(X_test)
    acc = accuracy_score(Y_test, Y_pred)
    print(f"sample_ratio: {sample_ratio}, seed: {seed}, accuracy: {acc}")

sample_ratio: 1, seed: 1, accuracy: 0.9092592592592592
sample_ratio: 1, seed: 2, accuracy: 0.9092592592592592
sample_ratio: 1, seed: 3, accuracy: 0.9092592592592592
sample_ratio: 1, seed: 4, accuracy: 0.9092592592592592
sample_ratio: 1, seed: 5, accuracy: 0.9092592592592592
sample_ratio: 0.8, seed: 1, accuracy: 0.9074074074074074
sample_ratio: 0.8, seed: 2, accuracy: 0.912962962962963
sample_ratio: 0.8, seed: 3, accuracy: 0.9111111111111111
sample_ratio: 0.8, seed: 4, accuracy: 0.9092592592592592
sample_ratio: 0.8, seed: 5, accuracy: 0.9111111111111111
sample_ratio: 0.6, seed: 1, accuracy: 0.8981481481481481
sample_ratio: 0.6, seed: 2, accuracy: 0.9
sample_ratio: 0.6, seed: 3, accuracy: 0.9111111111111111
sample_ratio: 0.6, seed: 4, accuracy: 0.9092592592592592
sample_ratio: 0.6, seed: 5, accuracy: 0.9055555555555556
sample_ratio: 0.4, seed: 1, accuracy: 0.8851851851851852
sample_ratio: 0.4, seed: 2, accuracy: 0.8796296296296297
sample_ratio: 0.4, seed: 3, accuracy: 0.8981481481481481
