In [4]:
import pandas as pd
import numpy as np

import ast

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score, precision_score, f1_score
from scipy.special import comb

import warnings
warnings.filterwarnings("ignore")

In [5]:
# preparing for the dataset
raw_data = pd.read_csv('./translated_raw.csv', index_col=None)
## Translation set
translation_data = raw_data.copy()
translation_data['translation_set'] = translation_data['raw_translation'].apply(lambda translation_list: list(set(translation_list.split())))
## Number, length
translation_data['number_tranlsation'] = translation_data['raw_translation'].apply(lambda translation_list: len(translation_list.split()))
translation_data['number_sense'] = translation_data['translation_set'].apply(lambda sense_list: len(sense_list))
## lemmas
def flattenTranslation(dataframe):
    raw = dataframe['raw_translation_lemmas']
    listTranslation = ast.literal_eval(raw)
    processed = ''
    for s in listTranslation:
        processed += s
    return processed
translation_data['raw_translation_lemmas'] = translation_data.apply(flattenTranslation, axis=1)
## Translation set
translation_data['translation_lemmas_set'] = translation_data['raw_translation_lemmas'].apply(lambda translation_list: list(set(translation_list.split())))

## Number, length
translation_data['number_tranlsation_lemmas'] = translation_data['raw_translation_lemmas'].apply(lambda translation_list: len(translation_list.split()))
translation_data['number_sense_lemmas'] = translation_data['translation_lemmas_set'].apply(lambda sense_list: len(sense_list))

semantic_data = translation_data
semantic_data

Unnamed: 0,Synsets,domain_x,norm,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,polyscore_max_x,vote_x,raw_translation,raw_translation_lemmas,translation_set,number_tranlsation,number_sense,translation_lemmas_set,number_tranlsation_lemmas,number_sense_lemmas
0,Synset('adjustable_wrench.n.01'),tool,adjustable_wrench,1,7,0.0,1.012903,0.563173,17,2,1,nb,adjust change change hold hold hold screw tool,adjust change change hold hold hold screw tool...,"[adjust, screw, tool, hold, change]",8,5,"[rotate, adjust, screw, tool, hold, angle, cha...",15,7
1,Synset('allen_wrench.n.01'),tool,allen_wrench,1,0,0.0,1.012903,0.391092,12,1,1,nb,break hold hold hold metal screw tool,break hold hold hold metal screw tool,"[metal, screw, tool, hold, break]",7,5,"[metal, screw, tool, hold, break]",7,5
2,Synset('alligator_wrench.n.01'),tool,alligator_wrench,1,0,0.0,1.012903,1.517437,16,1,1,nb,animal claw claw claw chew chew chew eat teeth...,animal claw claw claw chew chew chew eat teeth...,"[chew, claw, eat, animal, fin, teeth]",15,6,"[chew, claw, eat, animal, fin, teeth]",15,6
3,Synset('awl.n.01'),tool,awl,1,2,15.7,0.911613,0.985552,3,1,1,b,cloth cloth hand knit knit wool,cloth cloth hand knit knit wool,"[knit, hand, cloth, wool]",6,4,"[knit, hand, cloth, wool]",6,4
4,Synset('backsaw.n.01'),tool,backsaw,1,0,0.0,1.114194,1.110701,7,2,1,nb,cut blade blade blade cut edge edge hand heave...,cut blade blade blade cut edge edge hand heave...,"[metal, large, edge, blade, heave, hand, cut]",13,7,"[metal, large, edge, blade, heave, hand, cut, ...",19,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,Synset('ballet_skirt.n.01'),garm,ballet_skirt,1,0,0.0,0.947552,0.578283,4,2,2,nb,act ballet ballet ballet dance dance dance dre...,act ballet ballet ballet dance dance dance dre...,"[high, ballet, dress, dance, leg, long, skirt,...",13,8,"[high, ballet, dress, teach, dance, leg, long,...",17,10
835,Synset('mess_jacket.n.01'),garm,mess_jacket,1,0,0.0,1.158120,1.652238,11,3,1,nb,clean cloth cloth cloth cover mess mess,clean cloth cloth cloth cover mess mess animal...,"[mess, clean, cloth, cover]",7,4,"[human, cloth, color, longbody, cover, mess, w...",34,14
836,Synset('long_johns.n.01'),garm,long_johns,1,0,0.0,1.052836,0.479149,10,1,1,nb,bend cloth cloth cloth cover cloth cloth comfo...,bend cloth cloth cloth cover cloth cloth comfo...,"[cotton, cloth, cover, material, long, bend, c...",14,7,"[cotton, cloth, cover, material, long, bend, c...",14,7
837,Synset('undies.n.01'),garm,undies,1,0,0.0,1.158120,0.280880,6,1,1,nb,cloth cloth cloth cover cloth cover comfort co...,cloth cloth cloth cover cloth cover comfort co...,"[cotton, cloth, cover, pants, warm, material, ...",14,7,"[cotton, cloth, cover, pants, warm, material, ...",14,7


In [7]:
base_feature = ['nrdirhypers_x',
                'nrhypos_x',
                'nrpartrels_normalised_x',
                'depthfromtopsynset_normalised_x',
                'glosslength_normalised_x',
                'minwordlength_x',
                'nroflemmas_x',
                'polyscore_max_x']
target = ['vote_x']

In [8]:
def cueSet(dataframe):
    translation_basic_level = dataframe.loc[dataframe['vote_x']=='b']['translation_lemmas_set']
    translation_basic_level = translation_basic_level.reset_index()['translation_lemmas_set']
    cue_set = set()
    for trans in translation_basic_level:
        cue_set = cue_set | set(trans)
    cue_len = len(cue_set)

    translation_non_basic_level = dataframe.loc[dataframe['vote_x']=='nb']['translation_lemmas_set']
    translation_non_basic_level = translation_non_basic_level.reset_index()['translation_lemmas_set']
    non_basic_level_cue_set = set()
    for trans in translation_non_basic_level:
        non_basic_level_cue_set = non_basic_level_cue_set | set(trans)

    basic_level_and_cue = cue_set - non_basic_level_cue_set
    basic_level_and_cue_len = len(basic_level_and_cue)
    return cue_set, cue_len, basic_level_and_cue, basic_level_and_cue_len

In [71]:
## Cue validity while training and testing
def calCV(feature_list, cue_set, cue_len, basic_level_and_cue, basic_level_and_cue_len):
    acc_cv = 0
    bl_cue = 0
    cue = 0
    for feature in feature_list:
        if feature in basic_level_and_cue:
            bl_cue += 1
        if feature in cue_set:
            cue += 1
    if bl_cue != 0:
        p_bl_cue = comb(basic_level_and_cue_len, bl_cue)
        p_cue = comb(cue_len, cue)
        acc_cv = p_bl_cue/p_cue
    return acc_cv


In [9]:
## alternative
def alt_calCV(feature_list, cue_set, cue_len, basic_level_and_cue, basic_level_and_cue_len):
    acc_cv = 0
    for feature in feature_list:
        if feature in basic_level_and_cue:
            ac = basic_level_and_cue_len / cue_len
            acc_cv += ac
    return acc_cv

## GlobalModel Test

In [10]:
# split training set and testing set using K-Flod
def global_model_test(dataset, feature, new, target):
    K = 10
    random_seed = 7 # R
    data = dataset.reset_index()
    comp = ['translation_lemmas_set', 'vote_x']
    X = data[feature+comp+new]
    y = data[target]

    K_Flod = StratifiedKFold(n_splits=K, shuffle=True, random_state=random_seed)
    K_Flod.get_n_splits(X, y)
    cohen_kappa = []
    balanced_acc = []
    for train_index, test_index in K_Flod.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        cue_set_train, cue_len_train, basic_level_and_cue_train, basic_level_and_cue_len_train = cueSet(X_train)
        X_train['cv'] = X_train['translation_lemmas_set'].apply(lambda l: alt_calCV(l, cue_set_train, cue_len_train, basic_level_and_cue_train, basic_level_and_cue_len_train))
        X_train = X_train.drop(comp, axis=1)
        cue_set_test, cue_len_test, basic_level_and_cue_test, basic_level_and_cue_len_test = cueSet(X_test)
        X_test['cv'] = X_test['translation_lemmas_set'].apply(lambda l: alt_calCV(l, cue_set_test, cue_len_test, basic_level_and_cue_test, basic_level_and_cue_len_test))
        X_test = X_test.drop(comp, axis=1)

        # SMOTE algorithm
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # define random forest model
        rf = RandomForestClassifier(random_state=random_seed, max_features='sqrt', n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_depth=50, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)

        # predict and make score
        pipeline = make_pipeline(smote, rf)
        y_pred = pipeline.predict(X_test)

        kappa = cohen_kappa_score(y_test, y_pred)
        cohen_kappa.append(kappa)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        balanced_acc.append(balanced_accuracy)

    results = []

    results.append(np.mean(cohen_kappa))
    results.append(np.mean(balanced_acc))

    return results

In [77]:
global_model_test(semantic_data, base_feature, ['number_sense_lemmas'], target)

[0.6893847386406862, 0.8478374615336863]

In [11]:
global_model_test(semantic_data, base_feature, ['number_sense_lemmas'], target)

[0.8807699243368668, 0.9549145981323329]

## LocalModel Test

In [12]:
# generate the local dataframe for different domains
local_fruit = semantic_data.loc[semantic_data['domain_x']=='fruit']
local_tool = semantic_data.loc[semantic_data['domain_x']=='tool']
local_music = semantic_data.loc[semantic_data['domain_x']=='music']
local_furniture = semantic_data.loc[semantic_data['domain_x']=='furn']
local_garments = semantic_data.loc[semantic_data['domain_x']=='garm']

local_list = [local_fruit, local_tool, local_music, local_furniture, local_garments]

# average results of five domains
def local_model_test(dataset_list, feature, new, target):
    local_kappa_list = []
    local_balancedAcc_list = []
    for local_dataset in dataset_list:
        result_list = global_model_test(local_dataset, feature, new, target)
        local_kappa_list.append(result_list[0])
        local_balancedAcc_list.append(result_list[1])

    local_result_list = [np.mean(local_kappa_list), np.mean(local_balancedAcc_list)]
    return local_result_list

In [85]:
local_model_test(local_list, base_feature, ['number_sense_lemmas'], target)

[0.7640620812741172, 0.8950135281385281]

In [13]:
local_model_test(local_list, base_feature, ['number_sense_lemmas'], target)

[0.89768020147039, 0.952056277056277]

## TransferModel Test

In [14]:
# generate the unseen(transfer) dataframe for different domains
unseen_fruit = semantic_data.loc[semantic_data['domain_x']!='fruit']
unseen_tool = semantic_data.loc[semantic_data['domain_x']!='tool']
unseen_music = semantic_data.loc[semantic_data['domain_x']!='music']
unseen_furniture = semantic_data.loc[semantic_data['domain_x']!='furn']
unseen_garments = semantic_data.loc[semantic_data['domain_x']!='garm']

unseen_list = [unseen_fruit, unseen_tool, unseen_music, unseen_furniture, unseen_garments]

def transfer_model_test(train_list, test_list, feature, new, target):
    random_seed = 7 # R
    transfer_kappa_list = []
    transfer_balancedAcc_list = []
    for counter in range(len(train_list)):
        training_data = train_list[counter].reset_index()
        testing_data = test_list[counter].reset_index()
        comp = ['translation_lemmas_set', 'vote_x']
        feature_list = feature + comp + new
        X_train = training_data[feature_list]
        y_train = training_data[target]
        X_test = testing_data[feature_list]
        y_test = testing_data[target]

        cue_set_train, cue_len_train, basic_level_and_cue_train, basic_level_and_cue_len_train = cueSet(X_train)
        X_train['cv'] = X_train['translation_lemmas_set'].apply(lambda l: alt_calCV(l, cue_set_train, cue_len_train, basic_level_and_cue_train, basic_level_and_cue_len_train))
        X_train = X_train.drop(comp, axis=1)
        cue_set_test, cue_len_test, basic_level_and_cue_test, basic_level_and_cue_len_test = cueSet(X_test)
        X_test['cv'] = X_test['translation_lemmas_set'].apply(lambda l: alt_calCV(l, cue_set_test, cue_len_test, basic_level_and_cue_test, basic_level_and_cue_len_test))
        X_test = X_test.drop(comp, axis=1)

        # SMOTE algorithm
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # define random forest model
        rf = RandomForestClassifier(random_state=random_seed, max_features='sqrt', n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_depth=50, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)

        # predict and make score
        pipeline = make_pipeline(smote, rf)
        y_pred = pipeline.predict(X_test)

        kappa = cohen_kappa_score(y_test, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

        transfer_kappa_list.append(kappa)
        transfer_balancedAcc_list.append(balanced_accuracy)

    transfer_results = [np.mean(transfer_kappa_list), np.mean(transfer_balancedAcc_list)]

    return transfer_results

In [92]:
transfer_model_test(unseen_list, local_list, base_feature, ['number_tranlsation_lemmas'], target)

[0.5437985568770921, 0.8103873047096235]

In [16]:
transfer_model_test(unseen_list, local_list, base_feature, ['number_tranlsation_lemmas'], target)

[0.7002119979627016, 0.8823192043829493]

In [15]:
transfer_model_test(unseen_list, local_list, base_feature, ['number_sense_lemmas'], target)

[0.7053558792446359, 0.8947769221355127]