In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score
from itertools import combinations

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Label column, kept as a one-element list so it can be used for column selection.
target = ['vote_x']

# Base feature columns shared by every experiment in this notebook.
base_feature = [
    'nrdirhypers_x', 'nrhypos_x',
    'nrpartrels_normalised_x', 'depthfromtopsynset_normalised_x',
    'glosslength_normalised_x', 'minwordlength_x',
    'nroflemmas_x', 'polyscore_max_x',
]

# index_col=None keeps the CSV's unnamed first column as an ordinary column
# (it appears as `Unnamed: 0` in the displayed frame).
w2v_data = pd.read_csv('../Word2Vec/w2v_eli.csv', index_col=None)

w2v_data

Unnamed: 0,Synsets,domain_x,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,polyscore_max_x,...,vec_291,vec_292,vec_293,vec_294,vec_295,vec_296,vec_297,vec_298,vec_299,dis_c_d
0,Synset('adjustable_wrench.n.01'),tool,1,7,0.0,1.012903,0.563173,17,2,1,...,0.0146,-0.0488,0.0469,-0.0441,0.0290,0.0741,-0.0575,-0.0075,-0.0259,0.484202
1,Synset('allen_wrench.n.01'),tool,1,0,0.0,1.012903,0.391092,12,1,1,...,0.0211,-0.0197,0.0557,-0.0591,-0.0069,0.0317,-0.0198,-0.0043,-0.0739,0.549829
2,Synset('alligator_wrench.n.01'),tool,1,0,0.0,1.012903,1.517437,16,1,1,...,0.0185,-0.0215,0.0354,-0.0740,0.0203,0.0379,-0.0331,-0.0180,-0.0362,0.431030
3,Synset('awl.n.01'),tool,1,2,15.7,0.911613,0.985552,3,1,1,...,0.0543,-0.0515,-0.0017,-0.0101,0.0345,0.1180,0.0885,-0.0592,0.0560,0.327994
4,Synset('backsaw.n.01'),tool,1,0,0.0,1.114194,1.110701,7,2,1,...,-0.0457,-0.0741,0.0207,-0.0395,0.0297,-0.0542,-0.1040,-0.0326,0.0052,0.362438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771,Synset('ballet_skirt.n.01'),garm,1,0,0.0,0.947552,0.578283,4,2,2,...,0.0215,-0.0879,0.0333,-0.0301,-0.0218,-0.0342,0.0383,0.0045,-0.0346,0.263475
772,Synset('mess_jacket.n.01'),garm,1,0,0.0,1.158120,1.652238,11,3,1,...,0.0092,-0.0397,-0.0152,-0.0135,-0.0304,0.0031,-0.0402,-0.0054,-0.0153,0.572053
773,Synset('long_johns.n.01'),garm,1,0,0.0,1.052836,0.479149,10,1,1,...,-0.0458,-0.0743,0.0254,-0.0212,-0.0433,-0.0129,0.0235,-0.0929,0.0031,0.464427
774,Synset('undies.n.01'),garm,1,0,0.0,1.158120,0.280880,6,1,1,...,-0.0686,-0.0497,0.0083,-0.0549,-0.0501,0.0107,0.0060,-0.0205,0.0133,0.501280


# Benchmark: Random Forest: W2V distance

## GlobalModel Test

In [4]:
# Split the data into training and testing folds using stratified K-fold
def global_model_test(dataset, feature, synthetics, target, n_splits=10, random_seed=7):
    """Cross-validate a SMOTE + random-forest classifier on one dataset.

    For every stratified fold the training part is oversampled with SMOTE,
    a random forest is fitted on the oversampled data, and the untouched
    test part is scored with Cohen's kappa and balanced accuracy.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the feature columns and the target column.
    feature : list of str
        Base feature column names.
    synthetics : list of str
        Extra feature column names, appended to ``feature``.
    target : str or list of str
        Name of the label column (a one-element list is accepted).
    n_splits : int, default 10
        Number of stratified folds.
    random_seed : int, default 7
        Seed shared by the fold splitter, SMOTE and the random forest.

    Returns
    -------
    list
        ``[mean Cohen's kappa, mean balanced accuracy]`` across the folds.
    """
    # drop=True: only positional indexing is used below, so the old index
    # does not need to be kept around as an extra column.
    data = dataset.reset_index(drop=True)
    X = data[feature + synthetics]
    # Flatten the labels to 1-D. Selecting with a one-element list yields an
    # (n, 1) frame, which the estimators only accept by converting it with a
    # warning.
    y = np.ravel(data[target])

    k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    cohen_kappa = []
    balanced_acc = []
    for train_index, test_index in k_fold.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Oversample the training fold only; the test fold is left as-is.
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # Fit the random forest on the oversampled training fold.
        rf = RandomForestClassifier(
            random_state=random_seed,
            max_features='sqrt',
            n_estimators=1400,
            min_samples_split=2,
            min_samples_leaf=1,
            max_depth=50,
            oob_score=True,
            criterion='gini',
            bootstrap=True,
        ).fit(X_train, y_train)

        # Predict with the fitted forest directly. Wrapping `smote` and `rf`
        # in a pipeline that is never fitted adds nothing (samplers are not
        # applied at predict time) and recent scikit-learn releases warn
        # about calling predict on an unfitted Pipeline.
        y_pred = rf.predict(X_test)

        cohen_kappa.append(cohen_kappa_score(y_test, y_pred))
        balanced_acc.append(balanced_accuracy_score(y_test, y_pred))

    return [np.mean(cohen_kappa), np.mean(balanced_acc)]

In [7]:
# Global model: base features plus the selected n-gram columns and `dis_c_d`.
global_model_test(
    w2v_data,
    base_feature,
    ['ngram_50y_max', 'ngram_100y_max', 'ngram_400y_mean', 'ngram_500y_max', 'dis_c_d'],
    target,
)

[0.7044099768267154, 0.8565662166505946]

## LocalModel Test

In [11]:
# Build one sub-frame per domain code stored in the `domain_x` column.
local_fruit = w2v_data[w2v_data['domain_x'].eq('fruit')]
local_tool = w2v_data[w2v_data['domain_x'].eq('tool')]
local_music = w2v_data[w2v_data['domain_x'].eq('music')]
local_furniture = w2v_data[w2v_data['domain_x'].eq('furn')]
local_garments = w2v_data[w2v_data['domain_x'].eq('garm')]

# Order: fruit, tool, music, furniture, garments.
local_list = [
    local_fruit,
    local_tool,
    local_music,
    local_furniture,
    local_garments,
]

def local_model_test(dataset_list, base_feature, synthetics, target, n_splits=10, random_seed=7):
    """Cross-validate a SMOTE + random-forest classifier separately per dataset.

    Each frame in ``dataset_list`` is evaluated on its own with stratified
    K-fold cross-validation (SMOTE on the training fold, random forest fitted
    on the oversampled fold, scoring on the untouched test fold). The
    per-dataset fold means are then averaged across datasets.

    Parameters
    ----------
    dataset_list : list of pandas.DataFrame
        One frame per local dataset.
    base_feature : list of str
        Base feature column names.
    synthetics : list of str
        Extra feature column names, appended to ``base_feature``.
    target : str or list of str
        Name of the label column (a one-element list is accepted).
    n_splits : int, default 10
        Number of stratified folds used for every dataset.
    random_seed : int, default 7
        Seed shared by the fold splitter, SMOTE and the random forest.

    Returns
    -------
    tuple
        ``(mean Cohen's kappa, mean balanced accuracy)`` where each value is
        the mean over datasets of the per-dataset fold mean.
    """
    feature_list = base_feature + synthetics

    local_kappa_list = []
    local_balancedAcc_list = []
    for dataset in dataset_list:
        X = dataset[feature_list]
        # Flatten the labels to 1-D. Selecting with a one-element list yields
        # an (n, 1) frame, which the estimators only accept by converting it
        # with a warning.
        y = np.ravel(dataset[target])

        k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
        cohen_kappa = []
        balanced_acc = []
        for train_index, test_index in k_fold.split(X, y):
            # Positional indexing, so the frames' original index labels
            # (inherited from the full table) do not matter.
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Oversample the training fold only; the test fold is left as-is.
            smote = SMOTE(random_state=random_seed, k_neighbors=2)
            X_train, y_train = smote.fit_resample(X_train, y_train)

            # Fit the random forest on the oversampled training fold.
            rf = RandomForestClassifier(
                random_state=random_seed,
                max_features='sqrt',
                n_estimators=1400,
                min_samples_split=2,
                min_samples_leaf=1,
                max_depth=50,
                oob_score=True,
                criterion='gini',
                bootstrap=True,
            ).fit(X_train, y_train)

            # Predict with the fitted forest directly. Wrapping `smote` and
            # `rf` in a pipeline that is never fitted adds nothing (samplers
            # are not applied at predict time) and recent scikit-learn
            # releases warn about calling predict on an unfitted Pipeline.
            y_pred = rf.predict(X_test)

            cohen_kappa.append(cohen_kappa_score(y_test, y_pred))
            balanced_acc.append(balanced_accuracy_score(y_test, y_pred))

        local_kappa_list.append(np.mean(cohen_kappa))
        local_balancedAcc_list.append(np.mean(balanced_acc))

    local_kappa = np.mean(local_kappa_list)
    local_acc = np.mean(local_balancedAcc_list)

    return local_kappa, local_acc

In [12]:
# Local models: base features plus the selected n-gram columns and `dis_c_d`.
local_model_test(
    local_list,
    base_feature,
    ['ngram_100y_max', 'ngram_400y_mean', 'dis_c_d'],
    target,
)

(0.7061741962510757, 0.8557261904761907)

## TransferModel Test

In [14]:
# Per-domain frames: rows whose `domain_x` equals the domain code.
local_fruit = w2v_data[w2v_data['domain_x'].eq('fruit')]
local_tool = w2v_data[w2v_data['domain_x'].eq('tool')]
local_music = w2v_data[w2v_data['domain_x'].eq('music')]
local_furniture = w2v_data[w2v_data['domain_x'].eq('furn')]
local_garments = w2v_data[w2v_data['domain_x'].eq('garm')]

local_list = [
    local_fruit,
    local_tool,
    local_music,
    local_furniture,
    local_garments,
]

# Complementary ("unseen"/transfer) frames: every row NOT in the given domain.
unseen_fruit = w2v_data[w2v_data['domain_x'].ne('fruit')]
unseen_tool = w2v_data[w2v_data['domain_x'].ne('tool')]
unseen_music = w2v_data[w2v_data['domain_x'].ne('music')]
unseen_furniture = w2v_data[w2v_data['domain_x'].ne('furn')]
unseen_garments = w2v_data[w2v_data['domain_x'].ne('garm')]

# Same domain order as local_list, so position i in both lists refers to the
# same domain.
unseen_list = [
    unseen_fruit,
    unseen_tool,
    unseen_music,
    unseen_furniture,
    unseen_garments,
]

def transfer_model_test(train_list, test_list, base_feature, synthetics, target, random_seed=7):
    """Train on one frame and evaluate on its paired frame, for every pair.

    ``train_list[i]`` is oversampled with SMOTE and used to fit a random
    forest, which is then scored on ``test_list[i]`` with Cohen's kappa and
    balanced accuracy. The scores are averaged over all pairs.

    Parameters
    ----------
    train_list : list of pandas.DataFrame
        Training frames, one per pair.
    test_list : list of pandas.DataFrame
        Testing frames, paired with ``train_list`` by position.
    base_feature : list of str
        Base feature column names.
    synthetics : list of str
        Extra feature column names, appended to ``base_feature``.
    target : str or list of str
        Name of the label column (a one-element list is accepted).
    random_seed : int, default 7
        Seed shared by SMOTE and the random forest.

    Returns
    -------
    tuple
        ``(mean Cohen's kappa, mean balanced accuracy)`` over all pairs.

    Raises
    ------
    ValueError
        If ``train_list`` and ``test_list`` have different lengths.
    """
    # Pairs are matched by position; a length mismatch would otherwise either
    # fail part-way through or silently leave some frames unused.
    if len(train_list) != len(test_list):
        raise ValueError(
            "train_list and test_list must have the same length "
            f"(got {len(train_list)} and {len(test_list)})"
        )

    feature_list = base_feature + synthetics
    transfer_kappa_list = []
    transfer_balancedAcc_list = []
    for training_data, testing_data in zip(train_list, test_list):
        # Only column selection is needed here, so the frames' index labels
        # are irrelevant and no reset_index is required.
        X_train = training_data[feature_list]
        X_test = testing_data[feature_list]
        # Flatten the labels to 1-D. Selecting with a one-element list yields
        # an (n, 1) frame, which the estimators only accept by converting it
        # with a warning.
        y_train = np.ravel(training_data[target])
        y_test = np.ravel(testing_data[target])

        # Oversample the training data only; the test frame is left as-is.
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # Fit the random forest on the oversampled training data.
        rf = RandomForestClassifier(
            random_state=random_seed,
            max_features='sqrt',
            n_estimators=1400,
            min_samples_split=2,
            min_samples_leaf=1,
            max_depth=50,
            oob_score=True,
            criterion='gini',
            bootstrap=True,
        ).fit(X_train, y_train)

        # Predict with the fitted forest directly. Wrapping `smote` and `rf`
        # in a pipeline that is never fitted adds nothing (samplers are not
        # applied at predict time) and recent scikit-learn releases warn
        # about calling predict on an unfitted Pipeline.
        y_pred = rf.predict(X_test)

        transfer_kappa_list.append(cohen_kappa_score(y_test, y_pred))
        transfer_balancedAcc_list.append(balanced_accuracy_score(y_test, y_pred))

    return np.mean(transfer_kappa_list), np.mean(transfer_balancedAcc_list)

In [15]:
# Transfer models: train on each `unseen_*` frame, test on the matching `local_*` frame.
transfer_model_test(
    unseen_list,
    local_list,
    base_feature,
    ['ngram_5y_mean', 'ngram_50y_mean', 'ngram_100y_mean', 'ngram_200y_max', 'ngram_500y_max', 'dis_c_d'],
    target,
)

(0.5722599214246016, 0.8206065936685037)