In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score, precision_score, f1_score

from sklearn.preprocessing import MinMaxScaler
from sklearn import svm

import warnings
warnings.filterwarnings("ignore")

In [142]:
# Load the synset feature table from disk and display it.
W2V_DATASET_PATH = './w2v_eli.csv'
w2v_dataset = pd.read_csv(W2V_DATASET_PATH, index_col=None)
w2v_dataset


Unnamed: 0,Synsets,domain_x,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,polyscore_max_x,...,ngram_200y_max,ngram_400y_mean,ngram_400y_max,ngram_500y_mean,ngram_500y_max,norm,dist_min,dist_mean,dist_max,dist_std
0,Synset('adjustable_wrench.n.01'),tool,1,7,0.0,1.012903,0.563173,17,2,1,...,5.067315e-08,6.919673e-09,5.067315e-08,5.336133e-09,5.067315e-08,adjustable_wrench,-0.082435,0.214812,0.878926,0.295497
1,Synset('allen_wrench.n.01'),tool,1,0,0.0,1.012903,0.391092,12,1,1,...,6.483798e-08,3.867145e-09,6.483798e-08,2.982164e-09,6.483798e-08,allen_wrench,-0.042280,0.226570,0.875432,0.295538
2,Synset('alligator_wrench.n.01'),tool,1,0,0.0,1.012903,1.517437,16,1,1,...,1.499339e-08,6.203393e-10,1.499339e-08,4.783770e-10,1.499339e-08,alligator_wrench,-0.084978,0.207159,0.867023,0.289473
3,Synset('awl.n.01'),tool,1,2,15.7,0.911613,0.985552,3,1,1,...,2.510665e-06,6.210211e-07,6.602901e-06,5.069033e-07,6.602901e-06,awl,-0.084815,0.106211,0.450463,0.144857
4,Synset('backsaw.n.01'),tool,1,0,0.0,1.114194,1.110701,7,2,1,...,3.507272e-08,9.018908e-09,2.260505e-07,6.954965e-09,2.260505e-07,backsaw,-0.110515,0.178268,0.883533,0.321175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,Synset('ballet_skirt.n.01'),garm,1,0,0.0,0.947552,0.578283,4,2,2,...,8.004256e-07,1.136149e-07,8.004256e-07,9.223624e-08,1.686392e-06,ballet_skirt,-0.177788,0.059377,0.547366,0.176403
776,Synset('mess_jacket.n.01'),garm,1,0,0.0,1.158120,1.652238,11,3,1,...,5.083992e-08,6.963715e-09,5.547362e-08,5.370096e-09,5.547362e-08,mess_jacket,-0.067792,0.301306,0.957686,0.309748
777,Synset('long_johns.n.01'),garm,1,0,0.0,1.052836,0.479149,10,1,1,...,1.103470e-07,6.987224e-09,1.103470e-07,5.388225e-09,1.103470e-07,long_johns,-0.094436,0.258654,0.830205,0.327961
778,Synset('undies.n.01'),garm,1,0,0.0,1.158120,0.280880,6,1,1,...,2.565946e-07,2.025520e-08,3.896998e-07,1.561987e-08,3.896998e-07,undies,-0.041136,0.323927,0.953495,0.338978


In [143]:
# Column-name groups used to assemble the model inputs and the label.

# Baseline feature columns shared by every experiment.
features_base = [
    'nrdirhypers_x', 'nrhypos_x',
    'nrpartrels_normalised_x', 'depthfromtopsynset_normalised_x',
    'glosslength_normalised_x', 'minwordlength_x',
    'nroflemmas_x', 'polyscore_max_x',
]

# Label column (values are 'nb' / 'b').
target = ['vote_x']

# Distance summary columns: dist_min, dist_mean, dist_max, dist_std.
distance = ['dist_' + stat for stat in ('min', 'mean', 'max', 'std')]

# Vector component columns vec_0 ... vec_299.
vec_col = ['vec_{}'.format(component) for component in range(300)]

# n-gram columns named 'ngram_<N>y_<stat>': a mean and a max column for each N.
ngram_feature_list = [
    'ngram_{}y_{}'.format(span, stat)
    for span in (1, 5, 10, 20, 50, 100, 200, 400, 500)
    for stat in ('mean', 'max')
]



In [10]:
# Min-max scale every numeric model input (MinMaxScaler's default range is
# [0, 1]) and re-attach the identifier / domain / label columns.
# NOTE(review): the scaler is fitted on the complete dataset, so every
# cross-validation fold evaluated on w2v_normalized shares scaling statistics
# with its held-out rows. Consider fitting the scaler inside each fold.
normalized_col = features_base + ngram_feature_list + vec_col
normalize_scaler = MinMaxScaler()
scaled_values = normalize_scaler.fit_transform(w2v_dataset[normalized_col])

# Re-use the source index: pd.concat(axis=1) aligns rows by index label, so a
# fresh RangeIndex would silently misalign (and pad with NaN) whenever
# w2v_dataset does not carry a default 0..n-1 index.
w2v_scaled_features = pd.DataFrame(
    scaled_values, columns=normalized_col, index=w2v_dataset.index
)

string_col = w2v_dataset[['Synsets', 'domain_x', 'vote_x']]
w2v_normalized = pd.concat([string_col, w2v_scaled_features], axis=1)

w2v_normalized

Unnamed: 0,Synsets,domain_x,vote_x,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,...,vec_290,vec_291,vec_292,vec_293,vec_294,vec_295,vec_296,vec_297,vec_298,vec_299
0,Synset('adjustable_wrench.n.01'),tool,nb,0.0,0.02583,0.00000,0.511681,0.132594,0.777778,0.166667,...,0.360181,0.571163,0.288111,0.734902,0.340552,0.559278,0.681165,0.225244,0.402111,0.351121
1,Synset('allen_wrench.n.01'),tool,nb,0.0,0.00000,0.00000,0.511681,0.085765,0.500000,0.000000,...,0.615411,0.595318,0.394238,0.765801,0.276858,0.427099,0.528756,0.385465,0.413755,0.180363
2,Synset('alligator_wrench.n.01'),tool,nb,0.0,0.00000,0.00000,0.511681,0.392283,0.722222,0.000000,...,0.563110,0.585656,0.387673,0.694522,0.213588,0.527246,0.551042,0.328942,0.363901,0.314479
3,Synset('awl.n.01'),tool,b,0.0,0.00738,0.24139,0.348908,0.247538,0.000000,0.000000,...,0.412831,0.718692,0.278264,0.564256,0.484926,0.579529,0.838965,0.845729,0.213974,0.642476
4,Synset('backsaw.n.01'),tool,nb,0.0,0.00000,0.00000,0.674454,0.281596,0.222222,0.166667,...,0.436192,0.347083,0.195842,0.642907,0.360085,0.561856,0.219986,0.027624,0.310771,0.461757
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771,Synset('ballet_skirt.n.01'),garm,nb,0.0,0.00000,0.00000,0.406662,0.136706,0.055556,0.166667,...,0.531032,0.596804,0.145514,0.687149,0.400000,0.372239,0.291876,0.632384,0.445779,0.320171
772,Synset('mess_jacket.n.01'),garm,nb,0.0,0.00000,0.00000,0.745043,0.428967,0.444444,0.333333,...,0.618550,0.551096,0.321298,0.516854,0.470488,0.340574,0.425953,0.298768,0.409753,0.388830
773,Synset('long_johns.n.01'),garm,nb,0.0,0.00000,0.00000,0.575853,0.109728,0.388889,0.000000,...,0.585774,0.346711,0.195113,0.659410,0.437792,0.293078,0.368440,0.569486,0.091339,0.454287
774,Synset('undies.n.01'),garm,nb,0.0,0.00000,0.00000,0.745043,0.055772,0.166667,0.000000,...,0.547420,0.261984,0.284829,0.599368,0.294692,0.268041,0.453271,0.495113,0.354803,0.490573


## GlobalModel Test

### Benchmark: Random Forest

In [150]:
# Stratified K-fold evaluation of the SMOTE + random-forest benchmark.
def global_model_test(dataset, feature, vec, target, n_splits=10, random_seed=7):
    """Cross-validate a SMOTE + random-forest classifier on ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding the feature and target columns.
    feature, vec : list of str
        Column names; their concatenation is the model input.
    target : str or list of str
        Label column. A one-element list is flattened to a 1-D array.
    n_splits : int, default 10
        Number of stratified folds.
    random_seed : int, default 7
        Seed shared by the fold shuffling, SMOTE and the forest.

    Returns
    -------
    list
        ``[mean Cohen's kappa, mean balanced accuracy]`` over the folds.
    """
    # Rows are selected positionally (.iloc) below, so the frame's index does
    # not need to be reset first.
    X = dataset[feature + vec]
    # Estimators expect 1-D labels; a one-column frame only works through an
    # implicit conversion whose warning is hidden by the global warnings filter.
    y = np.ravel(dataset[target])

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    cohen_kappa = []
    balanced_acc = []
    for train_index, test_index in folds.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # The imblearn pipeline applies SMOTE while fitting only; prediction
        # goes straight to the forest, so the test fold is never resampled.
        # oob_score is not requested because the OOB estimate was never read.
        model = make_pipeline(
            SMOTE(random_state=random_seed, k_neighbors=2),
            RandomForestClassifier(
                random_state=random_seed,
                max_features='sqrt',
                n_estimators=1400,
                min_samples_split=2,
                min_samples_leaf=1,
                max_depth=50,
                criterion='gini',
                bootstrap=True,
            ),
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        cohen_kappa.append(cohen_kappa_score(y_test, y_pred))
        balanced_acc.append(balanced_accuracy_score(y_test, y_pred))

    return [np.mean(cohen_kappa), np.mean(balanced_acc)]

In [151]:
# Global random-forest benchmark: baseline features plus the 'dist_max' column.
global_model_test(w2v_dataset, features_base, ['dist_max'], target)

[0.684858332515145, 0.8436451301832207]

In [152]:
# Global random-forest benchmark: baseline features plus the 'dist_min' column.
global_model_test(w2v_dataset, features_base, ['dist_min'], target)

[0.7134661724818578, 0.858361727204543]

In [153]:
# Global random-forest benchmark: baseline features plus the 'dist_mean' column.
global_model_test(w2v_dataset, features_base, ['dist_mean'], target)

[0.7037666632841176, 0.8587948676738455]

In [154]:
# Global random-forest benchmark: baseline features plus the 'dist_std' column.
global_model_test(w2v_dataset, features_base, ['dist_std'], target)

[0.7007468167728946, 0.8570274831243972]

In [97]:
# Global random-forest benchmark: baseline features plus all four distance columns.
global_model_test(w2v_dataset, features_base, distance, target)

[0.6631330879321169, 0.831074413371906]

In [5]:
# Global random-forest benchmark: baseline features plus the 300 'vec_*' columns.
global_model_test(w2v_dataset, features_base, vec_col, target)

[0.5422009666892509, 0.7413211186113788]

### Vector based: SVM

In [58]:
# Stratified K-fold evaluation of an RBF-kernel SVM.
def svm_global_model_test(dataset, feature, vec, target, cc, gamma_setting,
                          n_splits=10, random_seed=7):
    """Cross-validate an RBF-kernel SVM on ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding the feature and target columns.
    feature, vec : list of str
        Column names; their concatenation is the model input.
    target : str or list of str
        Label column. A one-element list is flattened to a 1-D array.
    cc : float
        Value passed to ``SVC`` as ``C``.
    gamma_setting : str or float
        Value passed to ``SVC`` as ``gamma``.
    n_splits : int, default 10
        Number of stratified folds.
    random_seed : int, default 7
        Seed for the fold shuffling.

    Returns
    -------
    list
        ``[mean Cohen's kappa, mean balanced accuracy]`` over the folds.
    """
    # Rows are selected positionally (.iloc) below, so the frame's index does
    # not need to be reset first.
    X = dataset[feature + vec]
    # SVC.fit expects 1-D labels; a one-column frame only works through an
    # implicit conversion whose warning is hidden by the global warnings filter.
    y = np.ravel(dataset[target])

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    cohen_kappa = []
    balanced_acc = []
    for train_index, test_index in folds.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        svm_classifier = svm.SVC(C=cc, kernel='rbf', gamma=gamma_setting)
        svm_classifier.fit(X_train, y_train)
        y_pred = svm_classifier.predict(X_test)

        cohen_kappa.append(cohen_kappa_score(y_test, y_pred))
        balanced_acc.append(balanced_accuracy_score(y_test, y_pred))

    return [np.mean(cohen_kappa), np.mean(balanced_acc)]

In [14]:
# Grid search for the global SVM: C in 1..200 and gamma in {'auto', 'scale'},
# keeping the setting with the highest mean Cohen's kappa.
# NOTE(review): the winning setting is selected on the same CV folds whose
# score is printed, so the reported numbers are optimistically biased.
max_kappa = float('-inf')  # kappa can be negative, so 0 is not a safe floor
max_balanced_accuracy = 0
best_C = 0
best_gamma = ''
for gamma_setting in ['auto', 'scale']:
    for cc in range(1, 201):
        kappa, balanced_accuracy = svm_global_model_test(
            w2v_normalized, features_base, vec_col, target, cc, gamma_setting
        )
        if kappa > max_kappa:
            max_kappa = kappa
            max_balanced_accuracy = balanced_accuracy
            best_C = cc
            best_gamma = gamma_setting

print('Best kappa = {0}, best balanced accuracy = {1}, C = {2}, gamma is {3}'.format(max_kappa, max_balanced_accuracy, best_C, best_gamma))

Best kappa = 0.564415757672675, best balanced accuracy = 0.7688947819564984, C = 111, gamma is scale


## LocalModel Test

### Benchmark: Random Forest

In [155]:
# One frame per domain: the rows of w2v_dataset whose 'domain_x' equals the
# given domain code, in the order fruit, tool, music, furn, garm.
local_fruit, local_tool, local_music, local_furniture, local_garments = (
    w2v_dataset.loc[w2v_dataset['domain_x'] == domain_code]
    for domain_code in ('fruit', 'tool', 'music', 'furn', 'garm')
)

local_list = [local_fruit, local_tool, local_music, local_furniture, local_garments]

# Per-domain random-forest evaluation, averaged over the domains.
def local_model_test(dataset_list, feature, distance, target):
    """Run ``global_model_test`` on each frame and average the scores.

    Parameters
    ----------
    dataset_list : list
        One dataset per domain; each is passed to ``global_model_test``.
    feature, distance : list of str
        Column names forwarded unchanged to ``global_model_test``.
    target : list of str
        Label column, forwarded unchanged.

    Returns
    -------
    list
        ``[mean kappa, mean balanced accuracy]`` across the datasets.
    """
    per_domain_scores = [
        global_model_test(domain_frame, feature, distance, target)
        for domain_frame in dataset_list
    ]
    kappas = [scores[0] for scores in per_domain_scores]
    balanced_accuracies = [scores[1] for scores in per_domain_scores]
    return [np.mean(kappas), np.mean(balanced_accuracies)]

In [156]:
# Per-domain random-forest benchmark: baseline features plus 'dist_std'.
local_model_test(local_list, features_base, ['dist_std'], target)

[0.6155352431605264, 0.8149603174603174]

In [157]:
# Per-domain random-forest benchmark: baseline features plus 'dist_mean'.
local_model_test(local_list, features_base, ['dist_mean'], target)

[0.6380471507788009, 0.8224484126984126]

In [158]:
# Per-domain random-forest benchmark: baseline features plus 'dist_min'.
local_model_test(local_list, features_base, ['dist_min'], target)

[0.6392289675652201, 0.8270992063492063]

In [160]:
# Per-domain random-forest benchmark: baseline features plus 'dist_min' and 'dist_mean'.
local_model_test(local_list, features_base, ['dist_min', 'dist_mean'], target)

[0.6483147051286842, 0.8299087301587301]

In [40]:
# NOTE(review): ad-hoc inspection of the full frame. The output recorded for
# this cell ends in 'dis_min', 'dist_mean', 'dist_max' (no 'dist_std'), unlike
# the load cell's output, so it appears to come from a different kernel state.
# Confirm, and consider removing this cell.
w2v_dataset

Unnamed: 0,Synsets,domain_x,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,polyscore_max_x,...,ngram_200y_mean,ngram_200y_max,ngram_400y_mean,ngram_400y_max,ngram_500y_mean,ngram_500y_max,norm,dis_min,dist_mean,dist_max
0,Synset('adjustable_wrench.n.01'),tool,1,7,0.0,1.012903,0.563173,17,2,1,...,1.380492e-08,5.067315e-08,6.919673e-09,5.067315e-08,5.336133e-09,5.067315e-08,adjustable_wrench,0.416576,45.038897,0.484202
1,Synset('allen_wrench.n.01'),tool,1,0,0.0,1.012903,0.391092,12,1,1,...,7.715051e-09,6.483798e-08,3.867145e-09,6.483798e-08,2.982164e-09,6.483798e-08,allen_wrench,0.549829,54.982889,0.549829
2,Synset('alligator_wrench.n.01'),tool,1,0,0.0,1.012903,1.517437,16,1,1,...,1.237592e-09,1.499339e-08,6.203393e-10,1.499339e-08,4.783770e-10,1.499339e-08,alligator_wrench,0.431030,43.102965,0.431030
3,Synset('awl.n.01'),tool,1,2,15.7,0.911613,0.985552,3,1,1,...,5.869817e-07,2.510665e-06,6.210211e-07,6.602901e-06,5.069033e-07,6.602901e-06,awl,0.327994,32.799363,0.327994
4,Synset('backsaw.n.01'),tool,1,0,0.0,1.114194,1.110701,7,2,1,...,1.451245e-08,3.507272e-08,9.018908e-09,2.260505e-07,6.954965e-09,2.260505e-07,backsaw,0.290454,32.644600,0.362438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,Synset('ballet_skirt.n.01'),garm,1,0,0.0,0.947552,0.578283,4,2,2,...,1.531570e-07,8.004256e-07,1.136149e-07,8.004256e-07,9.223624e-08,1.686392e-06,ballet_skirt,0.198830,23.115268,0.263475
776,Synset('mess_jacket.n.01'),garm,1,0,0.0,1.158120,1.652238,11,3,1,...,1.358353e-08,5.083992e-08,6.963715e-09,5.547362e-08,5.370096e-09,5.547362e-08,mess_jacket,0.543115,56.592911,0.582620
777,Synset('long_johns.n.01'),garm,1,0,0.0,1.052836,0.479149,10,1,1,...,1.393969e-08,1.103470e-07,6.987224e-09,1.103470e-07,5.388225e-09,1.103470e-07,long_johns,0.464427,46.442714,0.464427
778,Synset('undies.n.01'),garm,1,0,0.0,1.158120,0.280880,6,1,1,...,3.095668e-08,2.565946e-07,2.025520e-08,3.896998e-07,1.561987e-08,3.896998e-07,undies,0.501280,50.128013,0.501280


### Vector based: SVM

In [18]:
# One frame per domain: the rows of w2v_dataset whose 'domain_x' equals the
# given domain code, in the order fruit, tool, music, furn, garm.
# NOTE(review): this repeats the per-domain split already made in the Random
# Forest section. The frames are cut from w2v_dataset, whereas the global SVM
# search runs on the min-max scaled w2v_normalized — confirm the SVM is meant
# to see unscaled inputs here.
local_fruit, local_tool, local_music, local_furniture, local_garments = (
    w2v_dataset.loc[w2v_dataset['domain_x'] == domain_code]
    for domain_code in ('fruit', 'tool', 'music', 'furn', 'garm')
)

local_list = [local_fruit, local_tool, local_music, local_furniture, local_garments]

# Per-domain SVM evaluation, averaged over the domains.
def svm_local_model_test(dataset_list, feature, vec, target, cc, gamma_setting):
    """Run ``svm_global_model_test`` on each frame and average the scores.

    Parameters
    ----------
    dataset_list : list
        One dataset per domain; each is passed to ``svm_global_model_test``.
    feature, vec : list of str
        Column names forwarded unchanged.
    target : list of str
        Label column, forwarded unchanged.
    cc, gamma_setting
        SVM settings forwarded unchanged.

    Returns
    -------
    list
        ``[mean kappa, mean balanced accuracy]`` across the datasets.
    """
    per_domain_scores = [
        svm_global_model_test(domain_frame, feature, vec, target, cc, gamma_setting)
        for domain_frame in dataset_list
    ]
    kappas = [scores[0] for scores in per_domain_scores]
    balanced_accuracies = [scores[1] for scores in per_domain_scores]
    return [np.mean(kappas), np.mean(balanced_accuracies)]

In [19]:
# Grid search for the per-domain SVM: C in 1..200 and gamma in
# {'auto', 'scale'}, keeping the setting with the highest mean Cohen's kappa.
# NOTE(review): the winning setting is selected on the same CV folds whose
# score is printed, so the reported numbers are optimistically biased.
svm_extra_columns = vec_col + ['ngram_100y_max', 'ngram_400y_mean']  # loop-invariant

max_kappa = float('-inf')  # kappa can be negative, so 0 is not a safe floor
max_balanced_accuracy = 0
best_C = 0
best_gamma = ''
for gamma_setting in ['auto', 'scale']:
    for cc in range(1, 201):
        kappa, balanced_accuracy = svm_local_model_test(
            local_list, features_base, svm_extra_columns, target, cc, gamma_setting
        )
        if kappa > max_kappa:
            max_kappa = kappa
            max_balanced_accuracy = balanced_accuracy
            best_C = cc
            best_gamma = gamma_setting

print('Best kappa = {0}, best balanced accuracy = {1}, C = {2}, gamma is {3}'.format(max_kappa, max_balanced_accuracy, best_C, best_gamma))

Best kappa = 0.560871366280561, best balanced accuracy = 0.7683412698412698, C = 198, gamma is scale


## TransferModel Test

### Benchmark: Random Forest

In [145]:
# Leave-one-domain-out training frames: each holds every row of w2v_dataset
# whose 'domain_x' differs from the given code, in the order fruit, tool,
# music, furn, garm.
unseen_fruit, unseen_tool, unseen_music, unseen_furniture, unseen_garments = (
    w2v_dataset.loc[w2v_dataset['domain_x'] != held_out_code]
    for held_out_code in ('fruit', 'tool', 'music', 'furn', 'garm')
)

unseen_list = [unseen_fruit, unseen_tool, unseen_music, unseen_furniture, unseen_garments]

def transfer_model_test(train_list, test_list, feature, distance, target, random_seed=7):
    """Train SMOTE + random forest on each training frame, score on its paired test frame.

    ``train_list[i]`` is paired with ``test_list[i]``.

    Parameters
    ----------
    train_list, test_list : list of pandas.DataFrame
        Equal-length lists of training and testing frames.
    feature, distance : list of str
        Column names; their concatenation is the model input.
    target : str or list of str
        Label column. A one-element list is flattened to a 1-D array.
    random_seed : int, default 7
        Seed shared by SMOTE and the forest.

    Returns
    -------
    list
        ``[mean Cohen's kappa, mean balanced accuracy]`` over the pairs.

    Raises
    ------
    ValueError
        If the two lists differ in length.
    """
    # Pairing is positional, so a length mismatch would otherwise drop or
    # miss frames without a clear error.
    if len(train_list) != len(test_list):
        raise ValueError(
            'train_list and test_list must have the same length '
            '({0} != {1})'.format(len(train_list), len(test_list))
        )

    feature_list = feature + distance
    transfer_kappa_list = []
    transfer_balancedAcc_list = []
    for training_data, testing_data in zip(train_list, test_list):
        X_train = training_data[feature_list]
        X_test = testing_data[feature_list]
        # Estimators and metrics expect 1-D labels, not a one-column frame.
        y_train = np.ravel(training_data[target])
        y_test = np.ravel(testing_data[target])

        # The imblearn pipeline applies SMOTE while fitting only; prediction
        # goes straight to the forest, so the test frame is never resampled.
        # oob_score is not requested because the OOB estimate was never read.
        model = make_pipeline(
            SMOTE(random_state=random_seed, k_neighbors=2),
            RandomForestClassifier(
                random_state=random_seed,
                max_features='sqrt',
                n_estimators=1400,
                min_samples_split=2,
                min_samples_leaf=1,
                max_depth=50,
                criterion='gini',
                bootstrap=True,
            ),
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        transfer_kappa_list.append(cohen_kappa_score(y_test, y_pred))
        transfer_balancedAcc_list.append(balanced_accuracy_score(y_test, y_pred))

    return [np.mean(transfer_kappa_list), np.mean(transfer_balancedAcc_list)]

In [146]:
# Transfer benchmark: train on the other domains (unseen_list), test on the
# matching single-domain frame (local_list); baseline features plus 'dist_std'.
transfer_model_test(unseen_list, local_list, features_base, ['dist_std'], target)

[0.48615842665407644, 0.7813716087673764]

In [147]:
# Transfer benchmark: train on the other domains (unseen_list), test on the
# matching single-domain frame (local_list); baseline features plus 'dist_mean'.
transfer_model_test(unseen_list, local_list, features_base, ['dist_mean'], target)

[0.5218030080210475, 0.8039889364200585]

In [148]:
# Transfer benchmark: train on the other domains (unseen_list), test on the
# matching single-domain frame (local_list); baseline features plus 'dist_max'.
transfer_model_test(unseen_list, local_list, features_base, ['dist_max'], target)

[0.5124772789121373, 0.7942018585070195]

In [149]:
# Transfer benchmark: train on the other domains (unseen_list), test on the
# matching single-domain frame (local_list); baseline features plus 'dist_min'.
transfer_model_test(unseen_list, local_list, features_base, ['dist_min'], target)

[0.5314563733456981, 0.7914294081380764]

In [49]:
# NOTE(review): ad-hoc inspection of the full frame. The output recorded for
# this cell ends in 'dis_min', 'dist_mean', 'dist_max' (no 'dist_std'), and
# its values differ in scale from the load cell's output, so it appears to
# come from a different kernel state. Confirm, and consider removing this cell.
w2v_dataset

Unnamed: 0,Synsets,domain_x,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,polyscore_max_x,...,ngram_200y_mean,ngram_200y_max,ngram_400y_mean,ngram_400y_max,ngram_500y_mean,ngram_500y_max,norm,dis_min,dist_mean,dist_max
0,Synset('adjustable_wrench.n.01'),tool,1,7,0.0,1.012903,0.563173,17,2,1,...,1.380492e-08,5.067315e-08,6.919673e-09,5.067315e-08,5.336133e-09,5.067315e-08,adjustable_wrench,41.657570,45.038897,48.420227
1,Synset('allen_wrench.n.01'),tool,1,0,0.0,1.012903,0.391092,12,1,1,...,7.715051e-09,6.483798e-08,3.867145e-09,6.483798e-08,2.982164e-09,6.483798e-08,allen_wrench,54.982889,54.982889,54.982889
2,Synset('alligator_wrench.n.01'),tool,1,0,0.0,1.012903,1.517437,16,1,1,...,1.237592e-09,1.499339e-08,6.203393e-10,1.499339e-08,4.783770e-10,1.499339e-08,alligator_wrench,43.102965,43.102965,43.102965
3,Synset('awl.n.01'),tool,1,2,15.7,0.911613,0.985552,3,1,1,...,5.869817e-07,2.510665e-06,6.210211e-07,6.602901e-06,5.069033e-07,6.602901e-06,awl,32.799363,32.799363,32.799363
4,Synset('backsaw.n.01'),tool,1,0,0.0,1.114194,1.110701,7,2,1,...,1.451245e-08,3.507272e-08,9.018908e-09,2.260505e-07,6.954965e-09,2.260505e-07,backsaw,29.045403,32.644600,36.243793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,Synset('ballet_skirt.n.01'),garm,1,0,0.0,0.947552,0.578283,4,2,2,...,1.531570e-07,8.004256e-07,1.136149e-07,8.004256e-07,9.223624e-08,1.686392e-06,ballet_skirt,19.883022,23.115268,26.347515
776,Synset('mess_jacket.n.01'),garm,1,0,0.0,1.158120,1.652238,11,3,1,...,1.358353e-08,5.083992e-08,6.963715e-09,5.547362e-08,5.370096e-09,5.547362e-08,mess_jacket,54.311496,56.592911,58.261991
777,Synset('long_johns.n.01'),garm,1,0,0.0,1.052836,0.479149,10,1,1,...,1.393969e-08,1.103470e-07,6.987224e-09,1.103470e-07,5.388225e-09,1.103470e-07,long_johns,46.442714,46.442714,46.442714
778,Synset('undies.n.01'),garm,1,0,0.0,1.158120,0.280880,6,1,1,...,3.095668e-08,2.565946e-07,2.025520e-08,3.896998e-07,1.561987e-08,3.896998e-07,undies,50.128013,50.128013,50.128013


### Vector based: SVM

In [20]:
# Leave-one-domain-out training frames: each holds every row of w2v_dataset
# whose 'domain_x' differs from the given code, in the order fruit, tool,
# music, furn, garm.
# NOTE(review): this repeats the split already made in the Random Forest
# transfer section. The frames are cut from w2v_dataset, whereas the global
# SVM search runs on the min-max scaled w2v_normalized — confirm the SVM is
# meant to see unscaled inputs here.
unseen_fruit, unseen_tool, unseen_music, unseen_furniture, unseen_garments = (
    w2v_dataset.loc[w2v_dataset['domain_x'] != held_out_code]
    for held_out_code in ('fruit', 'tool', 'music', 'furn', 'garm')
)

unseen_list = [unseen_fruit, unseen_tool, unseen_music, unseen_furniture, unseen_garments]

def svm_transfer_model_test(train_list, test_list, feature, distance, target, cc, gamma_setting):
    """Train an RBF-kernel SVM on each training frame, score on its paired test frame.

    ``train_list[i]`` is paired with ``test_list[i]``.

    Parameters
    ----------
    train_list, test_list : list of pandas.DataFrame
        Equal-length lists of training and testing frames.
    feature, distance : list of str
        Column names; their concatenation is the model input.
    target : str or list of str
        Label column. A one-element list is flattened to a 1-D array.
    cc : float
        Value passed to ``SVC`` as ``C``.
    gamma_setting : str or float
        Value passed to ``SVC`` as ``gamma``.

    Returns
    -------
    list
        ``[mean Cohen's kappa, mean balanced accuracy]`` over the pairs.

    Raises
    ------
    ValueError
        If the two lists differ in length.
    """
    # Pairing is positional, so a length mismatch would otherwise drop or
    # miss frames without a clear error.
    if len(train_list) != len(test_list):
        raise ValueError(
            'train_list and test_list must have the same length '
            '({0} != {1})'.format(len(train_list), len(test_list))
        )

    feature_list = feature + distance
    transfer_kappa_list = []
    transfer_balancedAcc_list = []
    for training_data, testing_data in zip(train_list, test_list):
        X_train = training_data[feature_list]
        X_test = testing_data[feature_list]
        # SVC.fit and the metrics expect 1-D labels, not a one-column frame.
        y_train = np.ravel(training_data[target])
        y_test = np.ravel(testing_data[target])

        svm_classifier = svm.SVC(C=cc, kernel='rbf', gamma=gamma_setting)
        svm_classifier.fit(X_train, y_train)
        y_pred = svm_classifier.predict(X_test)

        transfer_kappa_list.append(cohen_kappa_score(y_test, y_pred))
        transfer_balancedAcc_list.append(balanced_accuracy_score(y_test, y_pred))

    return [np.mean(transfer_kappa_list), np.mean(transfer_balancedAcc_list)]

In [21]:
# Grid search for the transfer SVM: C in 1..200 and gamma in {'auto', 'scale'},
# keeping the setting with the highest mean Cohen's kappa.
# NOTE(review): the winning setting is selected on the same held-out frames
# whose score is printed, so the reported numbers are optimistically biased.
svm_extra_columns = vec_col + ['ngram_5y_mean', 'ngram_50y_mean', 'ngram_100y_mean', 'ngram_200y_max', 'ngram_500y_max']  # loop-invariant

max_kappa = float('-inf')  # kappa can be negative, so 0 is not a safe floor
max_balanced_accuracy = 0
best_C = 0
best_gamma = ''
for gamma_setting in ['auto', 'scale']:
    for cc in range(1, 201):
        kappa, balanced_accuracy = svm_transfer_model_test(
            unseen_list, local_list, features_base, svm_extra_columns, target, cc, gamma_setting
        )
        if kappa > max_kappa:
            max_kappa = kappa
            max_balanced_accuracy = balanced_accuracy
            best_C = cc
            best_gamma = gamma_setting

print('Best kappa = {0}, best balanced accuracy = {1}, C = {2}, gamma is {3}'.format(max_kappa, max_balanced_accuracy, best_C, best_gamma))

Best kappa = 0.40680554548890785, best balanced accuracy = 0.6903773148027202, C = 66, gamma is scale


In [None]:
# NOTE(review): scratch average of five hard-coded numbers whose origin is not
# shown in this notebook — confirm where they come from, or compute them from
# the stored results instead of retyping them.
np.mean([0.862, 0.761, 0.562, 0.5695, 0.4001])