In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score
from itertools import combinations

import warnings
warnings.filterwarnings("ignore")

In [2]:
# read the pre-processed data all agreed
data = pd.read_csv('./features_google_ngram.csv', index_col=None)
base_feature = ['nrdirhypers_x',
                'nrhypos_x',
                'nrpartrels_normalised_x',
                'depthfromtopsynset_normalised_x',
                'glosslength_normalised_x',
                'minwordlength_x',
                'nroflemmas_x',
                'polyscore_max_x']

target = ['vote_x']

data

Unnamed: 0,Synsets,domain_x,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,polyscore_max_x,...,ngram_50y_mean,ngram_50y_max,ngram_100y_mean,ngram_100y_max,ngram_200y_mean,ngram_200y_max,ngram_400y_mean,ngram_400y_max,ngram_500y_mean,ngram_500y_max
0,Synset('adjustable_wrench.n.01'),tool,1,7,0.0,1.012903,0.563173,17,2,1,...,2.667275e-08,5.067315e-08,2.252890e-08,5.067315e-08,1.380492e-08,5.067315e-08,6.919673e-09,5.067315e-08,5.336133e-09,5.067315e-08
1,Synset('allen_wrench.n.01'),tool,1,0,0.0,1.012903,0.391092,12,1,1,...,2.025710e-08,6.483798e-08,1.515313e-08,6.483798e-08,7.715051e-09,6.483798e-08,3.867145e-09,6.483798e-08,2.982164e-09,6.483798e-08
2,Synset('alligator_wrench.n.01'),tool,1,0,0.0,1.012903,1.517437,16,1,1,...,1.955061e-10,5.970550e-10,6.184165e-10,7.804026e-09,1.237592e-09,1.499339e-08,6.203393e-10,1.499339e-08,4.783770e-10,1.499339e-08
3,Synset('awl.n.01'),tool,1,2,15.7,0.911613,0.985552,3,1,1,...,2.895038e-07,3.752697e-07,3.562540e-07,6.002532e-07,5.869817e-07,2.510665e-06,6.210211e-07,6.602901e-06,5.069033e-07,6.602901e-06
4,Synset('backsaw.n.01'),tool,1,0,0.0,1.114194,1.110701,7,2,1,...,1.620242e-08,2.446695e-08,1.704462e-08,2.446695e-08,1.451245e-08,3.507272e-08,9.018908e-09,2.260505e-07,6.954965e-09,2.260505e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,Synset('ballet_skirt.n.01'),garm,1,0,0.0,0.947552,0.578283,4,2,2,...,4.059229e-07,8.004256e-07,2.485961e-07,8.004256e-07,1.531570e-07,8.004256e-07,1.136149e-07,8.004256e-07,9.223624e-08,1.686392e-06
835,Synset('mess_jacket.n.01'),garm,1,0,0.0,1.158120,1.652238,11,3,1,...,9.969528e-09,8.272560e-09,1.233276e-08,2.136213e-08,1.358353e-08,5.083992e-08,6.963715e-09,5.547362e-08,5.370096e-09,5.547362e-08
836,Synset('long_johns.n.01'),garm,1,0,0.0,1.052836,0.479149,10,1,1,...,4.974292e-08,1.103470e-07,2.680743e-08,1.103470e-07,1.393969e-08,1.103470e-07,6.987224e-09,1.103470e-07,5.388225e-09,1.103470e-07
837,Synset('undies.n.01'),garm,1,0,0.0,1.158120,0.280880,6,1,1,...,7.986401e-08,2.565946e-07,5.787213e-08,2.565946e-07,3.095668e-08,2.565946e-07,2.025520e-08,3.896998e-07,1.561987e-08,3.896998e-07


## Selection for GlobalModel

In [27]:
candidate_feature_list = ['ngram_1y_mean', 'ngram_1y_max',
                      'ngram_5y_mean', 'ngram_5y_max',
                      'ngram_10y_mean', 'ngram_10y_max',
                      'ngram_20y_mean', 'ngram_20y_max',
                      'ngram_50y_mean', 'ngram_50y_max',
                      'ngram_100y_mean', 'ngram_100y_max',
                      'ngram_200y_mean', 'ngram_200y_max',
                      'ngram_400y_mean', 'ngram_400y_max',
                      'ngram_500y_mean', 'ngram_500y_max']

In [26]:
# split training set and testing set using K-Flod
def new_features_global_model_test(dataset, feature, new_features, target):
    K = 10
    random_seed = 7 # R
    data = dataset.reset_index()
    if new_features is None:
        feature_list = feature
    else:
        feature_list = feature + [new_features]
    X = data[feature_list]
    y = data[target]

    K_Flod = StratifiedKFold(n_splits=K, shuffle=True, random_state=random_seed)
    K_Flod.get_n_splits(X, y)
    cohen_kappa = []
    balanced_acc = []
    for train_index, test_index in K_Flod.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # SMOTE algorithm
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # define random forest model
        rf = RandomForestClassifier(random_state=random_seed, max_features='sqrt', n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_depth=50, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)

        # predict and make score
        pipeline = make_pipeline(smote, rf)
        y_pred = pipeline.predict(X_test)

        kappa = cohen_kappa_score(y_test, y_pred)
        cohen_kappa.append(kappa)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        balanced_acc.append(balanced_accuracy)

    result_kappa = np.mean(cohen_kappa)
    results_acc = np.mean(balanced_acc)

    # importance of features
    importance = rf.feature_importances_
    importance = pd.DataFrame([feature_list, importance]).transpose()
    importance = importance.rename(columns={0:'feature', 1:'importance'}).sort_values('importance', ascending=False)

    return result_kappa, importance

### Bottom-up

In [7]:
base_feature_0 = base_feature
candidate_feature_list_0 = candidate_feature_list

best_kappa = 0
best_importance = None

for candidate in candidate_feature_list_0:
    kappa, importance = new_features_global_model_test(data, base_feature_0, candidate, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print(candidate + ': kappa=' + str(best_kappa))
    else:
        print(candidate + ' skipped')

print('Finish!')
best_importance

ngram_1y_mean: kappa=0.6895131395336889
ngram_1y_max skipped
ngram_5y_mean skipped
ngram_5y_max: kappa=0.6974403198307233
ngram_10y_mean skipped
ngram_10y_max skipped
ngram_20y_mean skipped
ngram_20y_max skipped
ngram_50y_mean skipped
ngram_50y_max skipped
ngram_100y_mean skipped
ngram_100y_max: kappa=0.6996241732892688
ngram_200y_mean skipped
ngram_200y_max skipped
ngram_400y_mean skipped
ngram_400y_max skipped
ngram_500y_mean skipped
ngram_500y_max skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.41506
4,glosslength_normalised_x,0.158386
5,minwordlength_x,0.118692
8,ngram_100y_max,0.086342
2,nrpartrels_normalised_x,0.076907
1,nrhypos_x,0.05546
7,polyscore_max_x,0.052343
6,nroflemmas_x,0.033962
0,nrdirhypers_x,0.002849


In [16]:
base_feature_1 = base_feature + ['ngram_100y_max']
candidate_feature_list_1 = candidate_feature_list
candidate_feature_list_1.remove('ngram_100y_max')


best_kappa = 0.69962
best_importance = None

for candidate in candidate_feature_list_1:
    kappa, importance = new_features_global_model_test(data, base_feature_1, candidate, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print(candidate + ': kappa=' + str(best_kappa))
    else:
        print(candidate + ' skipped')

print('Finish!')
best_importance

ngram_1y_mean skipped
ngram_1y_max skipped
ngram_5y_mean: kappa=0.6997501816644983
ngram_5y_max skipped
ngram_10y_mean skipped
ngram_10y_max skipped
ngram_20y_mean skipped
ngram_20y_max skipped
ngram_50y_mean skipped
ngram_50y_max skipped
ngram_100y_mean skipped
ngram_200y_mean skipped
ngram_200y_max: kappa=0.7020794893980835
ngram_400y_mean skipped
ngram_400y_max skipped
ngram_500y_mean skipped
ngram_500y_max skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.390578
4,glosslength_normalised_x,0.152431
5,minwordlength_x,0.113088
2,nrpartrels_normalised_x,0.078705
8,ngram_100y_max,0.067032
9,ngram_200y_max,0.066912
1,nrhypos_x,0.050239
7,polyscore_max_x,0.047436
6,nroflemmas_x,0.030826
0,nrdirhypers_x,0.002753


In [34]:
base_feature_2 = base_feature + ['ngram_100y_max', 'ngram_200y_max']
candidate_feature_list_2 = candidate_feature_list_1
candidate_feature_list_2.remove('ngram_200y_max')

best_kappa = 0.70208
best_importance = None

for candidate in candidate_feature_list_2:
    kappa, importance = new_features_global_model_test(data, base_feature_2, candidate, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print(candidate + ': kappa=' + str(best_kappa))
    else:
        print(candidate + ' skipped')

print('Finish!')
best_importance

ngram_1y_mean skipped
ngram_1y_max skipped
ngram_5y_mean skipped
ngram_5y_max skipped
ngram_10y_mean skipped
ngram_10y_max skipped
ngram_20y_mean skipped
ngram_20y_max skipped
ngram_50y_mean skipped
ngram_50y_max skipped
ngram_100y_mean skipped
ngram_200y_mean skipped
ngram_400y_mean skipped
ngram_400y_max skipped
ngram_500y_mean skipped
ngram_500y_max skipped
Finish!


Bottom-up best features: ['ngram_100y_max', 'ngram_200y_max']
Best kappa = 0.70208

### Top-down

In [28]:
candidate_feature_list = ['ngram_1y_mean', 'ngram_1y_max',
                          'ngram_5y_mean', 'ngram_5y_max',
                          'ngram_10y_mean', 'ngram_10y_max',
                          'ngram_20y_mean', 'ngram_20y_max',
                          'ngram_50y_mean', 'ngram_50y_max',
                          'ngram_100y_mean', 'ngram_100y_max',
                          'ngram_200y_mean', 'ngram_200y_max',
                          'ngram_400y_mean', 'ngram_400y_max',
                          'ngram_500y_mean', 'ngram_500y_max']

In [29]:
training_features_0 = base_feature + candidate_feature_list

best_kappa, importance = new_features_global_model_test(data, training_features_0, None, target)
print('kappa=' + str(best_kappa))
importance

kappa=0.6790285516625253


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.323682
4,glosslength_normalised_x,0.122123
5,minwordlength_x,0.085529
2,nrpartrels_normalised_x,0.066154
25,ngram_500y_max,0.050918
23,ngram_400y_max,0.031575
7,polyscore_max_x,0.03134
1,nrhypos_x,0.031194
19,ngram_100y_max,0.023715
21,ngram_200y_max,0.022607


In [82]:
candidate_feature_list_1 = candidate_feature_list

best_kappa = 0.67903
best_importance = None
for idx in range(len(candidate_feature_list_1)):
    input_candidates = candidate_feature_list_1[0:idx] + candidate_feature_list_1[idx+1:]
    training_features_1 = base_feature + input_candidates
    kappa, importance = new_features_global_model_test(data, training_features_1, None, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print('eliminate ' + candidate_feature_list_1[idx] + ': kappa=' + str(best_kappa))
    else:
        print('eliminate ' + candidate_feature_list_1[idx] + ' skipped')

print('Finish!')
best_importance

eliminate ngram_1y_mean: kappa=0.6886693939459642
eliminate ngram_1y_max skipped
eliminate ngram_5y_mean skipped
eliminate ngram_5y_max skipped
eliminate ngram_10y_mean: kappa=0.6907976036343442
eliminate ngram_10y_max skipped
eliminate ngram_20y_mean: kappa=0.6910372700097922
eliminate ngram_20y_max: kappa=0.6921216413820298
eliminate ngram_50y_mean skipped
eliminate ngram_50y_max skipped
eliminate ngram_100y_mean skipped
eliminate ngram_100y_max skipped
eliminate ngram_200y_mean: kappa=0.6992525027107436
eliminate ngram_200y_max skipped
eliminate ngram_400y_mean skipped
eliminate ngram_400y_max skipped
eliminate ngram_500y_mean skipped
eliminate ngram_500y_max skipped


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.322882
4,glosslength_normalised_x,0.125023
5,minwordlength_x,0.085909
2,nrpartrels_normalised_x,0.069496
24,ngram_500y_max,0.049746
1,nrhypos_x,0.031885
22,ngram_400y_max,0.031766
7,polyscore_max_x,0.029908
19,ngram_100y_max,0.0239
20,ngram_200y_max,0.02339


In [84]:
candidate_feature_list_1.remove('ngram_200y_mean')
candidate_feature_list_2 = candidate_feature_list_1

best_kappa = 0.69925
best_importance = None

for idx in range(len(candidate_feature_list_2)):
    input_candidates = candidate_feature_list_2[0:idx] + candidate_feature_list_2[idx+1:]
    training_features_1 = base_feature + input_candidates
    kappa, importance = new_features_global_model_test(data, training_features_1, None, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print('eliminate ' + candidate_feature_list_2[idx] + ': kappa=' + str(best_kappa))
    else:
        print('eliminate ' + candidate_feature_list_2[idx] + ' skipped')

print('Finish!')
best_importance

eliminate ngram_1y_mean skipped
eliminate ngram_1y_max skipped
eliminate ngram_5y_mean skipped
eliminate ngram_5y_max skipped
eliminate ngram_10y_mean skipped
eliminate ngram_10y_max skipped
eliminate ngram_20y_mean skipped
eliminate ngram_20y_max skipped
eliminate ngram_50y_mean skipped
eliminate ngram_50y_max skipped
eliminate ngram_100y_mean skipped
eliminate ngram_100y_max skipped
eliminate ngram_200y_max skipped
eliminate ngram_400y_mean skipped
eliminate ngram_400y_max skipped
eliminate ngram_500y_mean skipped
eliminate ngram_500y_max skipped


### Grid search

run in Python script

In [None]:
grid_base = base_feature
grid_candidate = candidate_feature_list

best_kappa = 0
best_importance = None

print('Start...')
for candidate_num in range(len(grid_candidate)):
    candidate_num += 1
    candidate_sets = list(combinations(candidate_feature_list, candidate_num))
    for candidate_group in candidate_sets:
        candidate_group = list(candidate_group)
        training_features = grid_base + candidate_group
        kappa, importance = new_features_global_model_test(data, training_features, None, target)
        if kappa > best_kappa:
            best_kappa = kappa
            best_importance = importance
            string = 'Features: ' + str(candidate_group) + ' kappa=' + str(best_kappa) + '\n'
            with open('./grid_search_GlobalModel.txt', 'a+') as f:
                f.write(string)
            print(string)
with open('./grid_search_GlobalModel.txt', 'a+') as f:
        f.write(importance)
print('Finish!')

## Selection for LocalModel

In [3]:
candidate_feature_list = ['ngram_1y_mean', 'ngram_1y_max',
                          'ngram_5y_mean', 'ngram_5y_max',
                          'ngram_10y_mean', 'ngram_10y_max',
                          'ngram_20y_mean', 'ngram_20y_max',
                          'ngram_50y_mean', 'ngram_50y_max',
                          'ngram_100y_mean', 'ngram_100y_max',
                          'ngram_200y_mean', 'ngram_200y_max',
                          'ngram_400y_mean', 'ngram_400y_max',
                          'ngram_500y_mean', 'ngram_500y_max']

In [4]:
# generate the local dataframe for different domains
local_fruit = data.loc[data['domain_x']=='fruit']
local_tool = data.loc[data['domain_x']=='tool']
local_music = data.loc[data['domain_x']=='music']
local_furniture = data.loc[data['domain_x']=='furn']
local_garments = data.loc[data['domain_x']=='garm']

local_list = [local_fruit, local_tool, local_music, local_furniture, local_garments]

def new_feature_local_model_test(dataset_list, base_feature, new_feature, target):
    random_seed = 7 # R
    K = 10
    if new_feature is None:
        feature_list = base_feature
    else:
        feature_list = base_feature + [new_feature]

    local_kappa_list = []
    # local_balancedAcc_list = []
    for dataset in dataset_list:
        X = dataset[feature_list]
        y = dataset[target]
        K_Flod = StratifiedKFold(n_splits=K, shuffle=True, random_state=random_seed)
        K_Flod.get_n_splits(X, y)
        cohen_kappa = []
        balanced_acc = []
        for train_index, test_index in K_Flod.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # SMOTE algorithm
            smote = SMOTE(random_state=random_seed, k_neighbors=2)
            X_train, y_train = smote.fit_resample(X_train, y_train)

            # define random forest model
            rf = RandomForestClassifier(random_state=random_seed, max_features='sqrt', n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_depth=50, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)

            # predict and make score
            pipeline = make_pipeline(smote, rf)
            y_pred = pipeline.predict(X_test)

            kappa = cohen_kappa_score(y_test, y_pred)
            cohen_kappa.append(kappa)
            balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
            balanced_acc.append(balanced_accuracy)

        cohen_kappa_mean = np.mean(cohen_kappa)
        # balanced_acc_mean = np.mean(balanced_acc)
        local_kappa_list.append(cohen_kappa_mean)
        # local_balancedAcc_list.append(balanced_acc_mean)

    local_kappa = np.mean(local_kappa_list)
    # local_acc = np.mean(local_balancedAcc_list)

    # importance of features
    importance = rf.feature_importances_
    importance = pd.DataFrame([feature_list, importance]).transpose()
    importance = importance.rename(columns={0:'feature', 1:'importance'}).sort_values('importance', ascending=False)

    return local_kappa, importance

### Bottom-up

In [7]:
base_feature_0 = base_feature
candidate_feature_list_0 = candidate_feature_list

best_kappa = 0
best_importance = None

for candidate in candidate_feature_list_0:
    kappa, importance = new_feature_local_model_test(local_list, base_feature_0, candidate, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print(candidate + ': kappa=' + str(best_kappa))
    else:
        print(candidate + ' skipped')

print('Finish!')
best_importance

ngram_1y_mean: kappa=0.6567146798792984
ngram_1y_max skipped
ngram_5y_mean: kappa=0.6724061180707366
ngram_5y_max skipped
ngram_10y_mean skipped
ngram_10y_max skipped
ngram_20y_mean: kappa=0.6798979095646819
ngram_20y_max skipped
ngram_50y_mean skipped
ngram_50y_max skipped
ngram_100y_mean skipped
ngram_100y_max skipped
ngram_200y_mean skipped
ngram_200y_max skipped
ngram_400y_mean skipped
ngram_400y_max skipped
ngram_500y_mean skipped
ngram_500y_max skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.273267
5,minwordlength_x,0.185142
4,glosslength_normalised_x,0.17422
1,nrhypos_x,0.171676
8,ngram_20y_mean,0.092765
7,polyscore_max_x,0.058391
6,nroflemmas_x,0.030873
2,nrpartrels_normalised_x,0.011504
0,nrdirhypers_x,0.002162


In [8]:
base_feature_1 = base_feature + ['ngram_20y_mean']
candidate_feature_list_0.remove('ngram_20y_mean')
candidate_feature_list_1 = candidate_feature_list_0

best_kappa = 0.67990
best_importance = None

for candidate in candidate_feature_list_1:
    kappa, importance = new_feature_local_model_test(local_list, base_feature_1, candidate, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print(candidate + ': kappa=' + str(best_kappa))
    else:
        print(candidate + ' skipped')

print('Finish!')
best_importance

ngram_1y_mean skipped
ngram_1y_max skipped
ngram_5y_mean skipped
ngram_5y_max: kappa=0.6847148871531882
ngram_10y_mean skipped
ngram_10y_max skipped
ngram_20y_max skipped
ngram_50y_mean skipped
ngram_50y_max skipped
ngram_100y_mean skipped
ngram_100y_max: kappa=0.6893742970279654
ngram_200y_mean skipped
ngram_200y_max skipped
ngram_400y_mean skipped
ngram_400y_max skipped
ngram_500y_mean skipped
ngram_500y_max skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.257575
5,minwordlength_x,0.170382
1,nrhypos_x,0.164066
4,glosslength_normalised_x,0.160209
8,ngram_20y_mean,0.076773
9,ngram_100y_max,0.076011
7,polyscore_max_x,0.053664
6,nroflemmas_x,0.029404
2,nrpartrels_normalised_x,0.010025
0,nrdirhypers_x,0.001891


In [10]:
base_feature_2 = base_feature + ['ngram_100y_max']
candidate_feature_list_1.remove('ngram_100y_max')
candidate_feature_list_2 = candidate_feature_list_1

best_kappa = 0.68937
best_importance = None

for candidate in candidate_feature_list_2:
    kappa, importance = new_feature_local_model_test(local_list, base_feature_2, candidate, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print(candidate + ': kappa=' + str(best_kappa))
    else:
        print(candidate + ' skipped')

print('Finish!')
best_importance

ngram_1y_mean skipped
ngram_1y_max skipped
ngram_5y_mean skipped
ngram_5y_max skipped
ngram_10y_mean skipped
ngram_10y_max skipped
ngram_20y_max skipped
ngram_50y_mean skipped
ngram_50y_max skipped
ngram_100y_mean skipped
ngram_200y_mean skipped
ngram_200y_max: kappa=0.700161288230784
ngram_400y_mean: kappa=0.7115015229441634
ngram_400y_max skipped
ngram_500y_mean skipped
ngram_500y_max skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.264289
5,minwordlength_x,0.173495
4,glosslength_normalised_x,0.166164
1,nrhypos_x,0.165204
8,ngram_100y_max,0.088688
7,polyscore_max_x,0.052278
9,ngram_400y_mean,0.04684
6,nroflemmas_x,0.029758
2,nrpartrels_normalised_x,0.010771
0,nrdirhypers_x,0.002513


In [11]:
base_feature_3 = base_feature + ['ngram_400y_mean']
candidate_feature_list_2.remove('ngram_400y_mean')
candidate_feature_list_3 = candidate_feature_list_2

best_kappa = 0.71150
best_importance = None

for candidate in candidate_feature_list_3:
    kappa, importance = new_feature_local_model_test(local_list, base_feature_3, candidate, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print(candidate + ': kappa=' + str(best_kappa))
    else:
        print(candidate + ' skipped')

print('Finish!')
best_importance

ngram_1y_mean skipped
ngram_1y_max skipped
ngram_5y_mean skipped
ngram_5y_max skipped
ngram_10y_mean skipped
ngram_10y_max skipped
ngram_20y_max skipped
ngram_50y_mean skipped
ngram_50y_max skipped
ngram_100y_mean skipped
ngram_200y_mean skipped
ngram_200y_max skipped
ngram_400y_max skipped
ngram_500y_mean skipped
ngram_500y_max skipped
Finish!


### Top-down

In [12]:
candidate_feature_list = ['ngram_1y_mean', 'ngram_1y_max',
                          'ngram_5y_mean', 'ngram_5y_max',
                          'ngram_10y_mean', 'ngram_10y_max',
                          'ngram_20y_mean', 'ngram_20y_max',
                          'ngram_50y_mean', 'ngram_50y_max',
                          'ngram_100y_mean', 'ngram_100y_max',
                          'ngram_200y_mean', 'ngram_200y_max',
                          'ngram_400y_mean', 'ngram_400y_max',
                          'ngram_500y_mean', 'ngram_500y_max']

In [13]:
training_features_0 = base_feature + candidate_feature_list

best_kappa, importance = new_feature_local_model_test(local_list, training_features_0, None, target)
print('kappa=' + str(best_kappa))
importance

kappa=0.6598485207395978


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.191351
5,minwordlength_x,0.122385
1,nrhypos_x,0.114978
4,glosslength_normalised_x,0.113005
8,ngram_1y_mean,0.041377
7,polyscore_max_x,0.034202
10,ngram_5y_mean,0.033819
23,ngram_400y_max,0.03004
25,ngram_500y_max,0.029216
12,ngram_10y_mean,0.02779


In [14]:
candidate_feature_list_1 = candidate_feature_list

best_kappa = 0.65985
best_importance = None
for idx in range(len(candidate_feature_list_1)):
    input_candidates = candidate_feature_list_1[0:idx] + candidate_feature_list_1[idx+1:]
    training_features_1 = base_feature + input_candidates
    kappa, importance = new_feature_local_model_test(local_list, training_features_1, None, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print('eliminate ' + candidate_feature_list_1[idx] + ': kappa=' + str(best_kappa))
    else:
        print('eliminate ' + candidate_feature_list_1[idx] + ' skipped')

print('Finish!')
best_importance

eliminate ngram_1y_mean skipped
eliminate ngram_1y_max: kappa=0.6688014159106257
eliminate ngram_5y_mean skipped
eliminate ngram_5y_max skipped
eliminate ngram_10y_mean skipped
eliminate ngram_10y_max skipped
eliminate ngram_20y_mean skipped
eliminate ngram_20y_max skipped
eliminate ngram_50y_mean skipped
eliminate ngram_50y_max skipped
eliminate ngram_100y_mean skipped
eliminate ngram_100y_max skipped
eliminate ngram_200y_mean skipped
eliminate ngram_200y_max: kappa=0.6707992748317102
eliminate ngram_400y_mean skipped
eliminate ngram_400y_max skipped
eliminate ngram_500y_mean skipped
eliminate ngram_500y_max skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.187932
5,minwordlength_x,0.120714
4,glosslength_normalised_x,0.114982
1,nrhypos_x,0.107681
8,ngram_1y_mean,0.042818
10,ngram_5y_mean,0.036929
7,polyscore_max_x,0.03499
12,ngram_10y_mean,0.032827
24,ngram_500y_max,0.030779
22,ngram_400y_max,0.030221


In [None]:
candidate_feature_list_1.remove('ngram_200y_max')
candidate_feature_list_2 = candidate_feature_list_1

best_kappa = 0.67080
best_importance = None
for idx in range(len(candidate_feature_list_2)):
    input_candidates = candidate_feature_list_2[0:idx] + candidate_feature_list_2[idx+1:]
    training_features_2 = base_feature + input_candidates
    kappa, importance = new_feature_local_model_test(local_list, training_features_2, None, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print('eliminate ' + candidate_feature_list_2[idx] + ': kappa=' + str(best_kappa))
    else:
        print('eliminate ' + candidate_feature_list_2[idx] + ' skipped')

print('Finish!')
best_importance

eliminate ngram_1y_mean skipped
eliminate ngram_1y_max: kappa=0.6747325849074625
eliminate ngram_5y_mean skipped
eliminate ngram_5y_max: kappa=0.6772453047532967
eliminate ngram_10y_mean: kappa=0.6777361050689039
eliminate ngram_10y_max: kappa=0.6829072095152015


In [16]:
candidate_feature_list_2.remove('ngram_10y_max')
candidate_feature_list_3 = candidate_feature_list_2

best_kappa = 0.68291
best_importance = None
for idx in range(len(candidate_feature_list_3)):
    input_candidates = candidate_feature_list_3[0:idx] + candidate_feature_list_3[idx+1:]
    training_features_3 = base_feature + input_candidates
    kappa, importance = new_feature_local_model_test(local_list, training_features_3, None, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print('eliminate ' + candidate_feature_list_3[idx] + ': kappa=' + str(best_kappa))
    else:
        print('eliminate ' + candidate_feature_list_3[idx] + ' skipped')

print('Finish!')
best_importance

eliminate ngram_1y_mean skipped
eliminate ngram_1y_max skipped
eliminate ngram_5y_mean skipped
eliminate ngram_5y_max skipped
eliminate ngram_10y_mean skipped
eliminate ngram_20y_mean skipped
eliminate ngram_20y_max skipped
eliminate ngram_50y_mean skipped
eliminate ngram_50y_max skipped
eliminate ngram_100y_mean: kappa=0.6841312481897445
eliminate ngram_100y_max skipped
eliminate ngram_200y_mean skipped
eliminate ngram_400y_mean skipped
eliminate ngram_400y_max skipped
eliminate ngram_500y_mean skipped
eliminate ngram_500y_max skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.181616
4,glosslength_normalised_x,0.117104
5,minwordlength_x,0.113387
1,nrhypos_x,0.111377
10,ngram_5y_mean,0.044512
8,ngram_1y_mean,0.042664
7,polyscore_max_x,0.036716
22,ngram_500y_max,0.034169
12,ngram_10y_mean,0.033796
20,ngram_400y_max,0.033067


In [None]:
candidate_feature_list_3.remove('ngram_100y_mean')
candidate_feature_list_4 = candidate_feature_list_3

best_kappa = 0.68413
best_importance = None
for idx in range(len(candidate_feature_list_4)):
    input_candidates = candidate_feature_list_4[0:idx] + candidate_feature_list_4[idx+1:]
    training_features_4 = base_feature + input_candidates
    kappa, importance = new_feature_local_model_test(local_list, training_features_4, None, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print('eliminate ' + candidate_feature_list_4[idx] + ': kappa=' + str(best_kappa))
    else:
        print('eliminate ' + candidate_feature_list_4[idx] + ' skipped')

print('Finish!')
best_importance

eliminate ngram_1y_mean skipped
eliminate ngram_1y_max skipped
eliminate ngram_5y_mean skipped
eliminate ngram_5y_max skipped
eliminate ngram_10y_mean: kappa=0.68450636706036
eliminate ngram_20y_mean skipped
eliminate ngram_20y_max skipped
eliminate ngram_50y_mean skipped
eliminate ngram_50y_max skipped
eliminate ngram_100y_max skipped
eliminate ngram_200y_mean skipped
eliminate ngram_400y_mean skipped


In [19]:
candidate_feature_list_4.remove('ngram_10y_mean')
candidate_feature_list_5 = candidate_feature_list_4

best_kappa = 0.68451
best_importance = None
for idx in range(len(candidate_feature_list_5)):
    input_candidates = candidate_feature_list_5[0:idx] + candidate_feature_list_5[idx+1:]
    training_features_5 = base_feature + input_candidates
    kappa, importance = new_feature_local_model_test(local_list, training_features_5, None, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print('eliminate ' + candidate_feature_list_5[idx] + ': kappa=' + str(best_kappa))
    else:
        print('eliminate ' + candidate_feature_list_5[idx] + ' skipped')

print('Finish!')
best_importance

eliminate ngram_1y_mean skipped
eliminate ngram_1y_max skipped
eliminate ngram_5y_mean skipped
eliminate ngram_5y_max skipped
eliminate ngram_20y_mean skipped
eliminate ngram_20y_max skipped
eliminate ngram_50y_mean skipped
eliminate ngram_50y_max skipped
eliminate ngram_100y_max skipped
eliminate ngram_200y_mean skipped
eliminate ngram_400y_mean skipped
eliminate ngram_400y_max skipped
eliminate ngram_500y_mean skipped
eliminate ngram_500y_max skipped
Finish!


In [20]:
training_features_5 = base_feature + candidate_feature_list_5
kappa, importance = new_feature_local_model_test(local_list, training_features_5, None, target)
kappa, importance

In [22]:
importance

Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.184621
5,minwordlength_x,0.12032
4,glosslength_normalised_x,0.116253
1,nrhypos_x,0.107713
8,ngram_1y_mean,0.050102
10,ngram_5y_mean,0.042889
11,ngram_5y_max,0.038203
7,polyscore_max_x,0.037909
19,ngram_400y_max,0.035455
21,ngram_500y_max,0.034278


## Selection for TransferModel

In [3]:
candidate_feature_list = ['ngram_1y_mean', 'ngram_1y_max',
                          'ngram_5y_mean', 'ngram_5y_max',
                          'ngram_10y_mean', 'ngram_10y_max',
                          'ngram_20y_mean', 'ngram_20y_max',
                          'ngram_50y_mean', 'ngram_50y_max',
                          'ngram_100y_mean', 'ngram_100y_max',
                          'ngram_200y_mean', 'ngram_200y_max',
                          'ngram_400y_mean', 'ngram_400y_max',
                          'ngram_500y_mean', 'ngram_500y_max']

In [20]:
# generate the local dataframe for different domains
local_fruit = data.loc[data['domain_x']=='fruit']
local_tool = data.loc[data['domain_x']=='tool']
local_music = data.loc[data['domain_x']=='music']
local_furniture = data.loc[data['domain_x']=='furn']
local_garments = data.loc[data['domain_x']=='garm']

local_list = [local_fruit, local_tool, local_music, local_furniture, local_garments]

# generate the unseen(transfer) dataframe for different domains
unseen_fruit = data.loc[data['domain_x']!='fruit']
unseen_tool = data.loc[data['domain_x']!='tool']
unseen_music = data.loc[data['domain_x']!='music']
unseen_furniture = data.loc[data['domain_x']!='furn']
unseen_garments = data.loc[data['domain_x']!='garm']

unseen_list = [unseen_fruit, unseen_tool, unseen_music, unseen_furniture, unseen_garments]

def new_feature_transfer_model_test(train_list, test_list, base_feature, new_feature, target):
    random_seed = 7 # R
    transfer_kappa_list = []
    # transfer_balancedAcc_list = []
    for counter in range(len(train_list)):
        training_data = train_list[counter].reset_index()
        testing_data = test_list[counter].reset_index()
        if new_feature is None:
            feature_list = base_feature
        else:
            feature_list = base_feature + [new_feature]
        X_train = training_data[feature_list]
        y_train = training_data[target]
        X_test = testing_data[feature_list]
        y_test = testing_data[target]

        # SMOTE algorithm
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # define random forest model
        rf = RandomForestClassifier(random_state=random_seed, max_features='sqrt', n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_depth=50, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)

        # predict and make score
        pipeline = make_pipeline(smote, rf)
        y_pred = pipeline.predict(X_test)

        kappa = cohen_kappa_score(y_test, y_pred)
        # balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

        transfer_kappa_list.append(kappa)
        # transfer_balancedAcc_list.append(balanced_accuracy)

    transfer_kappa = np.mean(transfer_kappa_list) #, np.mean(transfer_balancedAcc_list)

    # importance of features
    importance = rf.feature_importances_
    importance = pd.DataFrame([feature_list, importance]).transpose()
    importance = importance.rename(columns={0:'feature', 1:'importance'}).sort_values('importance', ascending=False)

    return transfer_kappa, importance

### Bottom-up

In [9]:
base_feature_0 = base_feature
candidate_feature_list_0 = candidate_feature_list

best_kappa = 0
best_importance = None

for candidate in candidate_feature_list_0:
    kappa, importance = new_feature_transfer_model_test(unseen_list, local_list, base_feature_0, candidate, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print(candidate + ': kappa=' + str(best_kappa))
    else:
        print(candidate + ' skipped')

print('Finish!')
best_importance

ngram_1y_mean: kappa=0.517128430230662
ngram_1y_max: kappa=0.5209915138998674
ngram_5y_mean skipped
ngram_5y_max skipped
ngram_10y_mean skipped
ngram_10y_max skipped
ngram_20y_mean: kappa=0.5291173660047429
ngram_20y_max skipped
ngram_50y_mean skipped
ngram_50y_max skipped
ngram_100y_mean skipped
ngram_100y_max skipped
ngram_200y_mean skipped
ngram_200y_max skipped
ngram_400y_mean skipped
ngram_400y_max skipped
ngram_500y_mean skipped
ngram_500y_max skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.430008
4,glosslength_normalised_x,0.163903
5,minwordlength_x,0.139732
8,ngram_20y_mean,0.073956
2,nrpartrels_normalised_x,0.069051
7,polyscore_max_x,0.052717
1,nrhypos_x,0.040303
6,nroflemmas_x,0.02767
0,nrdirhypers_x,0.002661


In [10]:
base_feature_1 = base_feature + ['ngram_20y_mean']
candidate_feature_list_0.remove('ngram_20y_mean')
candidate_feature_list_1 = candidate_feature_list_0

best_kappa = 0.52912
best_importance = None

for candidate in candidate_feature_list_1:
    kappa, importance = new_feature_transfer_model_test(unseen_list, local_list, base_feature_1, candidate, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print(candidate + ': kappa=' + str(best_kappa))
    else:
        print(candidate + ' skipped')

print('Finish!')
best_importance

ngram_1y_mean: kappa=0.5326302374414598
ngram_1y_max skipped
ngram_5y_mean skipped
ngram_5y_max: kappa=0.5447861909550551
ngram_10y_mean skipped
ngram_10y_max skipped
ngram_20y_max skipped
ngram_50y_mean skipped
ngram_50y_max skipped
ngram_100y_mean skipped
ngram_100y_max skipped
ngram_200y_mean skipped
ngram_200y_max skipped
ngram_400y_mean skipped
ngram_400y_max: kappa=0.5472816259425761
ngram_500y_mean skipped
ngram_500y_max: kappa=0.5550029583470298
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.389121
4,glosslength_normalised_x,0.152968
5,minwordlength_x,0.129752
9,ngram_500y_max,0.0899
2,nrpartrels_normalised_x,0.067162
8,ngram_20y_mean,0.05662
7,polyscore_max_x,0.047386
1,nrhypos_x,0.037951
6,nroflemmas_x,0.02669
0,nrdirhypers_x,0.002449


In [16]:
base_feature_2 = base_feature + ['ngram_20y_mean', 'ngram_500y_max']
candidate_feature_list_1.remove('ngram_500y_max')
candidate_feature_list_2 = candidate_feature_list_1

best_kappa = 0.55500
best_importance = None

for candidate in candidate_feature_list_2:
    kappa, importance = new_feature_transfer_model_test(unseen_list, local_list, base_feature_2, candidate, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print(candidate + ': kappa=' + str(best_kappa))
    else:
        print(candidate + ' skipped')

print('Finish!')
best_importance

ngram_1y_mean: kappa=0.5649742509642828
ngram_1y_max: kappa=0.5669489134087916
ngram_5y_mean: kappa=0.568112495259719
ngram_5y_max skipped
ngram_10y_mean: kappa=0.5711109974680693
ngram_10y_max skipped
ngram_20y_max skipped
ngram_50y_mean: kappa=0.5724325009051144
ngram_50y_max skipped
ngram_100y_mean skipped
ngram_100y_max skipped
ngram_200y_mean skipped
ngram_200y_max skipped
ngram_400y_mean skipped
ngram_400y_max skipped
ngram_500y_mean skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.372388
4,glosslength_normalised_x,0.150071
5,minwordlength_x,0.124423
9,ngram_500y_max,0.086107
2,nrpartrels_normalised_x,0.067835
10,ngram_50y_mean,0.047381
7,polyscore_max_x,0.044868
8,ngram_20y_mean,0.044585
1,nrhypos_x,0.034223
6,nroflemmas_x,0.025774


In [17]:
base_feature_3 = base_feature + ['ngram_20y_mean', 'ngram_500y_max', 'ngram_50y_mean']
candidate_feature_list_2.remove('ngram_50y_mean')
candidate_feature_list_3 = candidate_feature_list_2

best_kappa = 0.57243
best_importance = None

for candidate in candidate_feature_list_3:
    kappa, importance = new_feature_transfer_model_test(unseen_list, local_list, base_feature_3, candidate, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print(candidate + ': kappa=' + str(best_kappa))
    else:
        print(candidate + ' skipped')

print('Finish!')
best_importance

ngram_1y_mean: kappa=0.575006597466071
ngram_1y_max skipped
ngram_5y_mean skipped
ngram_5y_max skipped
ngram_10y_mean skipped
ngram_10y_max: kappa=0.5778146359717877
ngram_20y_max skipped
ngram_50y_max skipped
ngram_100y_mean: kappa=0.5882386158426376
ngram_100y_max skipped
ngram_200y_mean skipped
ngram_200y_max skipped
ngram_400y_mean skipped
ngram_400y_max skipped
ngram_500y_mean skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.358703
4,glosslength_normalised_x,0.148257
5,minwordlength_x,0.120077
9,ngram_500y_max,0.084109
2,nrpartrels_normalised_x,0.067515
7,polyscore_max_x,0.043433
11,ngram_100y_mean,0.039635
10,ngram_50y_mean,0.039492
8,ngram_20y_mean,0.038272
1,nrhypos_x,0.03214


In [18]:
base_feature_4 = base_feature + ['ngram_20y_mean', 'ngram_500y_max', 'ngram_50y_mean', 'ngram_100y_mean']
candidate_feature_list_3.remove('ngram_100y_mean')
candidate_feature_list_4 = candidate_feature_list_3

best_kappa = 0.58824
best_importance = None

for candidate in candidate_feature_list_4:
    kappa, importance = new_feature_transfer_model_test(unseen_list, local_list, base_feature_4, candidate, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print(candidate + ': kappa=' + str(best_kappa))
    else:
        print(candidate + ' skipped')

print('Finish!')
best_importance

ngram_1y_mean skipped
ngram_1y_max skipped
ngram_5y_mean skipped
ngram_5y_max skipped
ngram_10y_mean skipped
ngram_10y_max skipped
ngram_20y_max skipped
ngram_50y_max skipped
ngram_100y_max skipped
ngram_200y_mean skipped
ngram_200y_max skipped
ngram_400y_mean skipped
ngram_400y_max skipped
ngram_500y_mean skipped
Finish!


### Top-down

In [49]:
candidate_feature_list = ['ngram_1y_mean', 'ngram_1y_max',
                          'ngram_5y_mean', 'ngram_5y_max',
                          'ngram_10y_mean', 'ngram_10y_max',
                          'ngram_20y_mean', 'ngram_20y_max',
                          'ngram_50y_mean', 'ngram_50y_max',
                          'ngram_100y_mean', 'ngram_100y_max',
                          'ngram_200y_mean', 'ngram_200y_max',
                          'ngram_400y_mean', 'ngram_400y_max',
                          'ngram_500y_mean', 'ngram_500y_max']

In [42]:
training_features_0 = base_feature + candidate_feature_list

best_kappa, importance = new_feature_transfer_model_test(unseen_list, local_list, training_features_0, None, target)
print('kappa=' + str(best_kappa))
importance

kappa=0.5454277398666727


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.327384
4,glosslength_normalised_x,0.123668
5,minwordlength_x,0.096627
2,nrpartrels_normalised_x,0.062742
25,ngram_500y_max,0.057736
7,polyscore_max_x,0.033729
23,ngram_400y_max,0.031416
1,nrhypos_x,0.024179
19,ngram_100y_max,0.023001
21,ngram_200y_max,0.022751


In [43]:
candidate_feature_list_1 = candidate_feature_list

best_kappa = 0.54543
best_importance = None
for idx in range(len(candidate_feature_list_1)):
    input_candidates = candidate_feature_list_1[0:idx] + candidate_feature_list_1[idx+1:]
    training_features_1 = base_feature + input_candidates
    kappa, importance = new_feature_transfer_model_test(unseen_list, local_list, training_features_1, None, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print('eliminate ' + candidate_feature_list_1[idx] + ': kappa=' + str(best_kappa))
    else:
        print('eliminate ' + candidate_feature_list_1[idx] + ' skipped')

print('Finish!')
best_importance

eliminate ngram_1y_mean: kappa=0.5607984344501324
eliminate ngram_1y_max skipped
eliminate ngram_5y_mean skipped
eliminate ngram_5y_max skipped
eliminate ngram_10y_mean skipped
eliminate ngram_10y_max skipped
eliminate ngram_20y_mean skipped
eliminate ngram_20y_max skipped
eliminate ngram_50y_mean skipped
eliminate ngram_50y_max skipped
eliminate ngram_100y_mean skipped
eliminate ngram_100y_max skipped
eliminate ngram_200y_mean skipped
eliminate ngram_200y_max skipped
eliminate ngram_400y_mean skipped
eliminate ngram_400y_max skipped
eliminate ngram_500y_mean skipped
eliminate ngram_500y_max skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.320249
4,glosslength_normalised_x,0.126163
5,minwordlength_x,0.099407
2,nrpartrels_normalised_x,0.06474
24,ngram_500y_max,0.056364
7,polyscore_max_x,0.034213
22,ngram_400y_max,0.033618
18,ngram_100y_max,0.024503
1,nrhypos_x,0.023866
20,ngram_200y_max,0.021981


In [44]:
candidate_feature_list_1.remove('ngram_1y_mean')
candidate_feature_list_2 = candidate_feature_list_1

best_kappa = 0.56080
best_importance = None

for idx in range(len(candidate_feature_list_2)):
    input_candidates = candidate_feature_list_2[0:idx] + candidate_feature_list_2[idx+1:]
    training_features_2 = base_feature + input_candidates
    kappa, importance = new_feature_transfer_model_test(unseen_list, local_list, training_features_2, None, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print('eliminate ' + candidate_feature_list_2[idx] + ': kappa=' + str(best_kappa))
    else:
        print('eliminate ' + candidate_feature_list_2[idx] + ' skipped')

print('Finish!')
best_importance

eliminate ngram_1y_max skipped
eliminate ngram_5y_mean skipped
eliminate ngram_5y_max skipped
eliminate ngram_10y_mean skipped
eliminate ngram_10y_max skipped
eliminate ngram_20y_mean skipped
eliminate ngram_20y_max skipped
eliminate ngram_50y_mean skipped
eliminate ngram_50y_max: kappa=0.5638420936580545
eliminate ngram_100y_mean skipped
eliminate ngram_100y_max: kappa=0.565434553015804
eliminate ngram_200y_mean skipped
eliminate ngram_200y_max: kappa=0.5661291486650522
eliminate ngram_400y_mean skipped
eliminate ngram_400y_max skipped
eliminate ngram_500y_mean skipped
eliminate ngram_500y_max skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.303483
4,glosslength_normalised_x,0.125231
5,minwordlength_x,0.096131
2,nrpartrels_normalised_x,0.066593
23,ngram_500y_max,0.060014
7,polyscore_max_x,0.036545
21,ngram_400y_max,0.036499
18,ngram_100y_max,0.024823
1,nrhypos_x,0.024126
16,ngram_50y_max,0.022817


In [45]:
candidate_feature_list_2.remove('ngram_200y_max')
candidate_feature_list_3 = candidate_feature_list_2

best_kappa = 0.56613
best_importance = None

for idx in range(len(candidate_feature_list_3)):
    input_candidates = candidate_feature_list_3[0:idx] + candidate_feature_list_3[idx+1:]
    training_features_3 = base_feature + input_candidates
    kappa, importance = new_feature_transfer_model_test(unseen_list, local_list, training_features_3, None, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print('eliminate ' + candidate_feature_list_3[idx] + ': kappa=' + str(best_kappa))
    else:
        print('eliminate ' + candidate_feature_list_3[idx] + ' skipped')

print('Finish!')
best_importance

eliminate ngram_1y_max skipped
eliminate ngram_5y_mean skipped
eliminate ngram_5y_max skipped
eliminate ngram_10y_mean skipped
eliminate ngram_10y_max skipped
eliminate ngram_20y_mean skipped
eliminate ngram_20y_max: kappa=0.5673287516952442
eliminate ngram_50y_mean skipped
eliminate ngram_50y_max skipped
eliminate ngram_100y_mean skipped
eliminate ngram_100y_max skipped
eliminate ngram_200y_mean skipped
eliminate ngram_400y_mean skipped
eliminate ngram_400y_max skipped
eliminate ngram_500y_mean skipped
eliminate ngram_500y_max skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.308986
4,glosslength_normalised_x,0.128062
5,minwordlength_x,0.096385
2,nrpartrels_normalised_x,0.063676
22,ngram_500y_max,0.063057
20,ngram_400y_max,0.037596
7,polyscore_max_x,0.036322
17,ngram_100y_max,0.028247
1,nrhypos_x,0.024926
15,ngram_50y_max,0.023527


In [51]:
candidate_feature_list_3.remove('ngram_20y_max')
candidate_feature_list_4 = candidate_feature_list_3

best_kappa = 0.56733
best_importance = None

for idx in range(len(candidate_feature_list_4)):
    input_candidates = candidate_feature_list_4[0:idx] + candidate_feature_list_4[idx+1:]
    training_features_4 = base_feature + input_candidates
    kappa, importance = new_feature_transfer_model_test(unseen_list, local_list, training_features_4, None, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print('eliminate ' + candidate_feature_list_4[idx] + ': kappa=' + str(best_kappa))
    else:
        print('eliminate ' + candidate_feature_list_4[idx] + ' skipped')

print('Finish!')
best_importance

eliminate ngram_1y_max skipped
eliminate ngram_5y_mean skipped
eliminate ngram_5y_max skipped
eliminate ngram_10y_mean skipped
eliminate ngram_10y_max: kappa=0.5677018550015113
eliminate ngram_20y_mean skipped
eliminate ngram_50y_mean skipped
eliminate ngram_50y_max skipped
eliminate ngram_100y_mean skipped
eliminate ngram_100y_max skipped
eliminate ngram_200y_mean skipped
eliminate ngram_400y_mean skipped
eliminate ngram_400y_max skipped
eliminate ngram_500y_mean skipped
eliminate ngram_500y_max skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.316314
4,glosslength_normalised_x,0.12769
5,minwordlength_x,0.103259
2,nrpartrels_normalised_x,0.063286
21,ngram_500y_max,0.062404
19,ngram_400y_max,0.035758
7,polyscore_max_x,0.035731
16,ngram_100y_max,0.028829
14,ngram_50y_max,0.025003
1,nrhypos_x,0.024197


In [52]:
candidate_feature_list_4.remove('ngram_10y_max')
candidate_feature_list_5 = candidate_feature_list_4

best_kappa = 0.56770
best_importance = None

for idx in range(len(candidate_feature_list_5)):
    input_candidates = candidate_feature_list_5[0:idx] + candidate_feature_list_5[idx+1:]
    training_features_5 = base_feature + input_candidates
    kappa, importance = new_feature_transfer_model_test(unseen_list, local_list, training_features_5, None, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print('eliminate ' + candidate_feature_list_5[idx] + ': kappa=' + str(best_kappa))
    else:
        print('eliminate ' + candidate_feature_list_5[idx] + ' skipped')

print('Finish!')
best_importance

eliminate ngram_1y_max skipped
eliminate ngram_5y_mean skipped
eliminate ngram_5y_max skipped
eliminate ngram_10y_mean skipped
eliminate ngram_20y_mean skipped
eliminate ngram_50y_mean skipped
eliminate ngram_50y_max skipped
eliminate ngram_100y_mean skipped
eliminate ngram_100y_max skipped
eliminate ngram_200y_mean: kappa=0.5791685392334059
eliminate ngram_400y_mean skipped
eliminate ngram_400y_max skipped
eliminate ngram_500y_mean skipped
eliminate ngram_500y_max skipped
Finish!


Unnamed: 0,feature,importance
3,depthfromtopsynset_normalised_x,0.320935
4,glosslength_normalised_x,0.125125
5,minwordlength_x,0.10098
2,nrpartrels_normalised_x,0.066373
20,ngram_500y_max,0.062737
7,polyscore_max_x,0.037328
18,ngram_400y_max,0.036271
16,ngram_100y_max,0.028851
14,ngram_50y_max,0.025109
1,nrhypos_x,0.024781


In [54]:
candidate_feature_list_5.remove('ngram_200y_mean')
candidate_feature_list_6 = candidate_feature_list_5

best_kappa = 0.57917
best_importance = None

for idx in range(len(candidate_feature_list_6)):
    input_candidates = candidate_feature_list_6[0:idx] + candidate_feature_list_6[idx+1:]
    training_features_6 = base_feature + input_candidates
    kappa, importance = new_feature_transfer_model_test(unseen_list, local_list, training_features_6, None, target)
    if kappa > best_kappa:
        best_kappa = kappa
        best_importance = importance
        print('eliminate ' + candidate_feature_list_6[idx] + ': kappa=' + str(best_kappa))
    else:
        print('eliminate ' + candidate_feature_list_6[idx] + ' skipped')

print('Finish!')
best_importance

eliminate ngram_1y_max skipped
eliminate ngram_5y_mean skipped
eliminate ngram_5y_max skipped
eliminate ngram_10y_mean skipped
eliminate ngram_20y_mean skipped
eliminate ngram_50y_mean skipped
eliminate ngram_50y_max skipped
eliminate ngram_100y_mean skipped
eliminate ngram_100y_max skipped
eliminate ngram_400y_mean skipped
eliminate ngram_400y_max skipped
eliminate ngram_500y_mean skipped
eliminate ngram_500y_max skipped
Finish!


### Grid search
run in Python script

In [None]:
grid_base = base_feature
grid_candidate = candidate_feature_list

best_kappa = 0
best_importance = None

print('Start...')
for candidate_num in range(len(grid_candidate)):
    candidate_num += 1
    candidate_sets = list(combinations(candidate_feature_list, candidate_num))
    for candidate_group in candidate_sets:
        candidate_group = list(candidate_group)
        training_features = grid_base + candidate_group
        kappa, importance = new_feature_transfer_model_test(unseen_list, local_list, training_features, None, target)
        if kappa > best_kappa:
            best_kappa = kappa
            best_importance = importance
            string = 'Features: ' + str(candidate_group) + ' kappa=' + str(best_kappa) + '\n'
            with open('./grid_search_TransferModel.txt', 'a+') as f:
                f.write(string)
            print(string)
with open('./grid_search_TransferModel.txt', 'a+') as f:
    f.write(importance)
print('Finish!')