In [1]:
import os
import warnings

import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings("ignore")

In [3]:
# Read the feature table from CSV and show it.
NGRAM_FEATURES_CSV = './features_google_ngram.csv'
ngram_dataset = pd.read_csv(NGRAM_FEATURES_CSV, index_col=None)
ngram_dataset

Unnamed: 0,Synsets,domain_x,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,polyscore_max_x,...,ngram_50y_mean,ngram_50y_max,ngram_100y_mean,ngram_100y_max,ngram_200y_mean,ngram_200y_max,ngram_400y_mean,ngram_400y_max,ngram_500y_mean,ngram_500y_max
0,Synset('adjustable_wrench.n.01'),tool,1,7,0.0,1.012903,0.563173,17,2,1,...,2.667275e-08,5.067315e-08,2.252890e-08,5.067315e-08,1.380492e-08,5.067315e-08,6.919673e-09,5.067315e-08,5.336133e-09,5.067315e-08
1,Synset('allen_wrench.n.01'),tool,1,0,0.0,1.012903,0.391092,12,1,1,...,2.025710e-08,6.483798e-08,1.515313e-08,6.483798e-08,7.715051e-09,6.483798e-08,3.867145e-09,6.483798e-08,2.982164e-09,6.483798e-08
2,Synset('alligator_wrench.n.01'),tool,1,0,0.0,1.012903,1.517437,16,1,1,...,1.955061e-10,5.970550e-10,6.184165e-10,7.804026e-09,1.237592e-09,1.499339e-08,6.203393e-10,1.499339e-08,4.783770e-10,1.499339e-08
3,Synset('awl.n.01'),tool,1,2,15.7,0.911613,0.985552,3,1,1,...,2.895038e-07,3.752697e-07,3.562540e-07,6.002532e-07,5.869817e-07,2.510665e-06,6.210211e-07,6.602901e-06,5.069033e-07,6.602901e-06
4,Synset('backsaw.n.01'),tool,1,0,0.0,1.114194,1.110701,7,2,1,...,1.620242e-08,2.446695e-08,1.704462e-08,2.446695e-08,1.451245e-08,3.507272e-08,9.018908e-09,2.260505e-07,6.954965e-09,2.260505e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,Synset('ballet_skirt.n.01'),garm,1,0,0.0,0.947552,0.578283,4,2,2,...,4.059229e-07,8.004256e-07,2.485961e-07,8.004256e-07,1.531570e-07,8.004256e-07,1.136149e-07,8.004256e-07,9.223624e-08,1.686392e-06
835,Synset('mess_jacket.n.01'),garm,1,0,0.0,1.158120,1.652238,11,3,1,...,9.969528e-09,8.272560e-09,1.233276e-08,2.136213e-08,1.358353e-08,5.083992e-08,6.963715e-09,5.547362e-08,5.370096e-09,5.547362e-08
836,Synset('long_johns.n.01'),garm,1,0,0.0,1.052836,0.479149,10,1,1,...,4.974292e-08,1.103470e-07,2.680743e-08,1.103470e-07,1.393969e-08,1.103470e-07,6.987224e-09,1.103470e-07,5.388225e-09,1.103470e-07
837,Synset('undies.n.01'),garm,1,0,0.0,1.158120,0.280880,6,1,1,...,7.986401e-08,2.565946e-07,5.787213e-08,2.565946e-07,3.095668e-08,2.565946e-07,2.025520e-08,3.896998e-07,1.561987e-08,3.896998e-07


In [4]:
# Base feature columns shared by every experiment; one n-gram column is
# appended to this list per run.
features_base = [
    'nrdirhypers_x',
    'nrhypos_x',
    'nrpartrels_normalised_x',
    'depthfromtopsynset_normalised_x',
    'glosslength_normalised_x',
    'minwordlength_x',
    'nroflemmas_x',
    'polyscore_max_x',
]

# Target column, kept as a one-element list; original note on its values: nb / b
target = ['vote_x']

## GlobalModel Test


In [4]:
# Evaluate one feature set with stratified K-Fold cross-validation.
def global_model_test(dataset, feature, year_period, target):
    """Cross-validate a SMOTE + random-forest classifier on one dataset.

    The rows are split with a shuffled, stratified 10-fold scheme. In every
    fold the training part is oversampled with SMOTE, a random forest is
    fitted on the resampled data, and the forest is scored on the untouched
    test part.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the feature columns and the target column.
    feature : list of str
        Base feature column names.
    year_period : str
        Name of the single extra column appended to ``feature``.
    target : str or list of str
        Target column. A one-element list (as used in this notebook) is
        accepted and reduced to a 1-D label vector.

    Returns
    -------
    list
        ``[mean Cohen's kappa, mean balanced accuracy]`` over the folds.

    Raises
    ------
    ValueError
        If ``target`` selects more than one column.
    """
    n_folds = 10
    random_seed = 7

    # drop=True: the old index is not a feature and must not become a column.
    data = dataset.reset_index(drop=True)
    feature_list = list(feature) + [year_period]
    X = data[feature_list]

    # The splitter, sampler and classifier all expect 1-D labels; selecting
    # with a list yields a one-column frame, so reduce it to a Series.
    y = data[target]
    if isinstance(y, pd.DataFrame):
        if y.shape[1] != 1:
            raise ValueError('exactly one target column is expected')
        y = y.iloc[:, 0]

    k_fold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    cohen_kappa = []
    balanced_acc = []
    for train_index, test_index in k_fold.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Oversample the training fold only; the test fold keeps its
        # original class distribution.
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        rf = RandomForestClassifier(
            random_state=random_seed,
            max_features='sqrt',
            n_estimators=1400,
            min_samples_split=2,
            min_samples_leaf=1,
            max_depth=50,
            oob_score=True,
            criterion='gini',
            bootstrap=True,
        ).fit(X_train, y_train)

        # Predict with the fitted forest directly. SMOTE is a training-time
        # resampler, so the test fold is passed to the classifier as is.
        y_pred = rf.predict(X_test)

        cohen_kappa.append(cohen_kappa_score(y_test, y_pred))
        balanced_acc.append(balanced_accuracy_score(y_test, y_pred))

    return [np.mean(cohen_kappa), np.mean(balanced_acc)]

In [5]:
# Single run of the global model with the 10-year mean n-gram feature.
global_model_test(
    dataset=ngram_dataset,
    feature=features_base,
    year_period='ngram_10y_mean',
    target=target,
)

[0.6889932540972804, 0.8458398737152029]

In [9]:
# n-gram feature columns, ordered by period with the 'mean' column
# immediately followed by the 'max' column of the same period.
ngram_feature_list = [
    f'ngram_{years}y_{aggregate}'
    for years in (1, 5, 10, 20, 50, 100, 200, 400, 500)
    for aggregate in ('mean', 'max')
]

In [None]:
# Run the global model once per n-gram feature and collect both scores.
# Results are grouped by the aggregation suffix of the column name
# ('mean' / 'max') rather than by the position of the name in the list, so
# reordering ngram_feature_list cannot silently mislabel a score.
result_kappa_dict = {'mean': [], 'max': []}
result_balancedAcc_dict = {'mean': [], 'max': []}
for ngram_feature in ngram_feature_list:
    result_list = global_model_test(ngram_dataset, features_base, ngram_feature, target)
    aggregation = ngram_feature.rsplit('_', 1)[-1]  # 'mean' or 'max'
    result_kappa_dict[aggregation].append(result_list[0])
    result_balancedAcc_dict[aggregation].append(result_list[1])

# np.save does not create missing directories.
os.makedirs('./resultDict', exist_ok=True)
np.save('./resultDict/GlobalModel_Test_kappa.npy', result_kappa_dict, allow_pickle=True)
np.save('./resultDict/GlobalModel_Test_balanced_acc.npy', result_balancedAcc_dict, allow_pickle=True)

In [16]:
# Show the mean Cohen's kappa per n-gram feature from the global sweep,
# grouped under the 'mean' and 'max' keys.
result_kappa_dict

{'mean': [0.6895131395336889,
  0.6819311995908929,
  0.6889932540972804,
  0.6894450582550637,
  0.6792460904959001,
  0.670646830216902,
  0.6832672806178925,
  0.6741536864265807,
  0.6821740987639083],
 'max': [0.6863100162923843,
  0.6974403198307233,
  0.6935090842995564,
  0.6924517230386286,
  0.6897699544263008,
  0.6996241732892688,
  0.6958277302930723,
  0.6831921751429977,
  0.6863755367076694]}

In [18]:
# Read the saved kappa results back from disk and unwrap the stored dict.
# NOTE: allow_pickle=True unpickles the file contents, so only load files
# written by this notebook.
saved_global_kappa = np.load('./resultDict/GlobalModel_Test_kappa.npy', allow_pickle=True)
saved_global_kappa.item()

{'mean': [0.6895131395336889,
  0.6819311995908929,
  0.6889932540972804,
  0.6894450582550637,
  0.6792460904959001,
  0.670646830216902,
  0.6832672806178925,
  0.6741536864265807,
  0.6821740987639083],
 'max': [0.6863100162923843,
  0.6974403198307233,
  0.6935090842995564,
  0.6924517230386286,
  0.6897699544263008,
  0.6996241732892688,
  0.6958277302930723,
  0.6831921751429977,
  0.6863755367076694]}

## LocalModel Test

In [6]:
# One sub-frame per domain, selected on the 'domain_x' column.
domain_column = ngram_dataset['domain_x']
local_fruit = ngram_dataset[domain_column.eq('fruit')]
local_tool = ngram_dataset[domain_column.eq('tool')]
local_music = ngram_dataset[domain_column.eq('music')]
local_furniture = ngram_dataset[domain_column.eq('furn')]
local_garments = ngram_dataset[domain_column.eq('garm')]

# Order matters: unseen_list further down is paired with this list by position.
local_list = [
    local_fruit,
    local_tool,
    local_music,
    local_furniture,
    local_garments,
]

# Average the per-dataset cross-validation scores.
def local_model_test(dataset_list, feature, year_period, target):
    """Run ``global_model_test`` on each dataset and average the scores.

    Parameters
    ----------
    dataset_list : list of pandas.DataFrame
        Datasets to evaluate independently (one per domain in this notebook).
    feature : list of str
        Base feature column names.
    year_period : str
        Name of the extra n-gram feature column.
    target : list of str
        Target column selector, passed through unchanged.

    Returns
    -------
    list
        ``[mean kappa, mean balanced accuracy]`` across the datasets.
    """
    per_dataset_scores = [
        global_model_test(domain_frame, feature, year_period, target)
        for domain_frame in dataset_list
    ]
    kappa_scores = [scores[0] for scores in per_dataset_scores]
    balanced_acc_scores = [scores[1] for scores in per_dataset_scores]
    return [np.mean(kappa_scores), np.mean(balanced_acc_scores)]

In [13]:
# Single run of the local models with the 10-year mean n-gram feature.
local_model_test(
    dataset_list=local_list,
    feature=features_base,
    year_period='ngram_10y_mean',
    target=target,
)

[0.6709395758983121, 0.8410339954163485]

In [None]:
# Run the local (per-domain) models once per n-gram feature and collect both
# scores. Results are grouped by the aggregation suffix of the column name
# ('mean' / 'max') rather than by the position of the name in the list.
result_local_kappa_dict = {'mean': [], 'max': []}
result_local_balancedAcc_dict = {'mean': [], 'max': []}
for ngram_feature in ngram_feature_list:
    result_list = local_model_test(local_list, features_base, ngram_feature, target)
    aggregation = ngram_feature.rsplit('_', 1)[-1]  # 'mean' or 'max'
    result_local_kappa_dict[aggregation].append(result_list[0])
    result_local_balancedAcc_dict[aggregation].append(result_list[1])

# np.save does not create missing directories.
os.makedirs('./resultDict', exist_ok=True)
# File name fixed from 'LocallModel_...' so it matches the other result files.
np.save('./resultDict/LocalModel_Test_kappa.npy', result_local_kappa_dict, allow_pickle=True)
np.save('./resultDict/LocalModel_Test_balanced_acc.npy', result_local_balancedAcc_dict, allow_pickle=True)

In [20]:
# Show the domain-averaged Cohen's kappa per n-gram feature from the local
# sweep, grouped under the 'mean' and 'max' keys.
result_local_kappa_dict

{'mean': [0.6567146798792984,
  0.6724061180707366,
  0.6709395758983121,
  0.6798979095646819,
  0.6715040349208072,
  0.6774458425156835,
  0.6642376222592066,
  0.6646532349454074,
  0.6620570810992537],
 'max': [0.6567146798792984,
  0.6563639250405917,
  0.6712321684909046,
  0.6689357328525052,
  0.6732312560304884,
  0.6700983550264065,
  0.6666929725206334,
  0.664470135293054,
  0.6747046841570868]}

## TransferModel Test

In [7]:
# Leave-one-domain-out training frames: each holds every row whose
# 'domain_x' differs from the named domain.
domain_labels = ngram_dataset['domain_x']
unseen_fruit = ngram_dataset[domain_labels.ne('fruit')]
unseen_tool = ngram_dataset[domain_labels.ne('tool')]
unseen_music = ngram_dataset[domain_labels.ne('music')]
unseen_furniture = ngram_dataset[domain_labels.ne('furn')]
unseen_garments = ngram_dataset[domain_labels.ne('garm')]

# Same domain order as local_list, so the two lists pair up by position.
unseen_list = [
    unseen_fruit,
    unseen_tool,
    unseen_music,
    unseen_furniture,
    unseen_garments,
]

def transfer_model_test(train_list, test_list, feature, year_period, target):
    """Train on one frame, evaluate on its paired frame, and average over pairs.

    For every position ``i`` a SMOTE-oversampled random forest is fitted on
    ``train_list[i]`` and scored on ``test_list[i]``.

    Parameters
    ----------
    train_list : list of pandas.DataFrame
        Training frames.
    test_list : list of pandas.DataFrame
        Evaluation frames, paired with ``train_list`` by position.
    feature : list of str
        Base feature column names.
    year_period : str
        Name of the single extra column appended to ``feature``.
    target : str or list of str
        Target column. A one-element list (as used in this notebook) is
        accepted and reduced to a 1-D label vector.

    Returns
    -------
    list
        ``[mean Cohen's kappa, mean balanced accuracy]`` over all pairs.

    Raises
    ------
    ValueError
        If the two lists differ in length, or ``target`` selects more than
        one column.
    """
    if len(train_list) != len(test_list):
        raise ValueError('train_list and test_list must have the same length')

    random_seed = 7
    feature_list = list(feature) + [year_period]

    def _split_xy(frame):
        # drop=True: the old index is not a feature and must not become a column.
        data = frame.reset_index(drop=True)
        labels = data[target]
        # The sampler and classifier expect 1-D labels; selecting with a
        # list yields a one-column frame, so reduce it to a Series.
        if isinstance(labels, pd.DataFrame):
            if labels.shape[1] != 1:
                raise ValueError('exactly one target column is expected')
            labels = labels.iloc[:, 0]
        return data[feature_list], labels

    transfer_kappa_list = []
    transfer_balancedAcc_list = []
    for train_frame, test_frame in zip(train_list, test_list):
        X_train, y_train = _split_xy(train_frame)
        X_test, y_test = _split_xy(test_frame)

        # Oversample the training data only; the evaluation frame keeps its
        # original class distribution.
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        rf = RandomForestClassifier(
            random_state=random_seed,
            max_features='sqrt',
            n_estimators=1400,
            min_samples_split=2,
            min_samples_leaf=1,
            max_depth=50,
            oob_score=True,
            criterion='gini',
            bootstrap=True,
        ).fit(X_train, y_train)

        # Predict with the fitted forest directly. SMOTE is a training-time
        # resampler, so the evaluation frame is passed to the classifier as is.
        y_pred = rf.predict(X_test)

        transfer_kappa_list.append(cohen_kappa_score(y_test, y_pred))
        transfer_balancedAcc_list.append(balanced_accuracy_score(y_test, y_pred))

    return [np.mean(transfer_kappa_list), np.mean(transfer_balancedAcc_list)]

In [9]:
# Single transfer run with the 10-year mean n-gram feature: train on each
# frame of unseen_list, evaluate on the frame at the same position in local_list.
transfer_model_test(
    train_list=unseen_list,
    test_list=local_list,
    feature=features_base,
    year_period='ngram_10y_mean',
    target=target,
)

[0.5143881523591414, 0.7994906201379257]

In [2]:
# Run the transfer models once per n-gram feature and collect both scores.
# Requires ngram_feature_list, unseen_list and local_list from earlier cells
# to be defined in the current kernel.
# Results are grouped by the aggregation suffix of the column name
# ('mean' / 'max') rather than by the position of the name in the list.
result_transfer_kappa_dict = {'mean': [], 'max': []}
result_transfer_balancedAcc_dict = {'mean': [], 'max': []}
for ngram_feature in ngram_feature_list:
    result_list = transfer_model_test(unseen_list, local_list, features_base, ngram_feature, target)
    aggregation = ngram_feature.rsplit('_', 1)[-1]  # 'mean' or 'max'
    result_transfer_kappa_dict[aggregation].append(result_list[0])
    result_transfer_balancedAcc_dict[aggregation].append(result_list[1])

# np.save does not create missing directories.
os.makedirs('./resultDict', exist_ok=True)
# Save the transfer results (previously the local-model dicts were written
# to these TransferModel files by mistake).
np.save('./resultDict/TransferModel_Test_kappa.npy', result_transfer_kappa_dict, allow_pickle=True)
np.save('./resultDict/TransferModel_Test_balanced_acc.npy', result_transfer_balancedAcc_dict, allow_pickle=True)

NameError: name 'ngram_feature_list' is not defined

In [12]:
# Write both transfer-model result dicts to their .npy files.
for transfer_path, transfer_scores in (
    ('./resultDict/TransferModel_Test_kappa.npy', result_transfer_kappa_dict),
    ('./resultDict/TransferModel_Test_balanced_acc.npy', result_transfer_balancedAcc_dict),
):
    np.save(transfer_path, transfer_scores, allow_pickle=True)

In [14]:
# Show the pair-averaged balanced accuracy per n-gram feature from the
# transfer sweep, grouped under the 'mean' and 'max' keys.
result_transfer_balancedAcc_dict

{'mean': [0.8012823577114185,
  0.7994100172858866,
  0.7994906201379257,
  0.8090529575073312,
  0.8070534017778341,
  0.8096072322785901,
  0.8015649334468173,
  0.7968861624641281,
  0.8005593258284028],
 'max': [0.8029526850251582,
  0.7998985804659886,
  0.7978959710769363,
  0.8038465068312959,
  0.802898041832595,
  0.807515347964346,
  0.8029281008817364,
  0.7986516512985293,
  0.7949540599272693]}