In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score
from itertools import combinations

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the pre-processed feature table (the "all agreed" data set).
structral_data = pd.read_csv('../Google_Ngrams/features_google_ngram.csv', index_col=None)
# Disabled: BNC corpus-size feature, kept for reference only.
# bnc_data = pd.read_csv('../corpora_size/size_differential_features.csv', index_col=None)['bnc_100m_sum']/10000000000

# Predictor columns shared by every model evaluated in this notebook.
base_feature = [
    'nrdirhypers_x',
    'nrhypos_x',
    'nrpartrels_normalised_x',
    'depthfromtopsynset_normalised_x',
    'glosslength_normalised_x',
    'minwordlength_x',
    'nroflemmas_x',
    'polyscore_max_x',
]

# Label column, kept as a one-element list so it can be used for column selection.
target = ['vote_x']

# Working table: identifier and domain first, then the predictors, then the label.
data = structral_data[['Synsets', 'domain_x'] + base_feature + target]

data

Unnamed: 0,Synsets,domain_x,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,polyscore_max_x,vote_x
0,Synset('adjustable_wrench.n.01'),tool,1,7,0.0,1.012903,0.563173,17,2,1,nb
1,Synset('allen_wrench.n.01'),tool,1,0,0.0,1.012903,0.391092,12,1,1,nb
2,Synset('alligator_wrench.n.01'),tool,1,0,0.0,1.012903,1.517437,16,1,1,nb
3,Synset('awl.n.01'),tool,1,2,15.7,0.911613,0.985552,3,1,1,b
4,Synset('backsaw.n.01'),tool,1,0,0.0,1.114194,1.110701,7,2,1,nb
...,...,...,...,...,...,...,...,...,...,...,...
834,Synset('ballet_skirt.n.01'),garm,1,0,0.0,0.947552,0.578283,4,2,2,nb
835,Synset('mess_jacket.n.01'),garm,1,0,0.0,1.158120,1.652238,11,3,1,nb
836,Synset('long_johns.n.01'),garm,1,0,0.0,1.052836,0.479149,10,1,1,nb
837,Synset('undies.n.01'),garm,1,0,0.0,1.158120,0.280880,6,1,1,nb


## GlobalModel Test

In [5]:
def global_model_test(dataset, feature, target, n_splits=10, random_seed=7):
    """Cross-validate a single random forest on the whole (pooled) dataset.

    The rows of ``dataset`` are split into ``n_splits`` stratified, shuffled
    folds. For each fold the training part is oversampled with SMOTE, a random
    forest is fitted on the oversampled data, and the forest is scored on the
    untouched held-out part.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding the predictor columns and the label column.
    feature : list of str
        Names of the predictor columns.
    target : str or list of str
        Name of the label column; a one-element list is accepted.
    n_splits : int, default 10
        Number of stratified folds.
    random_seed : int, default 7
        Seed shared by the fold splitter, SMOTE and the random forest.

    Returns
    -------
    tuple of float
        ``(mean Cohen's kappa, mean balanced accuracy)`` across the folds.
    """
    X = dataset[feature]
    # Flatten the label to 1-D: selecting with a list yields an (n, 1) frame,
    # which scikit-learn only accepts after emitting a DataConversionWarning.
    y = np.ravel(dataset[target])

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    kappa_scores = []
    balanced_acc_scores = []
    for train_index, test_index in splitter.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Oversample the training fold only; the held-out fold stays as observed.
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        forest = RandomForestClassifier(
            random_state=random_seed,
            max_features='sqrt',
            n_estimators=1400,
            min_samples_split=2,
            min_samples_leaf=1,
            max_depth=50,
            oob_score=True,
            criterion='gini',
            bootstrap=True,
        ).fit(X_train, y_train)

        # Predict with the fitted forest directly. Wrapping it in a pipeline
        # with the sampler adds nothing, because samplers are skipped at
        # prediction time.
        y_pred = forest.predict(X_test)

        kappa_scores.append(cohen_kappa_score(y_test, y_pred))
        balanced_acc_scores.append(balanced_accuracy_score(y_test, y_pred))

    return np.mean(kappa_scores), np.mean(balanced_acc_scores)

In [6]:
# Pooled model: (mean Cohen's kappa, mean balanced accuracy) over all domains together.
global_model_test(dataset=data, feature=base_feature, target=target)

(0.6729967902004707, 0.8436527256764306)

## LocalModel Test

In [7]:
# Per-domain subsets: each frame keeps only the rows whose domain_x matches.
local_fruit = data[data['domain_x'].eq('fruit')]
local_tool = data[data['domain_x'].eq('tool')]
local_music = data[data['domain_x'].eq('music')]
local_furniture = data[data['domain_x'].eq('furn')]
local_garments = data[data['domain_x'].eq('garm')]

# Order matters: it is matched position-by-position with other per-domain lists.
local_list = [
    local_fruit,
    local_tool,
    local_music,
    local_furniture,
    local_garments,
]

def local_model_test(dataset_list, base_feature, target, n_splits=10, random_seed=7):
    """Cross-validate one random forest per dataset and average the scores.

    Every frame in ``dataset_list`` is evaluated independently with stratified,
    shuffled K-fold cross-validation. In each fold the training part is
    oversampled with SMOTE, a random forest is fitted on it, and the forest is
    scored on the untouched held-out part. Fold scores are averaged per
    dataset, and those per-dataset means are then averaged (unweighted).

    Parameters
    ----------
    dataset_list : list of pandas.DataFrame
        One frame per domain, each holding the predictor and label columns.
    base_feature : list of str
        Names of the predictor columns.
    target : str or list of str
        Name of the label column; a one-element list is accepted.
    n_splits : int, default 10
        Number of stratified folds used for every dataset.
    random_seed : int, default 7
        Seed shared by the fold splitter, SMOTE and the random forest.

    Returns
    -------
    tuple of float
        ``(mean Cohen's kappa, mean balanced accuracy)`` across the datasets.
    """
    local_kappa_list = []
    local_balanced_acc_list = []
    for dataset in dataset_list:
        X = dataset[base_feature]
        # Flatten the label to 1-D: selecting with a list yields an (n, 1)
        # frame, which scikit-learn only accepts after a DataConversionWarning.
        y = np.ravel(dataset[target])

        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
        fold_kappa = []
        fold_balanced_acc = []
        for train_index, test_index in splitter.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Oversample the training fold only; the held-out fold stays as observed.
            smote = SMOTE(random_state=random_seed, k_neighbors=2)
            X_train, y_train = smote.fit_resample(X_train, y_train)

            forest = RandomForestClassifier(
                random_state=random_seed,
                max_features='sqrt',
                n_estimators=1400,
                min_samples_split=2,
                min_samples_leaf=1,
                max_depth=50,
                oob_score=True,
                criterion='gini',
                bootstrap=True,
            ).fit(X_train, y_train)

            # Predict with the fitted forest directly. Wrapping it in a
            # pipeline with the sampler adds nothing, because samplers are
            # skipped at prediction time.
            y_pred = forest.predict(X_test)

            fold_kappa.append(cohen_kappa_score(y_test, y_pred))
            fold_balanced_acc.append(balanced_accuracy_score(y_test, y_pred))

        local_kappa_list.append(np.mean(fold_kappa))
        local_balanced_acc_list.append(np.mean(fold_balanced_acc))

    return np.mean(local_kappa_list), np.mean(local_balanced_acc_list)

In [8]:
# Per-domain models: scores are averaged first within, then across, the domains.
local_model_test(dataset_list=local_list, base_feature=base_feature, target=target)

(0.6404582598586586, 0.8247974917239624)

## TransferModel Test

In [10]:
# Leave-one-domain-out training sets: each frame holds every row EXCEPT those
# of the named domain, which is the one left unseen during training.
unseen_fruit = data[data['domain_x'].ne('fruit')]
unseen_tool = data[data['domain_x'].ne('tool')]
unseen_music = data[data['domain_x'].ne('music')]
unseen_furniture = data[data['domain_x'].ne('furn')]
unseen_garments = data[data['domain_x'].ne('garm')]

# Same domain order as the per-domain list, so the two can be paired by position.
unseen_list = [
    unseen_fruit,
    unseen_tool,
    unseen_music,
    unseen_furniture,
    unseen_garments,
]

def transfer_model_test(train_list, test_list, base_feature, target, random_seed=7):
    """Train on one frame, score on its paired frame, and average over pairs.

    ``train_list[i]`` and ``test_list[i]`` form one experiment: the training
    frame is oversampled with SMOTE, a random forest is fitted on it, and the
    forest is scored on the untouched test frame.

    Parameters
    ----------
    train_list : list of pandas.DataFrame
        Training frames, one per experiment.
    test_list : list of pandas.DataFrame
        Test frames, aligned position-by-position with ``train_list``.
    base_feature : list of str
        Names of the predictor columns.
    target : str or list of str
        Name of the label column; a one-element list is accepted.
    random_seed : int, default 7
        Seed shared by SMOTE and the random forest.

    Returns
    -------
    tuple of float
        ``(mean Cohen's kappa, mean balanced accuracy)`` across the pairs.

    Raises
    ------
    ValueError
        If ``train_list`` and ``test_list`` differ in length, since the frames
        could then not be paired unambiguously.
    """
    if len(train_list) != len(test_list):
        raise ValueError(
            f"train_list and test_list must have the same length, "
            f"got {len(train_list)} and {len(test_list)}"
        )

    transfer_kappa_list = []
    transfer_balanced_acc_list = []
    for training_data, testing_data in zip(train_list, test_list):
        X_train = training_data[base_feature]
        X_test = testing_data[base_feature]
        # Flatten the labels to 1-D: selecting with a list yields an (n, 1)
        # frame, which scikit-learn only accepts after a DataConversionWarning.
        y_train = np.ravel(training_data[target])
        y_test = np.ravel(testing_data[target])

        # Oversample the training frame only; the test frame stays as observed.
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        forest = RandomForestClassifier(
            random_state=random_seed,
            max_features='sqrt',
            n_estimators=1400,
            min_samples_split=2,
            min_samples_leaf=1,
            max_depth=50,
            oob_score=True,
            criterion='gini',
            bootstrap=True,
        ).fit(X_train, y_train)

        # Predict with the fitted forest directly. Wrapping it in a pipeline
        # with the sampler adds nothing, because samplers are skipped at
        # prediction time.
        y_pred = forest.predict(X_test)

        transfer_kappa_list.append(cohen_kappa_score(y_test, y_pred))
        transfer_balanced_acc_list.append(balanced_accuracy_score(y_test, y_pred))

    return np.mean(transfer_kappa_list), np.mean(transfer_balanced_acc_list)

In [11]:
# Transfer setting: train on all-but-one domain, score on the domain left out.
transfer_model_test(train_list=unseen_list, test_list=local_list, base_feature=base_feature, target=target)

(0.5207441018487409, 0.8024977638606605)