In [43]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import cohen_kappa_score, balanced_accuracy_score, precision_recall_fscore_support

import warnings
# NOTE(review): this silences EVERY warning category for the whole kernel
# session, including deprecation and data-conversion notices raised by the
# libraries used below. Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [44]:
# Load the pre-processed "all agreed" table; the first CSV column becomes the index.
data = pd.read_csv('GS_All_Agreed.csv', index_col=0)

# For every domain build two frames:
#   local_<domain>  - only the rows of that domain, renumbered from 0
#                     (the previous index is kept as an 'index' column)
#   unseen_<domain> - every row that does NOT belong to that domain
#                     (original index kept)
local_fruit = data[data['domain_x'].eq('fruit')].reset_index()
unseen_fruit = data[data['domain_x'].ne('fruit')]

local_tool = data[data['domain_x'].eq('tool')].reset_index()
unseen_tool = data[data['domain_x'].ne('tool')]

local_music = data[data['domain_x'].eq('music')].reset_index()
unseen_music = data[data['domain_x'].ne('music')]

local_furniture = data[data['domain_x'].eq('furn')].reset_index()
unseen_furniture = data[data['domain_x'].ne('furn')]

local_garments = data[data['domain_x'].eq('garm')].reset_index()
unseen_garments = data[data['domain_x'].ne('garm')]

In [45]:
# Base feature columns shared by every model run. This must stay a plain
# (mutable) list: a later cell extends it with one corpus-specific column
# per run.
features = [
    'ngrams_last_mean',
    'nrdirhypers_x',
    'nrhypos_x',
    'nrpartrels_normalised_x',
    'depthfromtopsynset_normalised_x',
    'glosslength_normalised_x',
    'minwordlength_x',
    'nroflemmas_x',
    'polyscore_max_x',
]

# Label column, kept as a one-element list; author's note on the values: nb / b
target = ['vote_x']

In [46]:
# Parallel lists: position k in local_set and unseen_set refers to the same
# domain (0 fruit, 1 tool, 2 music, 3 furniture, 4 garments).
local_set = [
    local_fruit,
    local_tool,
    local_music,
    local_furniture,
    local_garments,
]
unseen_set = [
    unseen_fruit,
    unseen_tool,
    unseen_music,
    unseen_furniture,
    unseen_garments,
]

# Corpus-specific feature columns; each one is tried in turn on top of the
# shared base features.
features_set = [
    'BNC_sum',
    'CABNC_per100,000',
    'KBNC_Sum',
    'CHILDES_Sum_Rel',
]

In [47]:
def unseenModel(unseen, local, featureSet, random_seed=7):
    """Train a random forest on ``unseen`` and score it on ``local``.

    Despite the parameter names, ``unseen`` is the TRAINING frame and
    ``local`` is the TEST frame. No k-fold split is performed: the two
    frames are used exactly as passed in. Only the training data is
    oversampled with SMOTE; the test data is scored untouched.

    Parameters
    ----------
    unseen : pandas.DataFrame
        Training rows. Must contain every column in ``featureSet`` and the
        column(s) named by the module-level ``target`` list.
    local : pandas.DataFrame
        Test rows. Same columns as ``unseen`` plus ``domain_x``; must be
        non-empty because its first ``domain_x`` value is reported.
    featureSet : list of str
        Feature columns used for fitting and prediction. The LAST entry is
        written to the ``corpora`` column of the result.
    random_seed : int, default 7
        Seed shared by SMOTE and the random forest.

    Returns
    -------
    pandas.DataFrame
        One row per class with the columns ``precision``, ``recall``,
        ``f1-score``, ``support``, ``corpora``, ``cohen kappa``,
        ``balanced acc`` and ``unseen``.
    """
    X_train, X_test = unseen[featureSet], local[featureSet]

    # `target` is a one-element list, so frame[target] is an (n, 1) DataFrame.
    # Squeeze the column axis to get a 1-D Series: the estimators and metrics
    # below expect 1-D labels and otherwise emit a column-vector warning.
    y_train = unseen[target].squeeze(axis=1)
    y_test = local[target].squeeze(axis=1)

    # Oversample the minority class in the training data only.
    smote = SMOTE(random_state=random_seed, k_neighbors=2)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    rf = RandomForestClassifier(
        random_state=random_seed,
        max_features='sqrt',
        n_estimators=1400,
        min_samples_split=2,
        min_samples_leaf=1,
        max_depth=50,
        oob_score=True,
        criterion='gini',
        bootstrap=True,
    ).fit(X_train, y_train)

    # Predict with the fitted forest directly. Wrapping it in an (unfitted)
    # sampler pipeline adds nothing at predict time, since samplers are only
    # applied during fit.
    y_pred = rf.predict(X_test)

    kappa = cohen_kappa_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    # precision_recall_fscore_support returns four per-class arrays;
    # transposing yields one row per class and one column per metric.
    chart = precision_recall_fscore_support(y_test, y_pred)
    results = pd.DataFrame(chart).transpose()

    results['corpora'] = featureSet[-1]
    results['cohen kappa'] = kappa
    results['balanced acc'] = balanced_accuracy
    results['unseen'] = local['domain_x'].iloc[0]

    # NOTE(review): row 0 -> "b", row 1 -> "nb" assumes both classes occur and
    # that "b" sorts before "nb" in the label order - TODO confirm.
    results = results.rename(
        index={0: "b", 1: 'nb'},
        columns={0: 'precision', 1: 'recall', 2: 'f1-score', 3: 'support'},
    )

    return results

In [54]:
# Column order of the combined results table.
result_columns = ['precision', 'recall', 'f1-score', 'support', 'corpora', 'cohen kappa', 'balanced acc', 'unseen']

# Position of the held-out domain in local_set / unseen_set.
i = 4

# Run one model per corpus-specific feature. Each run gets a NEW list
# (base features + that one corpus column), so the shared `features` list is
# never mutated and cannot be left in a half-modified state if a run raises.
frames = []
for cor in features_set:
    res_plus = unseenModel(unseen_set[i], local_set[i], features + [cor])
    frames.append(res_plus)

# Combine once at the end. DataFrame.append was removed in pandas 2.0, and
# growing a frame inside a loop copies all previous rows on every iteration.
if frames:
    res = pd.concat(frames)[result_columns]
else:
    res = pd.DataFrame(columns=result_columns)

res

Unnamed: 0,precision,recall,f1-score,support,corpora,cohen kappa,balanced acc,unseen
b,0.310345,0.692308,0.428571,26.0,BNC_sum,0.328535,0.753131,garm
nb,0.956284,0.813953,0.879397,215.0,BNC_sum,0.328535,0.753131,garm
b,0.292308,0.730769,0.417582,26.0,"CABNC_per100,000",0.311466,0.758408,garm
nb,0.960227,0.786047,0.86445,215.0,"CABNC_per100,000",0.311466,0.758408,garm
b,0.263889,0.730769,0.387755,26.0,KBNC_Sum,0.272416,0.742129,garm
nb,0.95858,0.753488,0.84375,215.0,KBNC_Sum,0.272416,0.742129,garm
b,0.287671,0.807692,0.424242,26.0,CHILDES_Sum_Rel,0.315307,0.782916,garm
nb,0.970238,0.75814,0.851175,215.0,CHILDES_Sum_Rel,0.315307,0.782916,garm
