In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [3]:
# read the pre-processed data all agreed
data = pd.read_csv('./size_differential_features.csv', index_col=0)

# generate the local dataframe for different domains
local_fruit = data.loc[data['domain_x']=='fruit'].reset_index()
local_tool = data.loc[data['domain_x']=='tool'].reset_index()
local_music = data.loc[data['domain_x']=='music'].reset_index()
local_furniture = data.loc[data['domain_x']=='furn'].reset_index()
local_garments = data.loc[data['domain_x']=='garm'].reset_index()

data

Unnamed: 0_level_0,Synsets,domain_x,ngrams_last_mean,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,...,kbnc_1m_sum,childes_1m_rel_sum,childes_2_4m_rel_sum,childes_5_7m_rel_sum,bnc_1m_sum,bnc_5_7m_sum,bnc_2_4m_sum,bnc_100m_sum,cabnc_per_100k_2_4m,cabnc_per_100k_1m
norm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
adjustable_wrench,Synset('adjustable_wrench.n.01'),tool,-18.854402,1,7,0.0,1.012903,0.563173,17,2,...,0,0.0,0.0,0.000000e+00,0,0,0,0,0,0
allen_wrench,Synset('allen_wrench.n.01'),tool,-18.389126,1,0,0.0,1.012903,0.391092,12,1,...,0,0.0,0.0,0.000000e+00,0,0,0,0,0,0
alligator_wrench,Synset('alligator_wrench.n.01'),tool,-22.354690,1,0,0.0,1.012903,1.517437,16,1,...,0,0.0,0.0,0.000000e+00,0,0,0,0,0,0
awl,Synset('awl.n.01'),tool,-14.809997,1,2,15.7,0.911613,0.985552,3,1,...,0,0.0,0.0,0.000000e+00,1,3,1,39,0,0
backsaw,Synset('backsaw.n.01'),tool,-18.327476,1,0,0.0,1.114194,1.110701,7,2,...,0,0.0,0.0,0.000000e+00,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ballet_skirt,Synset('ballet_skirt.n.01'),garm,-16.872435,1,0,0.0,0.947552,0.578283,4,2,...,0,0.0,0.0,1.754386e-07,0,0,0,33,0,0
mess_jacket,Synset('mess_jacket.n.01'),garm,-18.743772,1,0,0.0,1.158120,1.652238,11,3,...,0,0.0,0.0,0.000000e+00,0,0,0,0,0,0
long_johns,Synset('long_johns.n.01'),garm,-16.882419,1,0,0.0,1.052836,0.479149,10,1,...,0,0.0,0.0,0.000000e+00,0,0,0,0,0,0
undies,Synset('undies.n.01'),garm,-16.330160,1,0,0.0,1.158120,0.280880,6,1,...,0,0.0,0.0,0.000000e+00,1,1,1,33,0,0


In [3]:
# define features and target
features = ['ngrams_last_mean',
            'nrdirhypers_x',
            'nrhypos_x',
            'nrpartrels_normalised_x',
            'depthfromtopsynset_normalised_x',
            'glosslength_normalised_x',
            'minwordlength_x',
            'nroflemmas_x',
            'polyscore_max_x']
target = ['vote_x'] # nb / b

In [15]:
# split training set and testing set using K-Flod
def global_model_test(dataset, feature, sized_corpora, target):
    K = 10
    random_seed = 7 # R
    data = dataset.reset_index()
    feature_list = feature + [sized_corpora]
    X = data[feature_list]
    y = data[target]

    K_Flod = StratifiedKFold(n_splits=K, shuffle=True, random_state=random_seed)
    K_Flod.get_n_splits(X, y)
    cohen_kappa = []
    balanced_acc = []
    for train_index, test_index in K_Flod.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # SMOTE algorithm
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # define random forest model
        rf = RandomForestClassifier(random_state=random_seed, max_features='sqrt', n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_depth=50, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)

        # predict and make score
        pipeline = make_pipeline(smote, rf)
        y_pred = pipeline.predict(X_test)

        kappa = cohen_kappa_score(y_test, y_pred)
        cohen_kappa.append(kappa)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        balanced_acc.append(balanced_accuracy)

    results = classification_report(y_test, y_pred, output_dict=True)
    results = pd.DataFrame(results).transpose()

    results['cohen kappa / 10'] = np.mean(cohen_kappa)
    results['balanced acc / 10'] = np.mean(balanced_acc)
    results['global'] = 5

    # importance of features
    importance = rf.feature_importances_
    importance = pd.DataFrame([features, importance]).transpose()
    importance = importance.rename(columns={0:'feature', 1:'importance'}).sort_values('importance', ascending=False)

    return results, importance

## KBNC: 1 million

In [16]:
sized_corpora = 'kbnc_1m_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.823529  0.823529  0.823529  17.000000          0.691517   
 nb             0.954545  0.954545  0.954545  66.000000          0.691517   
 accuracy       0.927711  0.927711  0.927711   0.927711          0.691517   
 macro avg      0.889037  0.889037  0.889037  83.000000          0.691517   
 weighted avg   0.927711  0.927711  0.927711  83.000000          0.691517   
 
               balanced acc / 10  global  
 b                       0.85023       5  
 nb                      0.85023       5  
 accuracy                0.85023       5  
 macro avg               0.85023       5  
 weighted avg            0.85023       5  ,
                            feature importance
 4  depthfromtopsynset_normalised_x   0.370331
 5         glosslength_normalised_x   0.137375
 0                 ngrams_last_mean   0.133441
 6                  minwordlength_x   0.109875
 3          nrpartrels_normalised_x   0.10

## CABNC: 1 million, 2.4 million

In [17]:
sized_corpora = 'cabnc_per_100k_1m'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.736842  0.823529  0.777778  17.000000          0.685486   
 nb             0.953125  0.924242  0.938462  66.000000          0.685486   
 accuracy       0.903614  0.903614  0.903614   0.903614          0.685486   
 macro avg      0.844984  0.873886  0.858120  83.000000          0.685486   
 weighted avg   0.908826  0.903614  0.905550  83.000000          0.685486   
 
               balanced acc / 10  global  
 b                      0.851834       5  
 nb                     0.851834       5  
 accuracy               0.851834       5  
 macro avg              0.851834       5  
 weighted avg           0.851834       5  ,
                            feature importance
 4  depthfromtopsynset_normalised_x   0.373534
 5         glosslength_normalised_x   0.138268
 0                 ngrams_last_mean   0.133284
 3          nrpartrels_normalised_x   0.106026
 6                  minwordlength_x   0.10

In [18]:
sized_corpora = 'cabnc_per_100k_2_4m'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.789474  0.882353  0.833333  17.000000          0.699077   
 nb             0.968750  0.939394  0.953846  66.000000          0.699077   
 accuracy       0.927711  0.927711  0.927711   0.927711          0.699077   
 macro avg      0.879112  0.910873  0.893590  83.000000          0.699077   
 weighted avg   0.932031  0.927711  0.929163  83.000000          0.699077   
 
               balanced acc / 10  global  
 b                      0.853742       5  
 nb                     0.853742       5  
 accuracy               0.853742       5  
 macro avg              0.853742       5  
 weighted avg           0.853742       5  ,
                            feature importance
 4  depthfromtopsynset_normalised_x   0.370363
 5         glosslength_normalised_x   0.149594
 0                 ngrams_last_mean   0.130475
 6                  minwordlength_x   0.106714
 3          nrpartrels_normalised_x   0.10

## CHILDES: 5.7 million, 2.4 million, 1 million

In [19]:
sized_corpora = 'childes_1m_rel_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.842105  0.941176  0.888889  17.000000          0.685825   
 nb             0.984375  0.954545  0.969231  66.000000          0.685825   
 accuracy       0.951807  0.951807  0.951807   0.951807          0.685825   
 macro avg      0.913240  0.947861  0.929060  83.000000          0.685825   
 weighted avg   0.955235  0.951807  0.952775  83.000000          0.685825   
 
               balanced acc / 10  global  
 b                      0.854018       5  
 nb                     0.854018       5  
 accuracy               0.854018       5  
 macro avg              0.854018       5  
 weighted avg           0.854018       5  ,
                            feature importance
 4  depthfromtopsynset_normalised_x   0.363085
 5         glosslength_normalised_x   0.133649
 0                 ngrams_last_mean   0.123163
 6                  minwordlength_x   0.106142
 3          nrpartrels_normalised_x   0.08

In [20]:
sized_corpora = 'childes_2_4m_rel_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.842105  0.941176  0.888889  17.000000          0.683559   
 nb             0.984375  0.954545  0.969231  66.000000          0.683559   
 accuracy       0.951807  0.951807  0.951807   0.951807          0.683559   
 macro avg      0.913240  0.947861  0.929060  83.000000          0.683559   
 weighted avg   0.955235  0.951807  0.952775  83.000000          0.683559   
 
               balanced acc / 10  global  
 b                      0.853283       5  
 nb                     0.853283       5  
 accuracy               0.853283       5  
 macro avg              0.853283       5  
 weighted avg           0.853283       5  ,
                            feature importance
 4  depthfromtopsynset_normalised_x   0.367108
 5         glosslength_normalised_x    0.13707
 0                 ngrams_last_mean   0.128761
 6                  minwordlength_x   0.108701
 3          nrpartrels_normalised_x   0.08

In [21]:
sized_corpora = 'childes_5_7m_rel_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.833333  0.882353  0.857143  17.000000          0.660334   
 nb             0.969231  0.954545  0.961832  66.000000          0.660334   
 accuracy       0.939759  0.939759  0.939759   0.939759          0.660334   
 macro avg      0.901282  0.918449  0.909487  83.000000          0.660334   
 weighted avg   0.941396  0.939759  0.940390  83.000000          0.660334   
 
               balanced acc / 10  global  
 b                      0.840341       5  
 nb                     0.840341       5  
 accuracy               0.840341       5  
 macro avg              0.840341       5  
 weighted avg           0.840341       5  ,
                            feature importance
 4  depthfromtopsynset_normalised_x   0.364524
 5         glosslength_normalised_x   0.136819
 0                 ngrams_last_mean   0.131156
 6                  minwordlength_x    0.11026
 3          nrpartrels_normalised_x    0.0

## BNC: 100 million, 5.7 million, 2.4 million, 1 million

In [22]:
sized_corpora = 'bnc_1m_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.800000  0.941176  0.864865  17.000000          0.698233   
 nb             0.984127  0.939394  0.961240  66.000000          0.698233   
 accuracy       0.939759  0.939759  0.939759   0.939759          0.698233   
 macro avg      0.892063  0.940285  0.913053  83.000000          0.698233   
 weighted avg   0.946414  0.939759  0.941501  83.000000          0.698233   
 
               balanced acc / 10  global  
 b                      0.853742       5  
 nb                     0.853742       5  
 accuracy               0.853742       5  
 macro avg              0.853742       5  
 weighted avg           0.853742       5  ,
                            feature importance
 4  depthfromtopsynset_normalised_x   0.368128
 5         glosslength_normalised_x   0.138929
 0                 ngrams_last_mean   0.129226
 3          nrpartrels_normalised_x   0.109587
 6                  minwordlength_x    0.1

In [23]:
sized_corpora = 'bnc_2_4m_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.789474  0.882353  0.833333  17.000000          0.709009   
 nb             0.968750  0.939394  0.953846  66.000000          0.709009   
 accuracy       0.927711  0.927711  0.927711   0.927711          0.709009   
 macro avg      0.879112  0.910873  0.893590  83.000000          0.709009   
 weighted avg   0.932031  0.927711  0.929163  83.000000          0.709009   
 
               balanced acc / 10  global  
 b                      0.860044       5  
 nb                     0.860044       5  
 accuracy               0.860044       5  
 macro avg              0.860044       5  
 weighted avg           0.860044       5  ,
                            feature importance
 4  depthfromtopsynset_normalised_x   0.359973
 5         glosslength_normalised_x   0.141079
 0                 ngrams_last_mean   0.126889
 3          nrpartrels_normalised_x    0.11451
 6                  minwordlength_x   0.10

In [24]:
sized_corpora = 'bnc_5_7m_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.888889  0.941176  0.914286  17.000000          0.712823   
 nb             0.984615  0.969697  0.977099  66.000000          0.712823   
 accuracy       0.963855  0.963855  0.963855   0.963855          0.712823   
 macro avg      0.936752  0.955437  0.945692  83.000000          0.712823   
 weighted avg   0.965009  0.963855  0.964234  83.000000          0.712823   
 
               balanced acc / 10  global  
 b                      0.860965       5  
 nb                     0.860965       5  
 accuracy               0.860965       5  
 macro avg              0.860965       5  
 weighted avg           0.860965       5  ,
                            feature importance
 4  depthfromtopsynset_normalised_x   0.356451
 5         glosslength_normalised_x   0.132684
 0                 ngrams_last_mean   0.128223
 3          nrpartrels_normalised_x   0.117791
 6                  minwordlength_x   0.10

In [25]:
sized_corpora = 'bnc_100m_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.842105  0.941176  0.888889  17.000000          0.714009   
 nb             0.984375  0.954545  0.969231  66.000000          0.714009   
 accuracy       0.951807  0.951807  0.951807   0.951807          0.714009   
 macro avg      0.913240  0.947861  0.929060  83.000000          0.714009   
 weighted avg   0.955235  0.951807  0.952775  83.000000          0.714009   
 
               balanced acc / 10  global  
 b                      0.859528       5  
 nb                     0.859528       5  
 accuracy               0.859528       5  
 macro avg              0.859528       5  
 weighted avg           0.859528       5  ,
                            feature importance
 4  depthfromtopsynset_normalised_x   0.340294
 3          nrpartrels_normalised_x   0.141376
 5         glosslength_normalised_x   0.125519
 0                 ngrams_last_mean   0.117697
 6                  minwordlength_x   0.09

## Tset: next stage feature engineering

In [4]:
# read the pre-processed data all agreed
data = pd.read_csv('./next_stage_size_differential_features.csv', index_col=0)

# generate the local dataframe for different domains
local_fruit = data.loc[data['domain_x']=='fruit'].reset_index()
local_tool = data.loc[data['domain_x']=='tool'].reset_index()
local_music = data.loc[data['domain_x']=='music'].reset_index()
local_furniture = data.loc[data['domain_x']=='furn'].reset_index()
local_garments = data.loc[data['domain_x']=='garm'].reset_index()

data

Unnamed: 0_level_0,Synsets,domain_x,ngrams_last_mean,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,...,min_5_7m,min_100m,sum_1m,sum_2_4m,sum_5_7m,sum_100m,avg_bnc,avg_childes,avg_cabnc,avg_kbnc
norm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
adjustable_wrench,Synset('adjustable_wrench.n.01'),tool,-18.854402,1,7,0.0,1.012903,0.563173,17,2,...,0,0,0,0,0,0,0.00,0.000000,0.0,0.0
allen_wrench,Synset('allen_wrench.n.01'),tool,-18.389126,1,0,0.0,1.012903,0.391092,12,1,...,0,0,0,0,0,0,0.00,0.000000,0.0,0.0
alligator_wrench,Synset('alligator_wrench.n.01'),tool,-22.354690,1,0,0.0,1.012903,1.517437,16,1,...,0,0,0,0,0,0,0.00,0.000000,0.0,0.0
awl,Synset('awl.n.01'),tool,-14.809997,1,2,15.7,0.911613,0.985552,3,1,...,0,39,1,1,3,39,11.00,0.000000,0.0,0.0
backsaw,Synset('backsaw.n.01'),tool,-18.327476,1,0,0.0,1.114194,1.110701,7,2,...,0,0,0,0,0,0,0.00,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ballet_skirt,Synset('ballet_skirt.n.01'),garm,-16.872435,1,0,0.0,0.947552,0.578283,4,2,...,0,33,0,0,1,33,8.25,0.333333,0.0,0.0
mess_jacket,Synset('mess_jacket.n.01'),garm,-18.743772,1,0,0.0,1.158120,1.652238,11,3,...,0,0,0,0,0,0,0.00,0.000000,0.0,0.0
long_johns,Synset('long_johns.n.01'),garm,-16.882419,1,0,0.0,1.052836,0.479149,10,1,...,0,0,0,0,0,0,0.00,0.000000,0.0,0.0
undies,Synset('undies.n.01'),garm,-16.330160,1,0,0.0,1.158120,0.280880,6,1,...,0,33,1,1,1,33,9.00,0.000000,0.0,0.0


In [9]:
# split training set and testing set using K-Flod
def new_features_global_model_test(dataset, feature, new_features, target):
    K = 10
    random_seed = 7 # R
    data = dataset.reset_index()
    feature_list = feature + new_features
    X = data[feature_list]
    y = data[target]

    K_Flod = StratifiedKFold(n_splits=K, shuffle=True, random_state=random_seed)
    K_Flod.get_n_splits(X, y)
    cohen_kappa = []
    balanced_acc = []
    for train_index, test_index in K_Flod.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # SMOTE algorithm
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # define random forest model
        rf = RandomForestClassifier(random_state=random_seed, max_features='sqrt', n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_depth=50, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)

        # predict and make score
        pipeline = make_pipeline(smote, rf)
        y_pred = pipeline.predict(X_test)

        kappa = cohen_kappa_score(y_test, y_pred)
        cohen_kappa.append(kappa)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        balanced_acc.append(balanced_accuracy)

    results = classification_report(y_test, y_pred, output_dict=True)
    results = pd.DataFrame(results).transpose()

    results['cohen kappa / 10'] = np.mean(cohen_kappa)
    results['balanced acc / 10'] = np.mean(balanced_acc)
    results['global'] = 5

    # importance of features
    importance = rf.feature_importances_
    importance = pd.DataFrame([feature_list, importance]).transpose()
    importance = importance.rename(columns={0:'feature', 1:'importance'}).sort_values('importance', ascending=False)

    return results, importance

In [10]:
# top-down method
new_feature = ['max_1m','max_2_4m','max_5_7m','max_100m','min_1m','min_2_4m','min_5_7m','min_100m','sum_1m','sum_2_4m','sum_5_7m','sum_100m','avg_bnc','avg_childes','avg_cabnc','avg_kbnc']
new_features_global_model_test(data, features, new_feature, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.882353  0.882353  0.882353  17.000000          0.655853   
 nb             0.969697  0.969697  0.969697  66.000000          0.655853   
 accuracy       0.951807  0.951807  0.951807   0.951807          0.655853   
 macro avg      0.926025  0.926025  0.926025  83.000000          0.655853   
 weighted avg   0.951807  0.951807  0.951807  83.000000          0.655853   
 
               balanced acc / 10  global  
 b                       0.84081       5  
 nb                      0.84081       5  
 accuracy                0.84081       5  
 macro avg               0.84081       5  
 weighted avg            0.84081       5  ,
                             feature importance
 4   depthfromtopsynset_normalised_x    0.28202
 5          glosslength_normalised_x   0.111131
 3           nrpartrels_normalised_x   0.104112
 0                  ngrams_last_mean   0.083509
 6                   minwordlength_x 

In [11]:
# remove min_1m
new_feature = ['max_1m','max_2_4m','max_5_7m','max_100m','min_2_4m','min_5_7m','min_100m','sum_1m','sum_2_4m','sum_5_7m','sum_100m','avg_bnc','avg_childes','avg_cabnc','avg_kbnc']
new_features_global_model_test(data, features, new_feature, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.882353  0.882353  0.882353  17.000000          0.661393   
 nb             0.969697  0.969697  0.969697  66.000000          0.661393   
 accuracy       0.951807  0.951807  0.951807   0.951807          0.661393   
 macro avg      0.926025  0.926025  0.926025  83.000000          0.661393   
 weighted avg   0.951807  0.951807  0.951807  83.000000          0.661393   
 
               balanced acc / 10  global  
 b                      0.842291       5  
 nb                     0.842291       5  
 accuracy               0.842291       5  
 macro avg              0.842291       5  
 weighted avg           0.842291       5  ,
                             feature importance
 4   depthfromtopsynset_normalised_x    0.26325
 5          glosslength_normalised_x   0.110669
 3           nrpartrels_normalised_x   0.101248
 0                  ngrams_last_mean   0.083231
 6                   minwordlength_x 

In [16]:
# add bnc_100m_sum
new_feature = ['bnc_100m_sum']
new_features_global_model_test(data, features, new_feature, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.842105  0.941176  0.888889  17.000000          0.714009   
 nb             0.984375  0.954545  0.969231  66.000000          0.714009   
 accuracy       0.951807  0.951807  0.951807   0.951807          0.714009   
 macro avg      0.913240  0.947861  0.929060  83.000000          0.714009   
 weighted avg   0.955235  0.951807  0.952775  83.000000          0.714009   
 
               balanced acc / 10  global  
 b                      0.859528       5  
 nb                     0.859528       5  
 accuracy               0.859528       5  
 macro avg              0.859528       5  
 weighted avg           0.859528       5  ,
                            feature importance
 4  depthfromtopsynset_normalised_x   0.340294
 3          nrpartrels_normalised_x   0.141376
 5         glosslength_normalised_x   0.125519
 0                 ngrams_last_mean   0.117697
 6                  minwordlength_x   0.09

In [19]:
# bottom-up method +1
new_feature = ['max_1m','max_2_4m','max_5_7m','max_100m','min_1m','min_2_4m','min_5_7m','min_100m','sum_1m','sum_2_4m','sum_5_7m','sum_100m','avg_bnc','avg_childes','avg_cabnc','avg_kbnc']
max_kappa = 0.714009
for i in new_feature:
    print("feature: " + i)
    new = ['bnc_100m_sum'] + [i]
    res, f_importance = new_features_global_model_test(data, features, new, target)
    if res['cohen kappa / 10'][0] > max_kappa:
        max_kappa = res['cohen kappa / 10'][0]
        print(max_kappa)
        print(i)
        print(res)
        print(f_importance)

feature: max_1m
0.7092324905618889
max_1m
              precision    recall  f1-score    support  cohen kappa / 10  \
b              0.842105  0.941176  0.888889  17.000000          0.709232   
nb             0.984375  0.954545  0.969231  66.000000          0.709232   
accuracy       0.951807  0.951807  0.951807   0.951807          0.709232   
macro avg      0.913240  0.947861  0.929060  83.000000          0.709232   
weighted avg   0.955235  0.951807  0.952775  83.000000          0.709232   

              balanced acc / 10  global  
b                      0.863851       5  
nb                     0.863851       5  
accuracy               0.863851       5  
macro avg              0.863851       5  
weighted avg           0.863851       5  
                            feature importance
4   depthfromtopsynset_normalised_x   0.332077
5          glosslength_normalised_x   0.126464
3           nrpartrels_normalised_x   0.124124
0                  ngrams_last_mean   0.111875
6             

In [None]:
# bottom-up method +2 (base: min_1m)
new_feature = ['max_1m', 'max_2_4m','max_5_7m','max_100m','min_2_4m','min_5_7m','min_100m','sum_1m','sum_2_4m','sum_5_7m','sum_100m','avg_bnc','avg_childes','avg_cabnc','avg_kbnc']
max_kappa = 0.715145
for i in new_feature:
    print("feature: " + i)
    new = ['bnc_100m_sum', 'min_1m'] + [i]
    res, f_importance = new_features_global_model_test(data, features, new, target)
    if res['cohen kappa / 10'][0] > max_kappa:
        max_kappa = res['cohen kappa / 10'][0]
        print(max_kappa)
        print(i)
        print(res)
        print(f_importance)

feature: max_1m
feature: max_2_4m
feature: max_5_7m
feature: max_100m
feature: min_2_4m
feature: min_5_7m
feature: min_100m
feature: sum_1m
feature: sum_2_4m
feature: sum_5_7m
feature: sum_100m
feature: avg_bnc
feature: avg_childes
feature: avg_cabnc
