In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
# read the pre-processed data all agreed
data = pd.read_csv('./Google_Ngrams/features_google_ngram.csv', index_col=0)

# generate the local dataframe for different domains
local_fruit = data.loc[data['domain_x']=='fruit'].reset_index()
local_tool = data.loc[data['domain_x']=='tool'].reset_index()
local_music = data.loc[data['domain_x']=='music'].reset_index()
local_furniture = data.loc[data['domain_x']=='furn'].reset_index()
local_garments = data.loc[data['domain_x']=='garm'].reset_index()

data

Unnamed: 0_level_0,domain_x,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,polyscore_max_x,vote_x,ngram_2009_2019_original_sleep,ngram_2009_2019_updata_sleep
Synsets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Synset('adjustable_wrench.n.01'),tool,1,7,0.0,1.012903,0.563173,17,2,1,nb,2.460615e-08,2.460615e-08
Synset('allen_wrench.n.01'),tool,1,0,0.0,1.012903,0.391092,12,1,1,nb,1.601034e-08,1.601034e-08
Synset('alligator_wrench.n.01'),tool,1,0,0.0,1.012903,1.517437,16,1,1,nb,1.236022e-10,1.236022e-10
Synset('awl.n.01'),tool,1,2,15.7,0.911613,0.985552,3,1,1,b,2.805608e-07,2.805608e-07
Synset('backsaw.n.01'),tool,1,0,0.0,1.114194,1.110701,7,2,1,nb,1.135172e-08,1.135172e-08
...,...,...,...,...,...,...,...,...,...,...,...,...
Synset('ballet_skirt.n.01'),garm,1,0,0.0,0.947552,0.578283,4,2,2,nb,5.498723e-07,5.498723e-07
Synset('mess_jacket.n.01'),garm,1,0,0.0,1.158120,1.652238,11,3,1,nb,1.454007e-08,1.454007e-08
Synset('long_johns.n.01'),garm,1,0,0.0,1.052836,0.479149,10,1,1,nb,8.935307e-08,8.935307e-08
Synset('undies.n.01'),garm,1,0,0.0,1.158120,0.280880,6,1,1,nb,2.118603e-07,2.118603e-07


In [3]:
# define features and target
features = ['nrdirhypers_x',
            'nrhypos_x',
            'nrpartrels_normalised_x',
            'depthfromtopsynset_normalised_x',
            'glosslength_normalised_x',
            'minwordlength_x',
            'nroflemmas_x',
            'polyscore_max_x']
target = ['vote_x'] # nb / b

In [4]:
# split training set and testing set using K-Flod
def global_model_test(dataset, feature, sized_corpora, target):
    K = 10
    random_seed = 7 # R
    data = dataset.reset_index()
    feature_list = feature + [sized_corpora]
    X = data[feature_list]
    y = data[target]

    K_Flod = StratifiedKFold(n_splits=K, shuffle=True, random_state=random_seed)
    K_Flod.get_n_splits(X, y)
    cohen_kappa = []
    balanced_acc = []
    for train_index, test_index in K_Flod.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # SMOTE algorithm
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # define random forest model
        rf = RandomForestClassifier(random_state=random_seed, max_features='sqrt', n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_depth=50, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)

        # predict and make score
        pipeline = make_pipeline(smote, rf)
        y_pred = pipeline.predict(X_test)

        kappa = cohen_kappa_score(y_test, y_pred)
        cohen_kappa.append(kappa)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        balanced_acc.append(balanced_accuracy)

    results = classification_report(y_test, y_pred, output_dict=True)
    results = pd.DataFrame(results).transpose()

    results['cohen kappa / 10'] = np.mean(cohen_kappa)
    results['balanced acc / 10'] = np.mean(balanced_acc)
    results['global'] = 5

    # importance of features
    importance = rf.feature_importances_
    importance = pd.DataFrame([feature_list, importance]).transpose()
    importance = importance.rename(columns={0:'feature', 1:'importance'}).sort_values('importance', ascending=False)

    return results, importance

## KBNC: 1 million

In [33]:
sized_corpora = 'kbnc_1m_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.866667  0.764706  0.812500  17.000000          0.687594   
 nb             0.941176  0.969697  0.955224  66.000000          0.687594   
 accuracy       0.927711  0.927711  0.927711   0.927711          0.687594   
 macro avg      0.903922  0.867201  0.883862  83.000000          0.687594   
 weighted avg   0.925915  0.927711  0.925991  83.000000          0.687594   
 
               balanced acc / 10  global  
 b                      0.850898       5  
 nb                     0.850898       5  
 accuracy               0.850898       5  
 macro avg              0.850898       5  
 weighted avg           0.850898       5  ,
                            feature importance
 3  depthfromtopsynset_normalised_x   0.432792
 4         glosslength_normalised_x   0.154475
 5                  minwordlength_x   0.122853
 2          nrpartrels_normalised_x   0.098376
 1                        nrhypos_x   0.06

## CABNC: 1 million, 2.4 million

In [34]:
sized_corpora = 'cabnc_per_100k_1m'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.764706  0.764706  0.764706  17.000000          0.674959   
 nb             0.939394  0.939394  0.939394  66.000000          0.674959   
 accuracy       0.903614  0.903614  0.903614   0.903614          0.674959   
 macro avg      0.852050  0.852050  0.852050  83.000000          0.674959   
 weighted avg   0.903614  0.903614  0.903614  83.000000          0.674959   
 
               balanced acc / 10  global  
 b                      0.838758       5  
 nb                     0.838758       5  
 accuracy               0.838758       5  
 macro avg              0.838758       5  
 weighted avg           0.838758       5  ,
                            feature importance
 3  depthfromtopsynset_normalised_x   0.414127
 4         glosslength_normalised_x   0.163589
 5                  minwordlength_x   0.120528
 2          nrpartrels_normalised_x   0.103535
 1                        nrhypos_x   0.05

In [35]:
sized_corpora = 'cabnc_per_100k_2_4m'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.777778  0.823529  0.800000  17.000000          0.687207   
 nb             0.953846  0.939394  0.946565  66.000000          0.687207   
 accuracy       0.915663  0.915663  0.915663   0.915663          0.687207   
 macro avg      0.865812  0.881462  0.873282  83.000000          0.687207   
 weighted avg   0.917784  0.915663  0.916546  83.000000          0.687207   
 
               balanced acc / 10  global  
 b                      0.847429       5  
 nb                     0.847429       5  
 accuracy               0.847429       5  
 macro avg              0.847429       5  
 weighted avg           0.847429       5  ,
                            feature importance
 3  depthfromtopsynset_normalised_x   0.415845
 4         glosslength_normalised_x   0.167009
 5                  minwordlength_x   0.116796
 2          nrpartrels_normalised_x   0.099437
 8              cabnc_per_100k_2_4m   0.05

## CHILDES: 5.7 million, 2.4 million, 1 million

In [36]:
sized_corpora = 'childes_1m_rel_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.833333  0.882353  0.857143  17.000000          0.665358   
 nb             0.969231  0.954545  0.961832  66.000000          0.665358   
 accuracy       0.939759  0.939759  0.939759   0.939759          0.665358   
 macro avg      0.901282  0.918449  0.909487  83.000000          0.665358   
 weighted avg   0.941396  0.939759  0.940390  83.000000          0.665358   
 
               balanced acc / 10  global  
 b                      0.842183       5  
 nb                     0.842183       5  
 accuracy               0.842183       5  
 macro avg              0.842183       5  
 weighted avg           0.842183       5  ,
                            feature importance
 3  depthfromtopsynset_normalised_x   0.410704
 4         glosslength_normalised_x   0.153325
 8               childes_1m_rel_sum   0.119867
 5                  minwordlength_x    0.10246
 2          nrpartrels_normalised_x    0.0

In [37]:
sized_corpora = 'childes_2_4m_rel_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.833333  0.882353  0.857143  17.000000          0.669301   
 nb             0.969231  0.954545  0.961832  66.000000          0.669301   
 accuracy       0.939759  0.939759  0.939759   0.939759          0.669301   
 macro avg      0.901282  0.918449  0.909487  83.000000          0.669301   
 weighted avg   0.941396  0.939759  0.940390  83.000000          0.669301   
 
               balanced acc / 10  global  
 b                      0.843092       5  
 nb                     0.843092       5  
 accuracy               0.843092       5  
 macro avg              0.843092       5  
 weighted avg           0.843092       5  ,
                            feature importance
 3  depthfromtopsynset_normalised_x   0.416273
 4         glosslength_normalised_x   0.160002
 5                  minwordlength_x   0.113281
 8             childes_2_4m_rel_sum   0.094002
 2          nrpartrels_normalised_x   0.07

In [38]:
sized_corpora = 'childes_5_7m_rel_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.833333  0.882353  0.857143  17.000000          0.673659   
 nb             0.969231  0.954545  0.961832  66.000000          0.673659   
 accuracy       0.939759  0.939759  0.939759   0.939759          0.673659   
 macro avg      0.901282  0.918449  0.909487  83.000000          0.673659   
 weighted avg   0.941396  0.939759  0.940390  83.000000          0.673659   
 
               balanced acc / 10  global  
 b                      0.844433       5  
 nb                     0.844433       5  
 accuracy               0.844433       5  
 macro avg              0.844433       5  
 weighted avg           0.844433       5  ,
                            feature importance
 3  depthfromtopsynset_normalised_x   0.412876
 4         glosslength_normalised_x   0.158908
 5                  minwordlength_x   0.115648
 8             childes_5_7m_rel_sum   0.094229
 2          nrpartrels_normalised_x   0.07

## BNC: 100 million, 5.7 million, 2.4 million, 1 million

In [5]:
sized_corpora = 'bnc_1m_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.823529  0.823529  0.823529  17.000000          0.687879   
 nb             0.954545  0.954545  0.954545  66.000000          0.687879   
 accuracy       0.927711  0.927711  0.927711   0.927711          0.687879   
 macro avg      0.889037  0.889037  0.889037  83.000000          0.687879   
 weighted avg   0.927711  0.927711  0.927711  83.000000          0.687879   
 
               balanced acc / 10  global  
 b                      0.846003       5  
 nb                     0.846003       5  
 accuracy               0.846003       5  
 macro avg              0.846003       5  
 weighted avg           0.846003       5  ,
                            feature importance
 3  depthfromtopsynset_normalised_x   0.417566
 4         glosslength_normalised_x    0.15189
 5                  minwordlength_x   0.128142
 2          nrpartrels_normalised_x   0.100405
 1                        nrhypos_x   0.06

In [6]:
sized_corpora = 'bnc_2_4m_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.875000  0.823529  0.848485  17.000000          0.684052   
 nb             0.955224  0.969697  0.962406  66.000000          0.684052   
 accuracy       0.939759  0.939759  0.939759   0.939759          0.684052   
 macro avg      0.915112  0.896613  0.905445  83.000000          0.684052   
 weighted avg   0.938792  0.939759  0.939073  83.000000          0.684052   
 
               balanced acc / 10  global  
 b                      0.849977       5  
 nb                     0.849977       5  
 accuracy               0.849977       5  
 macro avg              0.849977       5  
 weighted avg           0.849977       5  ,
                            feature importance
 3  depthfromtopsynset_normalised_x   0.409364
 4         glosslength_normalised_x    0.15584
 5                  minwordlength_x   0.118867
 2          nrpartrels_normalised_x   0.107318
 8                     bnc_2_4m_sum   0.06

In [6]:
sized_corpora = 'bnc_5_7m_sum'
global_model_test(data, features, sized_corpora, target)

KeyError: "['bnc_5_7m_sum'] not in index"

In [8]:
sized_corpora = 'bnc_100m_sum'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.833333  0.882353  0.857143  17.000000          0.686256   
 nb             0.969231  0.954545  0.961832  66.000000          0.686256   
 accuracy       0.939759  0.939759  0.939759   0.939759          0.686256   
 macro avg      0.901282  0.918449  0.909487  83.000000          0.686256   
 weighted avg   0.941396  0.939759  0.940390  83.000000          0.686256   
 
               balanced acc / 10  global  
 b                      0.847429       5  
 nb                     0.847429       5  
 accuracy               0.847429       5  
 macro avg              0.847429       5  
 weighted avg           0.847429       5  ,
                            feature importance
 3  depthfromtopsynset_normalised_x    0.38639
 2          nrpartrels_normalised_x   0.139271
 4         glosslength_normalised_x   0.137614
 5                  minwordlength_x   0.107792
 8                     bnc_100m_sum   0.10

## Google Ngram

In [7]:
sized_corpora = 'ngram_2009_2019_updata_sleep'
global_model_test(data, features, sized_corpora, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.777778  0.823529  0.800000  17.000000          0.688993   
 nb             0.953846  0.939394  0.946565  66.000000          0.688993   
 accuracy       0.915663  0.915663  0.915663   0.915663          0.688993   
 macro avg      0.865812  0.881462  0.873282  83.000000          0.688993   
 weighted avg   0.917784  0.915663  0.916546  83.000000          0.688993   
 
               balanced acc / 10  global  
 b                       0.84584       5  
 nb                      0.84584       5  
 accuracy                0.84584       5  
 macro avg               0.84584       5  
 weighted avg            0.84584       5  ,
                            feature importance
 3  depthfromtopsynset_normalised_x   0.418358
 4         glosslength_normalised_x   0.162561
 5                  minwordlength_x   0.122179
 2          nrpartrels_normalised_x   0.077335
 8     ngram_2009_2019_updata_sleep   0.07

## Test: next stage feature engineering

In [7]:
# read the pre-processed data all agreed
data = pd.read_csv('./next_stage_size_differential_features.csv', index_col=0)

# generate the local dataframe for different domains
local_fruit = data.loc[data['domain_x']=='fruit'].reset_index()
local_tool = data.loc[data['domain_x']=='tool'].reset_index()
local_music = data.loc[data['domain_x']=='music'].reset_index()
local_furniture = data.loc[data['domain_x']=='furn'].reset_index()
local_garments = data.loc[data['domain_x']=='garm'].reset_index()

data

Unnamed: 0_level_0,Synsets,domain_x,ngrams_last_mean,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,...,min_5_7m,min_100m,sum_1m,sum_2_4m,sum_5_7m,sum_100m,avg_bnc,avg_childes,avg_cabnc,avg_kbnc
norm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
adjustable_wrench,Synset('adjustable_wrench.n.01'),tool,-18.854402,1,7,0.0,1.012903,0.563173,17,2,...,0,0,0,0,0,0,0.00,0.000000,0.0,0.0
allen_wrench,Synset('allen_wrench.n.01'),tool,-18.389126,1,0,0.0,1.012903,0.391092,12,1,...,0,0,0,0,0,0,0.00,0.000000,0.0,0.0
alligator_wrench,Synset('alligator_wrench.n.01'),tool,-22.354690,1,0,0.0,1.012903,1.517437,16,1,...,0,0,0,0,0,0,0.00,0.000000,0.0,0.0
awl,Synset('awl.n.01'),tool,-14.809997,1,2,15.7,0.911613,0.985552,3,1,...,0,39,1,1,3,39,11.00,0.000000,0.0,0.0
backsaw,Synset('backsaw.n.01'),tool,-18.327476,1,0,0.0,1.114194,1.110701,7,2,...,0,0,0,0,0,0,0.00,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ballet_skirt,Synset('ballet_skirt.n.01'),garm,-16.872435,1,0,0.0,0.947552,0.578283,4,2,...,0,33,0,0,1,33,8.25,0.333333,0.0,0.0
mess_jacket,Synset('mess_jacket.n.01'),garm,-18.743772,1,0,0.0,1.158120,1.652238,11,3,...,0,0,0,0,0,0,0.00,0.000000,0.0,0.0
long_johns,Synset('long_johns.n.01'),garm,-16.882419,1,0,0.0,1.052836,0.479149,10,1,...,0,0,0,0,0,0,0.00,0.000000,0.0,0.0
undies,Synset('undies.n.01'),garm,-16.330160,1,0,0.0,1.158120,0.280880,6,1,...,0,33,1,1,1,33,9.00,0.000000,0.0,0.0


In [6]:
# split training set and testing set using K-Flod
def new_features_global_model_test(dataset, feature, new_features, target):
    K = 10
    random_seed = 7 # R
    data = dataset.reset_index()
    feature_list = feature + new_features
    X = data[feature_list]
    y = data[target]

    K_Flod = StratifiedKFold(n_splits=K, shuffle=True, random_state=random_seed)
    K_Flod.get_n_splits(X, y)
    cohen_kappa = []
    balanced_acc = []
    for train_index, test_index in K_Flod.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # SMOTE algorithm
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # define random forest model
        rf = RandomForestClassifier(random_state=random_seed, max_features='sqrt', n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_depth=50, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)

        # predict and make score
        pipeline = make_pipeline(smote, rf)
        y_pred = pipeline.predict(X_test)

        kappa = cohen_kappa_score(y_test, y_pred)
        cohen_kappa.append(kappa)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        balanced_acc.append(balanced_accuracy)

    results = classification_report(y_test, y_pred, output_dict=True)
    results = pd.DataFrame(results).transpose()

    results['cohen kappa / 10'] = np.mean(cohen_kappa)
    results['balanced acc / 10'] = np.mean(balanced_acc)
    results['global'] = 5

    # importance of features
    importance = rf.feature_importances_
    importance = pd.DataFrame([feature_list, importance]).transpose()
    importance = importance.rename(columns={0:'feature', 1:'importance'}).sort_values('importance', ascending=False)

    return results, importance

## feature selection

In [45]:
# top-down method
new_feature = ['max_1m','max_2_4m','max_5_7m','max_100m','min_1m','min_2_4m','min_5_7m','min_100m','sum_1m','sum_2_4m','sum_5_7m','sum_100m','avg_bnc','avg_childes','avg_cabnc','avg_kbnc']
new_features_global_model_test(data, features, new_feature, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.823529  0.823529  0.823529  17.000000          0.638242   
 nb             0.954545  0.954545  0.954545  66.000000          0.638242   
 accuracy       0.927711  0.927711  0.927711   0.927711          0.638242   
 macro avg      0.889037  0.889037  0.889037  83.000000          0.638242   
 weighted avg   0.927711  0.927711  0.927711  83.000000          0.638242   
 
               balanced acc / 10  global  
 b                      0.831217       5  
 nb                     0.831217       5  
 accuracy               0.831217       5  
 macro avg              0.831217       5  
 weighted avg           0.831217       5  ,
                             feature importance
 3   depthfromtopsynset_normalised_x   0.276916
 4          glosslength_normalised_x   0.111362
 2           nrpartrels_normalised_x    0.10688
 5                   minwordlength_x    0.07629
 20                          avg_bnc 

In [46]:
# remove min_1m
new_feature = ['max_1m','max_2_4m','max_5_7m','max_100m','min_2_4m','min_5_7m','min_100m','sum_1m','sum_2_4m','sum_5_7m','sum_100m','avg_bnc','avg_childes','avg_cabnc','avg_kbnc']
new_features_global_model_test(data, features, new_feature, target)

(              precision    recall  f1-score    support  cohen kappa / 10  \
 b              0.875000  0.823529  0.848485  17.000000          0.642998   
 nb             0.955224  0.969697  0.962406  66.000000          0.642998   
 accuracy       0.939759  0.939759  0.939759   0.939759          0.642998   
 macro avg      0.915112  0.896613  0.905445  83.000000          0.642998   
 weighted avg   0.938792  0.939759  0.939073  83.000000          0.642998   
 
               balanced acc / 10  global  
 b                      0.830526       5  
 nb                     0.830526       5  
 accuracy               0.830526       5  
 macro avg              0.830526       5  
 weighted avg           0.830526       5  ,
                             feature importance
 3   depthfromtopsynset_normalised_x   0.280757
 4          glosslength_normalised_x   0.116417
 2           nrpartrels_normalised_x   0.104133
 5                   minwordlength_x   0.075361
 18                         sum_100m 

In [48]:
# bottom-up method +1
new_feature = ['max_1m','max_2_4m','max_5_7m','max_100m','min_1m','min_2_4m','min_5_7m','min_100m','sum_1m','sum_2_4m','sum_5_7m','sum_100m','avg_bnc','avg_childes','avg_cabnc','avg_kbnc']
max_kappa = 0
for i in new_feature:
    print("feature: " + i)
    new = ['bnc_100m_sum'] + [i]
    res, f_importance = new_features_global_model_test(data, features, new, target)
    if res['cohen kappa / 10'][0] > max_kappa:
        max_kappa = res['cohen kappa / 10'][0]
        print(max_kappa)
        print(i)
        print(res)
        print(f_importance)

feature: max_1m
0.6992007014562882
max_1m
              precision    recall  f1-score    support  cohen kappa / 10  \
b              0.882353  0.882353  0.882353  17.000000          0.699201   
nb             0.969697  0.969697  0.969697  66.000000          0.699201   
accuracy       0.951807  0.951807  0.951807   0.951807          0.699201   
macro avg      0.926025  0.926025  0.926025  83.000000          0.699201   
weighted avg   0.951807  0.951807  0.951807  83.000000          0.699201   

              balanced acc / 10  global  
b                      0.857515       5  
nb                     0.857515       5  
accuracy               0.857515       5  
macro avg              0.857515       5  
weighted avg           0.857515       5  
                           feature importance
3  depthfromtopsynset_normalised_x   0.361696
4         glosslength_normalised_x   0.134097
2          nrpartrels_normalised_x   0.132681
5                  minwordlength_x   0.105417
8                  

In [49]:
# bottom-up method +2 (base: avg_bnc)
new_feature = ['max_1m', 'max_2_4m','max_5_7m','max_100m','min_2_4m','min_5_7m','min_100m','sum_1m','sum_2_4m','sum_5_7m','sum_100m','avg_childes','avg_cabnc','avg_kbnc']
max_kappa = 0.706174
for i in new_feature:
    print("feature: " + i)
    new = ['bnc_100m_sum', 'avg_bnc'] + [i]
    res, f_importance = new_features_global_model_test(data, features, new, target)
    if res['cohen kappa / 10'][0] > max_kappa:
        max_kappa = res['cohen kappa / 10'][0]
        print(max_kappa)
        print(i)
        print(res)
        print(f_importance)

feature: max_1m
0.7079283480806089
max_1m
              precision    recall  f1-score    support  cohen kappa / 10  \
b              0.888889  0.941176  0.914286  17.000000          0.707928   
nb             0.984615  0.969697  0.977099  66.000000          0.707928   
accuracy       0.963855  0.963855  0.963855   0.963855          0.707928   
macro avg      0.936752  0.955437  0.945692  83.000000          0.707928   
weighted avg   0.965009  0.963855  0.964234  83.000000          0.707928   

              balanced acc / 10  global  
b                      0.865277       5  
nb                     0.865277       5  
accuracy               0.865277       5  
macro avg              0.865277       5  
weighted avg           0.865277       5  
                            feature importance
3   depthfromtopsynset_normalised_x   0.335542
2           nrpartrels_normalised_x   0.131624
4          glosslength_normalised_x   0.130683
5                   minwordlength_x   0.099291
8             

In [8]:
# bottom-up method +3 (base: avg_bnc, max_1m)
new_feature = ['max_2_4m','max_5_7m','max_100m','min_2_4m','min_5_7m','min_100m','sum_1m','sum_2_4m','sum_5_7m','sum_100m','avg_childes','avg_cabnc','avg_kbnc']
max_kappa = 0.707928
for i in new_feature:
    print("feature: " + i)
    new = ['bnc_100m_sum', 'avg_bnc', 'max_1m'] + [i]
    res, f_importance = new_features_global_model_test(data, features, new, target)
    if res['cohen kappa / 10'][0] > max_kappa:
        max_kappa = res['cohen kappa / 10'][0]
        print(max_kappa)
        print(i)
        print(res)
        print(f_importance)

feature: max_2_4m
feature: max_5_7m
feature: max_100m
feature: min_2_4m
feature: min_5_7m
feature: min_100m
feature: sum_1m
feature: sum_2_4m
feature: sum_5_7m
feature: sum_100m
feature: avg_childes
feature: avg_cabnc
feature: avg_kbnc
