In [1]:
import pylangacq
import pandas as pd
import numpy as np
from nltk.corpus import wordnet
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score
from nltk.corpus.reader.bnc import BNCCorpusReader

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Run the download below once to fetch (or update) WordNet, then keep it commented out.
import nltk
# nltk.download('wordnet', download_dir='./')
# Add this directory to the locations NLTK searches for its data.
nltk.data.path.append('../../')

In [3]:
# Load the Gold Standard dataset and keep only the columns the local models need.
GS_all_agreed = pd.read_csv('../../sampled_count/GS_All_Agreed.csv', index_col=0)
features_target = ['Synsets','domain_x',
                   'nrdirhypers_x',
                   'nrhypos_x',
                   'nrpartrels_normalised_x',
                   'depthfromtopsynset_normalised_x',
                   'glosslength_normalised_x',
                   'minwordlength_x',
                   'nroflemmas_x',
                   'polyscore_max_x',
                   'vote_x']
# Take an explicit copy: columns are added to GS_adopt below (and by the sampling
# functions), and assigning into a bare column selection triggers pandas'
# SettingWithCopyWarning.
GS_adopt = GS_all_agreed[features_target].copy()

# Extract the norm from each synset string: the text between the first pair of
# single quotes, cut at the first dot.
GS_adopt['norm'] = GS_adopt['Synsets'].str.split("'").str[1].str.split('.').str[0]
# Move 'norm' to the first column and replace the index with a fresh 0..n-1 range.
GS_adopt = GS_adopt.set_index('norm').reset_index()

In [4]:
# Match a norm against a corpus word-count table via its WordNet lemmas
def sum_lemmas(norm, corpora):
    """Sum the corpus counts of every WordNet lemma related to `norm`.

    Parameters
    ----------
    norm : str
        Word looked up in WordNet.
    corpora : pandas.DataFrame
        Count table whose index holds the words; all values of the rows
        whose index equals a lemma are added up.

    Returns
    -------
    The accumulated count. A lemma that occurs in several synsets is
    counted once per occurrence; 0 is returned when WordNet yields no lemmas.
    """
    # Flatten the lemma names of all synsets of the norm (duplicates kept).
    lemma_names = [
        str(lemma.name())
        for synset in wordnet.synsets(norm)
        for lemma in synset.lemmas()
    ]

    total = 0
    for name in lemma_names:
        matching_rows = corpora.loc[corpora.index == name]
        total += matching_rows.to_numpy().sum()
    return total

In [5]:
# Predictor columns shared by every local model; the corpus-frequency column
# is appended separately when a model is evaluated.
features = [
    'nrdirhypers_x', 'nrhypos_x',
    'nrpartrels_normalised_x', 'depthfromtopsynset_normalised_x',
    'glosslength_normalised_x', 'minwordlength_x',
    'nroflemmas_x', 'polyscore_max_x',
]
# Label column (nb / b).
target = ['vote_x']

In [10]:
# Evaluate one domain's model with stratified K-fold cross-validation
def local_model_test(dataset, feature, sized_corpora, target, n_splits=10, random_seed=7):
    """Cross-validate a SMOTE + random-forest classifier on one domain.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to evaluate; must contain the columns named in `feature`,
        the `sized_corpora` column and the `target` column.
    feature : list of str
        Names of the base predictor columns.
    sized_corpora : str
        Name of the corpus-frequency column appended to the predictors.
    target : str or list of str
        Name of the label column (a one-element list is accepted).
    n_splits : int, default 10
        Number of stratified folds.
    random_seed : int, default 7
        Seed used for fold shuffling, SMOTE and the random forest.

    Returns
    -------
    results : pandas.DataFrame
        Classification report computed on the pooled out-of-fold
        predictions, with three constant columns added:
        'cohen kappa / 10' and 'balanced acc / 10' (means over the folds;
        the labels are kept even if `n_splits` differs from 10) and 'global'.
    importance : pandas.DataFrame
        Empty placeholder; feature importances are not computed.

    Raises
    ------
    ValueError
        If `target` selects more than one column.
    """
    data = dataset.reset_index()
    X = data[feature + [sized_corpora]]
    y = data[target]
    # A list-valued `target` selects a one-column frame; the estimators expect
    # a 1-D label vector, so unwrap it.
    if isinstance(y, pd.DataFrame):
        if y.shape[1] != 1:
            raise ValueError('target must name exactly one label column')
        y = y.iloc[:, 0]

    k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    cohen_kappa = []
    balanced_acc = []
    pooled_true = []
    pooled_pred = []
    for train_index, test_index in k_fold.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Oversample the minority class in the training fold only.
        smote = SMOTE(random_state=random_seed, k_neighbors=2)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        rf = RandomForestClassifier(
            random_state=random_seed,
            max_features='sqrt',
            n_estimators=1400,
            min_samples_split=2,
            min_samples_leaf=1,
            max_depth=50,
            oob_score=True,
            criterion='gini',
            bootstrap=True,
        ).fit(X_train, y_train)

        # Predict with the fitted forest directly: resampling is a training-time
        # step, so there is nothing for a pipeline to do at prediction time.
        y_pred = rf.predict(X_test)

        cohen_kappa.append(cohen_kappa_score(y_test, y_pred))
        balanced_acc.append(balanced_accuracy_score(y_test, y_pred))
        pooled_true.extend(y_test)
        pooled_pred.extend(y_pred)

    # Report over all held-out predictions so the report covers the same
    # folds as the averaged scores (not just the final fold).
    results = classification_report(pooled_true, pooled_pred, output_dict=True)
    results = pd.DataFrame(results).transpose()

    results['cohen kappa / 10'] = np.mean(cohen_kappa)
    results['balanced acc / 10'] = np.mean(balanced_acc)
    # NOTE(review): constant marker column kept from the original code; its
    # meaning is not evident from this notebook.
    results['global'] = 5

    importance = pd.DataFrame() # no importance
    return results, importance

In [12]:
# Slice the gold-standard frame into one sub-frame per domain code.
local_fruit, local_tool, local_music, local_furniture, local_garments = (
    GS_adopt.loc[GS_adopt['domain_x'] == domain_code]
    for domain_code in ('fruit', 'tool', 'music', 'furn', 'garm')
)

local_list = [local_fruit, local_tool, local_music, local_furniture, local_garments]

0.6404582598586586

## KBNC: 50 repeated corpus samplings

In [7]:
# Files of the BNC written for a child audience, known as KBNC
# (selected according to ./BNC_WORLD_INDEX.XLS).
aud_child = [
    'A/A7/A7A.xml', 'A/AB/ABX.xml', 'A/AC/AC5.xml', 'A/AC/AC4.xml', 'A/AC/ACB.xml', 'A/AC/ACV.xml',
    'A/AE/AEB.xml', 'A/AL/ALS.xml', 'A/AM/AMB.xml', 'A/AP/APW.xml', 'A/AT/AT4.xml',
    'B/B0/B0B.xml', 'B/B1/B1S.xml', 'B/B2/B2V.xml', 'B/B2/B2N.xml', 'B/BM/BMU.xml', 'B/BP/BPD.xml',
    'C/C8/C85.xml', 'C/C8/C8N.xml', 'C/CA/CA3.xml', 'C/CA/CAB.xml', 'C/CA/CAX.xml', 'C/CC/CCA.xml',
    'C/CE/CE0.xml', 'C/CE/CEU.xml', 'C/CF/CFJ.xml', 'C/CH/CH4.xml', 'C/CH/CH9.xml', 'C/CH/CHR.xml',
    'E/EF/EFJ.xml',
    'F/FE/FEH.xml', 'F/FN/FNS.xml', 'F/FP/FPT.xml', 'F/FP/FPV.xml', 'F/FS/FSL.xml', 'F/FU/FUB.xml',
    'G/G2/G22.xml', 'G/G2/G23.xml', 'G/G2/G24.xml', 'G/G2/G25.xml',
    'H/H9/H93.xml', 'H/H9/H9E.xml',
]
kbnc_reader = BNCCorpusReader(
    root='../../corpora/BNC/2554.zip/download/Texts/',
    fileids=aud_child,
)
kbnc = kbnc_reader.words()

# One row per word token, ready for random sampling.
kbnc_df = pd.Series(kbnc)
random_seed = 7

In [14]:
# Repeat the sample -> count -> evaluate experiment on KBNC
def multiSample_KBNC(n, seed=None):
    """Run the local-model evaluation on `n` random 1M-word KBNC samples.

    For every round a 1,000,000-token sample is drawn from `kbnc_df`,
    lower-cased and counted; the summed lemma counts are written to the
    'kbnc_1m_sum' column of the module-level `GS_adopt` frame (side effect),
    and `local_model_test` is run once per domain.

    Parameters
    ----------
    n : int
        Number of sampling rounds.
    seed : int, optional
        Base seed; round `i` samples with `seed + i` so that reruns are
        reproducible. Defaults to the module-level `random_seed`.

    Returns
    -------
    dict
        `{round: {'1M': {domain: {'kappa': ..., 'balanced acc': ...}}}}`
        with domains 'fruit', 'tool', 'music', 'furniture', 'garments'.
    """
    base_seed = random_seed if seed is None else seed
    sample_size = 1000000
    sized_corpora = 'kbnc_1m_sum'
    # result key -> value used in the 'domain_x' column
    domain_codes = {'fruit': 'fruit', 'tool': 'tool', 'music': 'music',
                    'furniture': 'furn', 'garments': 'garm'}

    result_dict = {}
    for time in range(n):
        kbnc_1m = kbnc_df.sample(n=sample_size, random_state=base_seed + time)

        # Lower-case the sampled words and turn their counts into a frame indexed by word.
        kbnc_1m_freq = nltk.FreqDist(word.lower() for word in kbnc_1m)
        kbnc_1m_df = (
            pd.DataFrame.from_dict(kbnc_1m_freq, orient='index')
            .reset_index()
            .rename(columns={0: 'KBNC_Count', 'index': 'norm'})
            .set_index('norm')
        )
        GS_adopt[sized_corpora] = GS_adopt['norm'].apply(lambda norm: sum_lemmas(norm, kbnc_1m_df))

        # LocalModel test, one model per domain
        result_dict[time] = {'1M': {}}
        for domain, code in domain_codes.items():
            local_df = GS_adopt.loc[GS_adopt['domain_x'] == code]
            result_kappa_acc, _ = local_model_test(local_df, features, sized_corpora, target)
            # The score columns are constant, so the first row carries the value;
            # .iloc avoids label-vs-position ambiguity on the string index.
            result_dict[time]['1M'][domain] = {
                'kappa': result_kappa_acc['cohen kappa / 10'].iloc[0],
                'balanced acc': result_kappa_acc['balanced acc / 10'].iloc[0],
            }

    return result_dict

In [17]:
# Run the KBNC experiment for 50 sampling rounds.
multiResult = multiSample_KBNC(50)

In [18]:
# Display the per-round, per-domain scores.
multiResult

{0: {'1M': {'fruit': {'kappa': 0.864030487787879, 'balanced acc': 0.925},
   'tool': {'kappa': 0.8147334273749367, 'balanced acc': 0.9184848484848486},
   'music': {'kappa': 0.6130535499824552, 'balanced acc': 0.8082142857142858},
   'furniture': {'kappa': 0.37095950208820827,
    'balanced acc': 0.7102941176470587},
   'garments': {'kappa': 0.48134042144323247,
    'balanced acc': 0.7627705627705628}}},
 1: {'1M': {'fruit': {'kappa': 0.864030487787879, 'balanced acc': 0.925},
   'tool': {'kappa': 0.7902051254881443, 'balanced acc': 0.901818181818182},
   'music': {'kappa': 0.6129508823797877, 'balanced acc': 0.8082142857142858},
   'furniture': {'kappa': 0.4300941174728237,
    'balanced acc': 0.7415441176470587},
   'garments': {'kappa': 0.48134042144323247,
    'balanced acc': 0.7627705627705628}}},
 2: {'1M': {'fruit': {'kappa': 0.864030487787879, 'balanced acc': 0.925},
   'tool': {'kappa': 0.8147334273749367, 'balanced acc': 0.9184848484848486},
   'music': {'kappa': 0.6130535499

In [19]:
# Persist the nested result dict; saving a dict through np.save relies on pickling.
np.save('LocalModel_50_times_sampling_KBNC.npy', multiResult, allow_pickle=True)

In [21]:
# Integrate kappa / balanced acc: average the per-domain scores of every round.
# A new dict is built instead of overwriting `multiResult` in place, so the
# per-domain results stay available and this cell can be re-run safely.
domains = ['fruit', 'tool', 'music', 'furniture', 'garments']
corpus_sizes = ['1M']

# Accept either the in-memory dict or the 0-d object array returned by np.load.
per_round_results = multiResult.item() if isinstance(multiResult, np.ndarray) else multiResult

multiResult_integration = {
    k: {
        size: {
            'kappa': np.mean([v[size][domain]['kappa'] for domain in domains]),
            'balanced acc': np.mean([v[size][domain]['balanced acc'] for domain in domains]),
        }
        for size in corpus_sizes
    }
    for k, v in per_round_results.items()
}

multiResult_integration

{0: {'1M': {'kappa': 0.6288234777353423, 'balanced acc': 0.8249527629233512}},
 1: {'1M': {'kappa': 0.6357242069143735, 'balanced acc': 0.8278694295900177}},
 2: {'1M': {'kappa': 0.6405351385913753, 'balanced acc': 0.830861853832442}},
 3: {'1M': {'kappa': 0.6268145076242393, 'balanced acc': 0.8253694295900178}},
 4: {'1M': {'kappa': 0.6422265767828136, 'balanced acc': 0.8329451871657755}},
 5: {'1M': {'kappa': 0.6273696315814962, 'balanced acc': 0.8249527629233512}},
 6: {'1M': {'kappa': 0.6165623005487127, 'balanced acc': 0.8170739750445634}},
 7: {'1M': {'kappa': 0.6332391514111172, 'balanced acc': 0.8240084988540872}},
 8: {'1M': {'kappa': 0.6336177590296236, 'balanced acc': 0.8282860962566845}},
 9: {'1M': {'kappa': 0.6291574648329255, 'balanced acc': 0.8247444295900179}},
 10: {'1M': {'kappa': 0.6301952636354302, 'balanced acc': 0.8233456200662083}},
 11: {'1M': {'kappa': 0.6340351364718081, 'balanced acc': 0.8260323083778965}},
 12: {'1M': {'kappa': 0.6340351364718081, 'balanced

In [22]:
# Persist the domain-averaged scores; saving a dict through np.save relies on pickling.
np.save('LocalModel_inte_50_times_sampling_KBNC.npy', multiResult_integration, allow_pickle=True)

## CABNC: 50 repeated corpus samplings

In [None]:
# Read the CABNC archive as CHAT data and collect its word tokens.
CABNC_full = pylangacq.read_chat('../../corpora/CABNC/CABNC.zip')
words = CABNC_full.words()
# One row per word token, ready for random sampling.
cabnc_df = pd.Series(words)
random_seed = 7

In [26]:
# Repeat the sample -> count -> evaluate experiment on CABNC
def multiSample_CABNC(n, seed=None):
    """Run the local-model evaluation on `n` rounds of random CABNC samples.

    Every round draws a 2,400,000-token and a 1,000,000-token sample from
    `cabnc_df`, lower-cases and counts the words, and stores the summed lemma
    frequency per 100,000 words in the 'cabnc_per_100k_2_4m' /
    'cabnc_per_100k_1m' columns of the module-level `GS_adopt` frame
    (side effect). `local_model_test` is then run per corpus size and domain.

    Parameters
    ----------
    n : int
        Number of sampling rounds.
    seed : int, optional
        Base seed; round `i` uses a generator seeded with `seed + i` so that
        reruns are reproducible. Defaults to the module-level `random_seed`.

    Returns
    -------
    dict
        `{round: {size: {domain: {'kappa': ..., 'balanced acc': ...}}}}`
        with sizes '1M', '2_4M' and domains 'fruit', 'tool', 'music',
        'furniture', 'garments'.
    """
    base_seed = random_seed if seed is None else seed
    # (result key, sample size, GS_adopt column), sampled in the original order
    sample_specs = [
        ('2_4M', 2400000, 'cabnc_per_100k_2_4m'),
        ('1M', 1000000, 'cabnc_per_100k_1m'),
    ]
    # result key -> value used in the 'domain_x' column
    domain_codes = {'fruit': 'fruit', 'tool': 'tool', 'music': 'music',
                    'furniture': 'furn', 'garments': 'garm'}

    result_dict = {}
    for time in range(n):
        # One generator per round, shared by both draws so they differ from each other.
        rng = np.random.RandomState(base_seed + time)

        for _, sample_size, sized_corpora in sample_specs:
            sample = cabnc_df.sample(n=sample_size, random_state=rng)

            # Lower-case the sampled words and turn their counts into a frame indexed by word.
            freq = nltk.FreqDist(word.lower() for word in sample)
            freq_df = (
                pd.DataFrame.from_dict(freq, orient='index')
                .reset_index()
                .rename(columns={0: 'CABNC_Count', 'index': 'norm'})
                .set_index('norm')
            )

            # cabnc_per_100k: frequency of all lemmas per synset, per 100,000 words of the sample
            GS_adopt[sized_corpora] = GS_adopt['norm'].apply(
                lambda norm: (sum_lemmas(norm, freq_df) / sample_size) * 100000
            )

        # LocalModel test, one model per corpus size and domain
        result_dict[time] = {'1M': {}, '2_4M': {}}
        for size_key, _, sized_corpora in reversed(sample_specs):
            for domain, code in domain_codes.items():
                local_df = GS_adopt.loc[GS_adopt['domain_x'] == code]
                result_kappa_acc, _ = local_model_test(local_df, features, sized_corpora, target)
                # The score columns are constant, so the first row carries the value;
                # .iloc avoids label-vs-position ambiguity on the string index.
                result_dict[time][size_key][domain] = {
                    'kappa': result_kappa_acc['cohen kappa / 10'].iloc[0],
                    'balanced acc': result_kappa_acc['balanced acc / 10'].iloc[0],
                }

    return result_dict

In [27]:
# Run the CABNC experiment for 50 sampling rounds.
multiResult = multiSample_CABNC(50)

In [61]:
# Persist the nested result dict; saving a dict through np.save relies on pickling.
np.save('LocalModel_50_times_sampling_CABNC.npy', multiResult, allow_pickle=True)

In [62]:
# Reload the saved results. np.load returns an array wrapping the pickled dict,
# so .item() is needed to get the dict back.
# NOTE(review): allow_pickle=True executes pickled content; only load files you created yourself.
multiResult = np.load('LocalModel_50_times_sampling_CABNC.npy', allow_pickle=True)
multiResult.item()

In [63]:
# Integrate kappa / balanced acc: average the per-domain scores of every round
# for each corpus size. A new dict is built instead of overwriting the loaded
# results in place, so the per-domain results stay available and this cell can
# be re-run safely.
domains = ['fruit', 'tool', 'music', 'furniture', 'garments']
corpus_sizes = ['1M', '2_4M']

# Accept either the in-memory dict or the 0-d object array returned by np.load.
per_round_results = multiResult.item() if isinstance(multiResult, np.ndarray) else multiResult

multiResult_integration = {
    k: {
        size: {
            'kappa': np.mean([v[size][domain]['kappa'] for domain in domains]),
            'balanced acc': np.mean([v[size][domain]['balanced acc'] for domain in domains]),
        }
        for size in corpus_sizes
    }
    for k, v in per_round_results.items()
}

multiResult_integration

{0: {'1M': {'fruit': {'kappa': 0.8963526725601545,
    'balanced acc': 0.9433333333333334},
   'tool': {'kappa': 0.7902051254881443, 'balanced acc': 0.901818181818182},
   'music': {'kappa': 0.5400187906241665, 'balanced acc': 0.7694642857142857},
   'furniture': {'kappa': 0.41717104054974674,
    'balanced acc': 0.7196691176470588},
   'garments': {'kappa': 0.5492128056896517,
    'balanced acc': 0.7886363636363636}},
  '2_4M': {'fruit': {'kappa': 0.8963526725601545,
    'balanced acc': 0.9433333333333334},
   'tool': {'kappa': 0.8137104489934679, 'balanced acc': 0.9184848484848486},
   'music': {'kappa': 0.5918317074370834, 'balanced acc': 0.7957142857142857},
   'furniture': {'kappa': 0.43597273524794533,
    'balanced acc': 0.7386029411764705},
   'garments': {'kappa': 0.49662785173066276,
    'balanced acc': 0.7698051948051947}}},
 1: {'1M': {'fruit': {'kappa': 0.8963526725601545,
    'balanced acc': 0.9433333333333334},
   'tool': {'kappa': 0.7902051254881443, 'balanced acc': 0.9

In [66]:
# Persist the domain-averaged scores; saving a dict through np.save relies on pickling.
np.save('LocalModel_inte_50_times_sampling_CABNC.npy', multiResult_integration, allow_pickle=True)

{0: {'1M': {'kappa': 0.6385920869823727, 'balanced acc': 0.8245842564298446},
  '2_4M': {'kappa': 0.6468990831938627, 'balanced acc': 0.8331881207028266}},
 1: {'1M': {'kappa': 0.6350067738575469, 'balanced acc': 0.8243634772090654},
  '2_4M': {'kappa': 0.6463206338654135, 'balanced acc': 0.833336930226636}},
 2: {'1M': {'kappa': 0.6471206338654134, 'balanced acc': 0.8316702635599695},
  '2_4M': {'kappa': 0.6437244800192595, 'balanced acc': 0.8327119302266361}},
 3: {'1M': {'kappa': 0.6434267587068325, 'balanced acc': 0.8316323847720908},
  '2_4M': {'kappa': 0.6437244800192595, 'balanced acc': 0.8327119302266361}},
 4: {'1M': {'kappa': 0.6328970945873802, 'balanced acc': 0.8228559014514897},
  '2_4M': {'kappa': 0.6437244800192595, 'balanced acc': 0.8327119302266361}},
 5: {'1M': {'kappa': 0.6440296486951275, 'balanced acc': 0.8281475681181563},
  '2_4M': {'kappa': 0.6437244800192595, 'balanced acc': 0.8327119302266361}},
 6: {'1M': {'kappa': 0.6554302107497973, 'balanced acc': 0.834267

## CHILDES: 50 repeated corpus samplings

In [None]:
# Read each CHILDES sub-corpus from its archive.
brown = pylangacq.read_chat("../../corpora/CHILDES/Brown.zip")
belf = pylangacq.read_chat("../../corpora/CHILDES/Belfast.zip")
crutt = pylangacq.read_chat("../../corpora/CHILDES/Cruttenden.zip")
fletcher = pylangacq.read_chat("../../corpora/CHILDES/Fletcher.zip")
forr = pylangacq.read_chat("../../corpora/CHILDES/Forrester.zip")
gath = pylangacq.read_chat("../../corpora/CHILDES/Gathburn.zip")
howe = pylangacq.read_chat("../../corpora/CHILDES/Howe.zip")
kelly = pylangacq.read_chat("../../corpora/CHILDES/KellyQuigley.zip")
korman = pylangacq.read_chat("../../corpora/CHILDES/Korman.zip")
lara = pylangacq.read_chat("../../corpora/CHILDES/Lara.zip")
manc = pylangacq.read_chat("../../corpora/CHILDES/Manchester.zip")
nuff = pylangacq.read_chat("../../corpora/CHILDES/Nuffield.zip")
quigley = pylangacq.read_chat("../../corpora/CHILDES/QuigleyMcNally.zip")
sekali = pylangacq.read_chat("../../corpora/CHILDES/Sekali.zip")
smith = pylangacq.read_chat("../../corpora/CHILDES/Smith.zip")
tommer = pylangacq.read_chat("../../corpora/CHILDES/Tommerdahl.zip")

# Chain the sub-corpora into a single reader, in the order listed.
corpora = [
    brown, belf, crutt, fletcher,
    forr, gath, howe, kelly,
    korman, lara, manc, nuff,
    quigley, sekali, smith, tommer,
]
reader = pylangacq.Reader()
for item in corpora:
    reader.append(item)

# Word tokens of the combined corpus, the pool for the 5.7m, 2.4m and 1m samples.
words = reader.words()
childes_df = pd.Series(words)
random_seed = 7

In [73]:
# Repeat the sample -> count -> evaluate experiment on CHILDES
def multiSample_CHILDES(n, seed=None):
    """Run the local-model evaluation on `n` rounds of random CHILDES samples.

    Every round draws a 5,700,000-token sample (with replacement), a
    2,400,000-token and a 1,000,000-token sample (both without replacement)
    from `childes_df`, lower-cases and counts the words, and stores the summed
    lemma count divided by the sample's total word count in the
    'childes_5_7m_rel_sum' / 'childes_2_4m_rel_sum' / 'childes_1m_rel_sum'
    columns of the module-level `GS_adopt` frame (side effect).
    `local_model_test` is then run per corpus size and domain.

    Parameters
    ----------
    n : int
        Number of sampling rounds.
    seed : int, optional
        Base seed; round `i` uses a generator seeded with `seed + i` so that
        reruns are reproducible. Defaults to the module-level `random_seed`.

    Returns
    -------
    dict
        `{round: {size: {domain: {'kappa': ..., 'balanced acc': ...}}}}`
        with sizes '1M', '2_4M', '5_7M' and domains 'fruit', 'tool', 'music',
        'furniture', 'garments'.
    """
    base_seed = random_seed if seed is None else seed
    # (result key, sample size, sample with replacement, GS_adopt column),
    # sampled in the original order.
    # NOTE(review): only the 5.7M draw uses replacement, presumably because the
    # corpus holds fewer than 5.7M tokens; confirm.
    sample_specs = [
        ('5_7M', 5700000, True, 'childes_5_7m_rel_sum'),
        ('2_4M', 2400000, False, 'childes_2_4m_rel_sum'),
        ('1M', 1000000, False, 'childes_1m_rel_sum'),
    ]
    # result key -> value used in the 'domain_x' column
    domain_codes = {'fruit': 'fruit', 'tool': 'tool', 'music': 'music',
                    'furniture': 'furn', 'garments': 'garm'}

    result_dict = {}
    for time in range(n):
        # One generator per round, shared by all draws so they differ from each other.
        rng = np.random.RandomState(base_seed + time)

        for _, sample_size, with_replacement, sized_corpora in sample_specs:
            sample = childes_df.sample(n=sample_size, replace=with_replacement, random_state=rng)

            # Lower-case the sampled words and turn their counts into a frame indexed by word.
            freq = nltk.FreqDist(word.lower() for word in sample)
            freq_df = (
                pd.DataFrame.from_dict(freq, orient='index')
                .reset_index()
                .rename(columns={0: 'CHILDES_Count', 'index': 'norm'})
                .set_index('norm')
            )

            # childes_rel_sum: sum of all instances of each lemma per synset,
            # divided by the total number of words in the sample
            total_count = freq_df['CHILDES_Count'].sum()
            GS_adopt[sized_corpora] = GS_adopt['norm'].apply(
                lambda norm: sum_lemmas(norm, freq_df) / total_count
            )

        # LocalModel test, one model per corpus size and domain
        result_dict[time] = {'1M': {}, '2_4M': {}, '5_7M': {}}
        for size_key, _, _, sized_corpora in reversed(sample_specs):
            for domain, code in domain_codes.items():
                local_df = GS_adopt.loc[GS_adopt['domain_x'] == code]
                result_kappa_acc, _ = local_model_test(local_df, features, sized_corpora, target)
                # The score columns are constant, so the first row carries the value;
                # .iloc avoids label-vs-position ambiguity on the string index.
                result_dict[time][size_key][domain] = {
                    'kappa': result_kappa_acc['cohen kappa / 10'].iloc[0],
                    'balanced acc': result_kappa_acc['balanced acc / 10'].iloc[0],
                }

    return result_dict

In [77]:
# Run the CHILDES experiment for 50 sampling rounds.
multiResult = multiSample_CHILDES(50)

In [78]:
# Display the per-round, per-size, per-domain scores.
multiResult

{0: {'1M': {'fruit': {'kappa': 0.8815743474370018,
    'balanced acc': 0.9349999999999999},
   'tool': {'kappa': 0.7902051254881443, 'balanced acc': 0.901818181818182},
   'music': {'kappa': 0.595493106279205, 'balanced acc': 0.7994642857142857},
   'furniture': {'kappa': 0.49185508818912177,
    'balanced acc': 0.7665441176470588},
   'garments': {'kappa': 0.49721343731624834,
    'balanced acc': 0.765151515151515}},
  '2_4M': {'fruit': {'kappa': 0.8963526725601545,
    'balanced acc': 0.9433333333333334},
   'tool': {'kappa': 0.7902051254881443, 'balanced acc': 0.901818181818182},
   'music': {'kappa': 0.5923955587704285, 'balanced acc': 0.7969642857142857},
   'furniture': {'kappa': 0.49185508818912177,
    'balanced acc': 0.7665441176470588},
   'garments': {'kappa': 0.4661023262051372,
    'balanced acc': 0.7484848484848484}},
  '5_7M': {'fruit': {'kappa': 0.8963526725601545,
    'balanced acc': 0.9433333333333334},
   'tool': {'kappa': 0.7902051254881443, 'balanced acc': 0.901818

In [79]:
# Persist the nested result dict; saving a dict through np.save relies on pickling.
np.save('LocalModel_50_times_sampling_CHILDES.npy', multiResult, allow_pickle=True)

In [None]:
# Reload the saved results. np.load returns an array wrapping the pickled dict,
# so .item() is needed to get the dict back.
# NOTE(review): after this cell `multiResult` is the array, not the dict; code
# that calls dict methods on it must unwrap it with .item() first.
multiResult = np.load('LocalModel_50_times_sampling_CHILDES.npy', allow_pickle=True)
multiResult.item()

In [81]:
# Integrate kappa / balanced acc: average the per-domain scores of every round
# for each corpus size. A new dict is built instead of overwriting
# `multiResult` in place, so the per-domain results stay available and this
# cell can be re-run safely.
domains = ['fruit', 'tool', 'music', 'furniture', 'garments']
corpus_sizes = ['1M', '2_4M', '5_7M']

# Accept either the in-memory dict or the 0-d object array returned by np.load
# (the array has no .items(), so it must be unwrapped first).
per_round_results = multiResult.item() if isinstance(multiResult, np.ndarray) else multiResult

multiResult_integration = {
    k: {
        size: {
            'kappa': np.mean([v[size][domain]['kappa'] for domain in domains]),
            'balanced acc': np.mean([v[size][domain]['balanced acc'] for domain in domains]),
        }
        for size in corpus_sizes
    }
    for k, v in per_round_results.items()
}

multiResult_integration

{0: {'1M': {'kappa': 0.6512682209419444, 'balanced acc': 0.8335956200662082},
  '2_4M': {'kappa': 0.6473821542425973, 'balanced acc': 0.8314289533995416},
  '5_7M': {'kappa': 0.6537621235102349, 'balanced acc': 0.8351047873694932}},
 1: {'1M': {'kappa': 0.6497405670700738, 'balanced acc': 0.8347573847720907},
  '2_4M': {'kappa': 0.6488882949112132, 'balanced acc': 0.8317714540361599},
  '5_7M': {'kappa': 0.650830483529121, 'balanced acc': 0.8319202635599694}},
 2: {'1M': {'kappa': 0.64633435460436, 'balanced acc': 0.8340631207028266},
  '2_4M': {'kappa': 0.6657521192207778, 'balanced acc': 0.8421881207028266},
  '5_7M': {'kappa': 0.6586105171334354, 'balanced acc': 0.8363547873694932}},
 3: {'1M': {'kappa': 0.6506500819347195, 'balanced acc': 0.8348921886936592},
  '2_4M': {'kappa': 0.6514042791523497, 'balanced acc': 0.83352145403616},
  '5_7M': {'kappa': 0.6398680200471659, 'balanced acc': 0.8284002419149477}},
 4: {'1M': {'kappa': 0.6527856777909037, 'balanced acc': 0.83610478736949

In [82]:
# Persist the domain-averaged scores; saving a dict through np.save relies on pickling.
np.save('LocalModel_inte_50_times_sampling_CHILDES.npy', multiResult_integration, allow_pickle=True)

## BNC: 50 repeated corpus samplings

In [84]:
# Open the BNC texts: the fileids pattern selects the .xml files two levels
# below the top-level directories A to K.
bnc_full_reader = BNCCorpusReader(root='../../corpora/BNC/2554.zip/download/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
bnc_full = bnc_full_reader.words()
# One row per word token, ready for random sampling.
bnc_df = pd.Series(bnc_full)
random_seed = 7

In [85]:
# define a function to perform several times sampling

# (result key, column tag, number of sampled tokens), listed in the order the samples are drawn
_BNC_SAMPLE_SIZES = (
    ('100M', '100m', 100000000),
    ('5_7M', '5_7m', 5700000),
    ('2_4M', '2_4m', 2400000),
    ('1M', '1m', 1000000),
)

# (result key, value of GS_adopt['domain_x']) for every local model
_BNC_DOMAINS = (
    ('fruit', 'fruit'),
    ('tool', 'tool'),
    ('music', 'music'),
    ('furniture', 'furn'),
    ('garments', 'garm'),
)


def _bnc_word_counts(sample, count_column):
    """Return a lower-cased word-frequency frame, indexed by 'norm', for one corpus sample."""
    word_count_dict = nltk.FreqDist(word.lower() for word in sample)
    return (
        pd.DataFrame.from_dict(word_count_dict, orient='index')
        .reset_index()
        .rename(columns={'index': 'norm', 0: count_column})
        .set_index('norm')
    )


def multiSample_BNC(n, seed=None):
    """Sample the BNC repeatedly at four corpus sizes and score one local model per domain.

    For every repetition the token Series ``bnc_df`` is sampled (without
    replacement) at 100M, 5.7M, 2.4M and 1M tokens. Each sample is turned into
    a word-frequency table, and ``GS_adopt`` gets one ``bnc_<size>_sum`` column
    holding ``sum_lemmas(norm, table)`` for every norm. ``local_model_test`` is
    then run for every (corpus size, domain) pair.

    Side effect: the ``bnc_<size>_sum`` columns of the module-level ``GS_adopt``
    frame are overwritten on every repetition.

    Parameters
    ----------
    n : int
        Number of sampling repetitions.
    seed : int, optional
        Seed for the sampling generator. When omitted, the module-level
        ``random_seed`` is used, so a plain ``multiSample_BNC(n)`` call is
        reproducible.

    Returns
    -------
    dict
        ``{run: {size: {domain: {'kappa': ..., 'balanced acc': ...}}}}`` with
        ``run`` in ``range(n)``, ``size`` in ``'1M', '2_4M', '5_7M', '100M'``
        and ``domain`` in ``'fruit', 'tool', 'music', 'furniture', 'garments'``.
    """
    # One generator for the whole call: every draw advances its state, so the
    # samples differ between sizes and repetitions but are identical across
    # notebook re-runs with the same seed.
    rng = np.random.RandomState(random_seed if seed is None else seed)

    result_dict = {}
    for run in range(n):
        for _, tag, n_tokens in _BNC_SAMPLE_SIZES:
            sample = bnc_df.sample(n=n_tokens, random_state=rng)
            # create a word count from dictionary into dataframe
            # addition: convert all words into lower case
            word_count_df = _bnc_word_counts(sample, 'BNC_{}_Count'.format(tag))
            # bnc_<size>_sum: summed count, in this sample, of every lemma of
            # every WordNet synset found for the norm (raw counts, not normalised)
            GS_adopt['bnc_{}_sum'.format(tag)] = GS_adopt['norm'].apply(
                lambda norm: sum_lemmas(norm, word_count_df))

        # generate the local dataframe for different domains
        local_frames = [
            (label, GS_adopt.loc[GS_adopt['domain_x'] == code])
            for label, code in _BNC_DOMAINS
        ]

        # LocalModel test: smallest corpus first, one model per domain
        result_dict[run] = {}
        for size_key, tag, _ in reversed(_BNC_SAMPLE_SIZES):
            sized_corpora = 'bnc_{}_sum'.format(tag)
            result_dict[run][size_key] = {}
            for label, local_df in local_frames:
                result_kappa_acc, result_importance = local_model_test(
                    local_df, features, sized_corpora, target)
                result_dict[run][size_key][label] = {
                    'kappa': result_kappa_acc['cohen kappa / 10'][0],
                    'balanced acc': result_kappa_acc['balanced acc / 10'][0],
                }

    return result_dict

In [None]:
# Run the BNC sampling experiment 50 times and store the raw per-domain scores
# on disk before anything else touches them.
multiResult = multiSample_BNC(50)
np.save(
    file='LocalModel_50_times_sampling_BNC.npy',
    arr=multiResult,
    allow_pickle=True,
)
multiResult

In [91]:
# integrate kappa / balanced acc
# Average the per-domain scores of every run into a single kappa / balanced-acc
# pair per corpus size.
# The summary is written into a NEW dict instead of aliasing `multiResult`:
# plain assignment would bind both names to the same object, so the averages
# would overwrite the per-domain results, this cell would raise on re-run, and
# any later cell reading multiResult[...][size][domain] would break.
domains = ['fruit', 'tool', 'music', 'furniture', 'garments']
sizes = ['1M', '2_4M', '5_7M', '100M']

multiResult_integration = {}
for k, v in multiResult.items():
    multiResult_integration[k] = {
        size: {
            'kappa': np.mean([v[size][domain]['kappa'] for domain in domains]),
            'balanced acc': np.mean([v[size][domain]['balanced acc'] for domain in domains]),
        }
        for size in sizes
    }

multiResult_integration

{0: {'1M': {'kappa': 0.6470424644287811, 'balanced acc': 0.8312758785332315},
  '2_4M': {'kappa': 0.6552133260870072, 'balanced acc': 0.8369300674815381},
  '5_7M': {'kappa': 0.6506981870013071, 'balanced acc': 0.834162210338681},
  '100M': {'kappa': 0.6555387301443621, 'balanced acc': 0.8386909218232749}},
 1: {'1M': {'kappa': 0.6346779579645317, 'balanced acc': 0.8254425451998981},
  '2_4M': {'kappa': 0.6449075294725398, 'balanced acc': 0.8339576648841355},
  '5_7M': {'kappa': 0.6531903726195339, 'balanced acc': 0.8361552075375606},
  '100M': {'kappa': 0.6524116250365676, 'balanced acc': 0.8361075884899416}},
 2: {'1M': {'kappa': 0.6381432206646503, 'balanced acc': 0.8251232811306342},
  '2_4M': {'kappa': 0.6715571426239183, 'balanced acc': 0.8447514960529666},
  '5_7M': {'kappa': 0.6442431192265248, 'balanced acc': 0.8318743315508021},
  '100M': {'kappa': 0.6509780762681474, 'balanced acc': 0.8376075884899414}},
 3: {'1M': {'kappa': 0.6531131785585582, 'balanced acc': 0.839027979373

In [92]:
# Persist the per-run BNC averages; the object is a nested dict, so it is stored pickled.
# NOTE(review): reading it back presumably needs np.load(..., allow_pickle=True).item() - confirm.
np.save(
    file='LocalModel_inte_50_times_sampling_BNC.npy',
    arr=multiResult_integration,
    allow_pickle=True,
)

In [41]:
# sanity check
# Collect the music-domain kappa of every (run, corpus size) pair from the raw
# per-domain results, in run order and then corpus-size order.
check = [
    by_domain['music']['kappa']
    for by_size in multiResult.values()
    for by_domain in by_size.values()
]

check

[0.5749485905539664,
 0.5918317074370834,
 0.5710472216525975,
 0.5918317074370834,
 0.5761454329272795,
 0.5918317074370834,
 0.5918317074370834,
 0.5918317074370834,
 0.5749485905539664,
 0.5918317074370834]