In [1]:
import pylangacq
import pandas as pd
import numpy as np
from nltk.corpus import wordnet
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score
from nltk.corpus.reader.bnc import BNCCorpusReader

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Uncomment the download line once to fetch (or update) the WordNet corpus.
import nltk
# nltk.download('wordnet', download_dir='./')
# Add the directory two levels up to NLTK's data search path
# (presumably where the WordNet data was placed -- confirm).
nltk.data.path.append('../../')

In [3]:
# inherit features from Gold Standard dataset
GS_all_agreed = pd.read_csv('../../sampled_count/GS_All_Agreed.csv', index_col=0)
features_target = ['Synsets','domain_x',
                   'nrdirhypers_x',
                   'nrhypos_x',
                   'nrpartrels_normalised_x',
                   'depthfromtopsynset_normalised_x',
                   'glosslength_normalised_x',
                   'minwordlength_x',
                   'nroflemmas_x',
                   'polyscore_max_x',
                   'vote_x']
# Take an explicit copy: new columns are assigned to GS_adopt below and in
# later cells, and writing into a slice of GS_all_agreed is chained assignment
# (SettingWithCopyWarning; the write is not guaranteed to land where expected).
GS_adopt = GS_all_agreed[features_target].copy()

# extract norms from synsets: keep the text between the first pair of single
# quotes, then everything before the first '.'
# (assumes entries look like "Synset('apple.n.01')" -- TODO confirm)
GS_adopt['norm'] = GS_adopt['Synsets'].str.split("'").str[1].str.split('.').str[0]
# set_index + reset_index moves 'norm' to the first column and renumbers rows
GS_adopt = GS_adopt.set_index('norm').reset_index()

In [4]:
# matching norms with corpora
def sum_lemmas(norm, corpora):
    """Total corpus count of every WordNet lemma reachable from `norm`.

    Parameters
    ----------
    norm : str
        Word looked up with ``wordnet.synsets``.
    corpora : pandas.DataFrame
        Count table indexed by word form; all values of the matching rows
        are summed.

    Returns
    -------
    number
        Sum of the counts of all lemma names of all synsets of `norm`.
        A lemma name shared by several synsets contributes once per synset.
        The result is 0 when `norm` has no synsets.
    """
    # flatten the lemma names of every synset, keeping repeats
    lemma_names = [
        str(lemma.name())
        for synset in wordnet.synsets(norm)
        for lemma in synset.lemmas()
    ]

    total = 0
    for name in lemma_names:
        # rows whose index label equals this lemma name (none -> adds zero)
        matching_rows = corpora.loc[corpora.index == name]
        total += matching_rows.to_numpy().sum()
    return total

In [5]:
# predictor columns taken from the Gold Standard table; the corpus-frequency
# column is appended per experiment inside transfer_model_test
features = [
    'nrdirhypers_x', 'nrhypos_x',
    'nrpartrels_normalised_x', 'depthfromtopsynset_normalised_x',
    'glosslength_normalised_x', 'minwordlength_x',
    'nroflemmas_x', 'polyscore_max_x',
]
# label column, kept as a one-element list
target = ['vote_x']  # nb / b

In [9]:
# use four domains to train and the other to test
def transfer_model_test(training, testing, feature, sized_corpora, target):
    """Fit a random forest on `training` and score it on `testing`.

    Parameters
    ----------
    training, testing : pandas.DataFrame
        Rows used for fitting / evaluation. Both must contain the columns in
        `feature`, the `sized_corpora` column and the `target` column(s);
        `testing` must also contain 'domain_x'.
    feature : list of str
        Base predictor columns.
    sized_corpora : str
        Name of the extra corpus-frequency column appended to `feature`.
    target : str or list of str
        Label column. A one-element list is accepted and flattened.

    Returns
    -------
    (results, importance) : tuple of pandas.DataFrame
        `results` is the transposed classification report with three extra
        columns broadcast to every row: 'cohen kappa', 'balanced acc' and
        'unseen' (the 'domain_x' value of the first test row).
        `importance` is an empty placeholder frame; feature importances are
        not computed.
    """
    random_seed = 7 # R

    def labels_of(frame):
        # A one-element `target` list selects a one-column DataFrame; the
        # estimator and the metrics expect a 1-D label vector.
        picked = frame[target]
        if isinstance(picked, pd.DataFrame) and picked.shape[1] == 1:
            return picked.iloc[:, 0]
        return picked

    # drop=True: the old index is not a feature, so do not keep it as a column
    training_data = training.reset_index(drop=True)
    testing_data = testing.reset_index(drop=True)
    feature_list = feature + [sized_corpora]
    X_train = training_data[feature_list]
    y_train = labels_of(training_data)
    X_test = testing_data[feature_list]
    y_test = labels_of(testing_data)

    # SMOTE algorithm: oversample the training split only
    smote = SMOTE(random_state=random_seed, k_neighbors=2)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # define random forest model
    rf = RandomForestClassifier(random_state=random_seed, max_features='sqrt', n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_depth=50, oob_score=True, criterion='gini', bootstrap=True)
    rf.fit(X_train, y_train)

    # Predict with the fitted forest itself. Wrapping the already-fitted steps
    # in a new pipeline and calling predict on it means predicting with a
    # pipeline that was never fitted; resampling is not applied at predict
    # time anyway, so the predictions are the same.
    y_pred = rf.predict(X_test)

    kappa = cohen_kappa_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    results = classification_report(y_test, y_pred, output_dict=True)
    results = pd.DataFrame(results).transpose()

    results['cohen kappa'] = kappa
    results['balanced acc'] = balanced_accuracy
    results['unseen'] = testing['domain_x'].iloc[0]

    importance = pd.DataFrame() # no importance
    return results, importance

## CABNC: 50 repeated corpus samples

In [8]:
# read the CABNC corpus from its zipped CHAT files
CABNC_full = pylangacq.read_chat('../../corpora/CABNC/CABNC.zip')
# word tokens of the whole corpus (presumably one flat list -- confirm against pylangacq)
words = CABNC_full.words()
# wrap the tokens in a Series so fixed-size sub-corpora can be drawn with .sample()
cabnc_df = pd.Series(words)
# NOTE(review): this seed is defined here but is not passed to any .sample() call
random_seed = 7

In [10]:
# define a function to perform several times sampling
def multiSample_CABNC(n, seed=7):
    """Repeat the leave-one-domain-out evaluation on `n` random CABNC sub-corpora.

    For every repetition a 1,000,000-token and a 2,400,000-token sample are
    drawn from `cabnc_df`, converted to lower-cased word counts, and turned
    into a per-100,000-token lemma frequency for each row of `GS_adopt`.
    Each of the five domains is then held out in turn and scored with
    `transfer_model_test`.

    Parameters
    ----------
    n : int
        Number of repetitions.
    seed : int or None, default 7
        Seed of the random generator shared by all draws, so that a rerun
        reproduces the same samples. The default matches the `random_seed`
        value used elsewhere in this notebook. Pass None for unseeded draws.

    Returns
    -------
    dict
        ``{repetition: {'1M' | '2_4M': {domain: {'kappa': ..., 'balanced acc': ...}}}}``

    Notes
    -----
    Writes the columns 'cabnc_per_100k_1m' and 'cabnc_per_100k_2_4m' into the
    module-level `GS_adopt` on every repetition, as the original did.
    """
    # (result key, number of tokens to draw, GS_adopt column for the feature)
    sample_specs = [('1M', 1000000, 'cabnc_per_100k_1m'),
                    ('2_4M', 2400000, 'cabnc_per_100k_2_4m')]
    # result key -> label used in GS_adopt['domain_x']
    domain_codes = {'fruit': 'fruit', 'tool': 'tool', 'music': 'music',
                    'furniture': 'furn', 'garments': 'garm'}
    # One generator for every draw: seeded runs are reproducible, while
    # successive samples still differ from each other.
    rng = np.random.RandomState(seed)

    def frequency_table(sample):
        # create a word count from dictionary into dataframe
        # addition: convert all words into low case
        counts = nltk.FreqDist(word.lower() for word in sample)
        return (pd.DataFrame.from_dict(counts, orient='index')
                .reset_index()
                .rename(columns={0: 'CABNC_Count', 'index': 'norm'})
                .set_index('norm'))

    result_dict = {}
    for time in range(n):
        # cabnc_per_100k: The frequency occurence of all lemmas per synset, per 100,000 words of the CABNC
        for _label, size, column in sample_specs:
            corpus_counts = frequency_table(cabnc_df.sample(n=size, random_state=rng))
            GS_adopt[column] = GS_adopt['norm'].apply(
                lambda norm: (sum_lemmas(norm, corpus_counts) / size) * 100000)

        # TransferModel test: train on four domains, test on the held-out one
        result_dict[time] = {}
        for label, _size, column in sample_specs:
            result_dict[time][label] = {}
            for domain, code in domain_codes.items():
                held_out = GS_adopt['domain_x'] == code
                result_kappa_acc, _importance = transfer_model_test(
                    GS_adopt.loc[~held_out], GS_adopt.loc[held_out],
                    features, column, target)
                # .iloc[0]: the report is indexed by class labels, so a bare
                # [0] would rely on deprecated positional fallback
                result_dict[time][label][domain] = {
                    'kappa': result_kappa_acc['cohen kappa'].iloc[0],
                    'balanced acc': result_kappa_acc['balanced acc'].iloc[0],
                }

    return result_dict

In [14]:
# run the CABNC experiment for 50 repetitions
multiResult = multiSample_CABNC(50)

In [15]:
# display the nested result dict (repetition -> corpus size -> domain -> scores)
multiResult

{0: {'1M': {'fruit': {'kappa': 0.6912419594260266,
    'balanced acc': 0.8713450292397661},
   'tool': {'kappa': 0.8302024439175634, 'balanced acc': 0.9214814814814815},
   'music': {'kappa': 0.5471385125794858, 'balanced acc': 0.7664960948020468},
   'furniture': {'kappa': 0.2743330930064888,
    'balanced acc': 0.7334355828220859},
   'garments': {'kappa': 0.2385663230952504,
    'balanced acc': 0.7059928443649374}},
  '2_4M': {'fruit': {'kappa': 0.6559796437659033,
    'balanced acc': 0.8426368952684742},
   'tool': {'kappa': 0.8302024439175634, 'balanced acc': 0.9214814814814815},
   'music': {'kappa': 0.5471385125794858, 'balanced acc': 0.7664960948020468},
   'furniture': {'kappa': 0.29661573075841385,
    'balanced acc': 0.742638036809816},
   'garments': {'kappa': 0.2742881293253737,
    'balanced acc': 0.7322003577817531}}},
 1: {'1M': {'fruit': {'kappa': 0.6778523489932886,
    'balanced acc': 0.8490164805954279},
   'tool': {'kappa': 0.8302024439175634, 'balanced acc': 0.921

In [16]:
# np.save stores the dict as a pickled 0-d object array; call .item() after np.load to get the dict back
np.save('TransferModel_50_times_sampling_CABNC.npy', multiResult, allow_pickle=True)

In [62]:
# np.load returns the saved dict wrapped in a 0-d object array, so unwrap it
# with .item() before binding the name -- the next cell iterates over
# multiResult with .items(), which an ndarray does not have.
# allow_pickle=True unpickles the file: only load files written by this notebook.
multiResult = np.load('TransferModel_50_times_sampling_CABNC.npy', allow_pickle=True).item()
multiResult

In [17]:
# integrate kappa / balanced acc: average the five per-domain scores of every
# repetition, separately for each corpus size
domains = ['fruit', 'tool', 'music', 'furniture', 'garments']
corpus_sizes = ['1M', '2_4M']

# multiResult is either the dict returned by multiSample_CABNC or, when it was
# reloaded with np.load, a 0-d object array wrapping that dict
per_domain_results = multiResult.item() if isinstance(multiResult, np.ndarray) else multiResult

# Build a new dict instead of aliasing multiResult: overwriting the entries in
# place destroyed the per-domain scores and made this cell fail when re-run.
multiResult_integration = {
    run: {
        size: {
            'kappa': np.mean([scores[size][domain]['kappa'] for domain in domains]),
            'balanced acc': np.mean([scores[size][domain]['balanced acc'] for domain in domains]),
        }
        for size in corpus_sizes
    }
    for run, scores in per_domain_results.items()
}

multiResult_integration

{0: {'1M': {'kappa': 0.516296466404963, 'balanced acc': 0.7997502065420635},
  '2_4M': {'kappa': 0.520844892069348, 'balanced acc': 0.8010905732287142}},
 1: {'1M': {'kappa': 0.5145936428799491, 'balanced acc': 0.7972609234719934},
  '2_4M': {'kappa': 0.5155522712042081, 'balanced acc': 0.798326086253701}},
 2: {'1M': {'kappa': 0.5073199469141623, 'balanced acc': 0.797782322208544},
  '2_4M': {'kappa': 0.5133490087589732, 'balanced acc': 0.7974568675464766}},
 3: {'1M': {'kappa': 0.5031241789254377, 'balanced acc': 0.7925105510493801},
  '2_4M': {'kappa': 0.5155522712042081, 'balanced acc': 0.798326086253701}},
 4: {'1M': {'kappa': 0.5147877694463832, 'balanced acc': 0.7979772491785948},
  '2_4M': {'kappa': 0.509734961689295, 'balanced acc': 0.7958407679544293}},
 5: {'1M': {'kappa': 0.5019687035413225, 'balanced acc': 0.790343543496357},
  '2_4M': {'kappa': 0.5155522712042081, 'balanced acc': 0.798326086253701}},
 6: {'1M': {'kappa': 0.5230132623060562, 'balanced acc': 0.8031993675411

In [18]:
# persist the per-repetition averages (the dict is pickled inside a 0-d object array)
np.save('TransferModel_inte_50_times_sampling_CABNC.npy', multiResult_integration, allow_pickle=True)

## CHILDES: 50 repeated corpus samples

In [19]:
# load the CHILDES sub-corpora, one zipped CHAT archive each
brown = pylangacq.read_chat("../../corpora/CHILDES/Brown.zip")
belf = pylangacq.read_chat("../../corpora/CHILDES/Belfast.zip")
crutt=pylangacq.read_chat("../../corpora/CHILDES/Cruttenden.zip")
fletcher=pylangacq.read_chat("../../corpora/CHILDES/Fletcher.zip")
forr=pylangacq.read_chat("../../corpora/CHILDES/Forrester.zip")
gath=pylangacq.read_chat("../../corpora/CHILDES/Gathburn.zip")
howe=pylangacq.read_chat("../../corpora/CHILDES/Howe.zip")
kelly=pylangacq.read_chat("../../corpora/CHILDES/KellyQuigley.zip")
korman=pylangacq.read_chat("../../corpora/CHILDES/Korman.zip")
lara=pylangacq.read_chat("../../corpora/CHILDES/Lara.zip")
manc=pylangacq.read_chat("../../corpora/CHILDES/Manchester.zip")
nuff=pylangacq.read_chat("../../corpora/CHILDES/Nuffield.zip")
quigley=pylangacq.read_chat("../../corpora/CHILDES/QuigleyMcNally.zip")
sekali=pylangacq.read_chat("../../corpora/CHILDES/Sekali.zip")
smith=pylangacq.read_chat("../../corpora/CHILDES/Smith.zip")
tommer=pylangacq.read_chat("../../corpora/CHILDES/Tommerdahl.zip")

# combine the sub-corpora into a single reader
corpora = [brown, belf, crutt, fletcher, forr, gath, howe, kelly, korman, lara, manc, nuff, quigley, sekali, smith, tommer]
reader = pylangacq.Reader()
for item in corpora:
    reader.append(item)

# prepare the token pool from which the 5.7m, 2.4m and 1m samples are drawn
# (this rebinds `words`, which held the CABNC tokens before)
words = reader.words()
childes_df = pd.Series(words)
# NOTE(review): this seed is defined here but is not passed to any .sample() call
random_seed = 7

In [22]:
# define a function to perform several times sampling
def multiSample_CHILDES(n, seed=7):
    """Repeat the leave-one-domain-out evaluation on `n` random CHILDES sub-corpora.

    For every repetition a 1,000,000-, a 2,400,000- and a 5,700,000-token
    sample are drawn from `childes_df` (the largest one with replacement),
    converted to lower-cased word counts, and turned into a relative lemma
    frequency for each row of `GS_adopt`. Each of the five domains is then
    held out in turn and scored with `transfer_model_test`.

    Parameters
    ----------
    n : int
        Number of repetitions.
    seed : int or None, default 7
        Seed of the random generator shared by all draws, so that a rerun
        reproduces the same samples. The default matches the `random_seed`
        value used elsewhere in this notebook. Pass None for unseeded draws.

    Returns
    -------
    dict
        ``{repetition: {'1M' | '2_4M' | '5_7M': {domain: {'kappa': ..., 'balanced acc': ...}}}}``

    Notes
    -----
    Writes the columns 'childes_1m_rel_sum', 'childes_2_4m_rel_sum' and
    'childes_5_7m_rel_sum' into the module-level `GS_adopt` on every
    repetition, as the original did.
    """
    # (result key, tokens to draw, draw with replacement?, GS_adopt column)
    sample_specs = [('1M', 1000000, False, 'childes_1m_rel_sum'),
                    ('2_4M', 2400000, False, 'childes_2_4m_rel_sum'),
                    ('5_7M', 5700000, True, 'childes_5_7m_rel_sum')]
    # result key -> label used in GS_adopt['domain_x']
    domain_codes = {'fruit': 'fruit', 'tool': 'tool', 'music': 'music',
                    'furniture': 'furn', 'garments': 'garm'}
    # One generator for every draw: seeded runs are reproducible, while
    # successive samples still differ from each other.
    rng = np.random.RandomState(seed)

    def frequency_table(sample):
        # create a word count from dictionary into dataframe
        # addition: convert all words into low case
        counts = nltk.FreqDist(word.lower() for word in sample)
        return (pd.DataFrame.from_dict(counts, orient='index')
                .reset_index()
                .rename(columns={0: 'CHILDES_Count', 'index': 'norm'})
                .set_index('norm'))

    result_dict = {}
    for time in range(n):
        # childes_rel_sum: the sum of all instances of each lemma per synset in
        # the sampled CHILDES corpus, divided by the total number of words in it
        for _label, size, with_replacement, column in sample_specs:
            sample = childes_df.sample(n=size, replace=with_replacement, random_state=rng)
            corpus_counts = frequency_table(sample)
            total_count = corpus_counts['CHILDES_Count'].sum()
            GS_adopt[column] = GS_adopt['norm'].apply(
                lambda norm: sum_lemmas(norm, corpus_counts) / total_count)

        # TransferModel test: train on four domains, test on the held-out one
        result_dict[time] = {}
        for label, _size, _with_replacement, column in sample_specs:
            result_dict[time][label] = {}
            for domain, code in domain_codes.items():
                held_out = GS_adopt['domain_x'] == code
                result_kappa_acc, _importance = transfer_model_test(
                    GS_adopt.loc[~held_out], GS_adopt.loc[held_out],
                    features, column, target)
                # .iloc[0]: the report is indexed by class labels, so a bare
                # [0] would rely on deprecated positional fallback
                result_dict[time][label][domain] = {
                    'kappa': result_kappa_acc['cohen kappa'].iloc[0],
                    'balanced acc': result_kappa_acc['balanced acc'].iloc[0],
                }

    return result_dict

In [None]:
# run the CHILDES experiment for 50 repetitions (rebinds multiResult, replacing the CABNC results)
multiResult = multiSample_CHILDES(50)

In [None]:
# display the nested result dict (repetition -> corpus size -> domain -> scores)
multiResult

In [None]:
# np.save stores the dict as a pickled 0-d object array; call .item() after np.load to get the dict back
np.save('TransferModel_50_times_sampling_CHILDES.npy', multiResult, allow_pickle=True)

In [None]:
# np.load returns the saved dict wrapped in a 0-d object array, so unwrap it
# with .item() before binding the name -- the next cell iterates over
# multiResult with .items(), which an ndarray does not have.
# allow_pickle=True unpickles the file: only load files written by this notebook.
multiResult = np.load('TransferModel_50_times_sampling_CHILDES.npy', allow_pickle=True).item()
multiResult

In [None]:
# integrate kappa / balanced acc: average the five per-domain scores of every
# repetition, separately for each corpus size
domains = ['fruit', 'tool', 'music', 'furniture', 'garments']
corpus_sizes = ['1M', '2_4M', '5_7M']

# multiResult is either the dict returned by multiSample_CHILDES or, when it
# was reloaded with np.load, a 0-d object array wrapping that dict
per_domain_results = multiResult.item() if isinstance(multiResult, np.ndarray) else multiResult

# Build a new dict instead of aliasing multiResult: overwriting the entries in
# place destroyed the per-domain scores and made this cell fail when re-run.
multiResult_integration = {
    run: {
        size: {
            'kappa': np.mean([scores[size][domain]['kappa'] for domain in domains]),
            'balanced acc': np.mean([scores[size][domain]['balanced acc'] for domain in domains]),
        }
        for size in corpus_sizes
    }
    for run, scores in per_domain_results.items()
}

multiResult_integration

In [None]:
# persist the per-repetition averages (the dict is pickled inside a 0-d object array)
np.save('TransferModel_inte_50_times_sampling_CHILDES.npy', multiResult_integration, allow_pickle=True)

## BNC: 50 repeated corpus samples

In [None]:
# BNC reader; the fileids pattern selects .xml files two directory levels below the folders A-K
bnc_full_reader = BNCCorpusReader(root='../../corpora/BNC/2554.zip/download/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
# word tokens of all selected files
bnc_full = bnc_full_reader.words()
# wrap the tokens in a Series so fixed-size sub-corpora can be drawn with .sample()
bnc_df = pd.Series(bnc_full)
# NOTE(review): this seed is defined here but is not passed to any .sample() call in the visible code
random_seed = 7

In [None]:
# define a function that repeats the BNC sampling experiment several times
def _bnc_word_counts(sample, count_col):
    """Return a lower-cased word-count table for ``sample``.

    The result is a DataFrame indexed by the word (index name ``norm``) with a
    single column ``count_col`` that holds the number of occurrences.
    """
    word_count_dict = nltk.FreqDist(word.lower() for word in sample)
    return (pd.DataFrame.from_dict(word_count_dict, orient='index')
            .reset_index()
            .rename(columns={'index': 'norm', 0: count_col})
            .set_index('norm'))


def multiSample_BNC(n, seed=None):
    """Repeat the BNC sampling + transfer-model evaluation ``n`` times.

    Every repetition draws four independent random samples from the
    module-level ``bnc_df`` (100,000,000 / 5,700,000 / 2,400,000 / 1,000,000
    tokens), counts the lower-cased words of each sample, and stores the summed
    lemma counts per norm (via ``sum_lemmas``) in ``GS_adopt``.
    ``transfer_model_test`` is then called once per corpus size and domain.
    Its first argument is the rows of all other domains, its second the rows of
    the domain itself.

    Parameters
    ----------
    n : int
        Number of repetitions.
    seed : int, optional
        When given, a single ``np.random.RandomState(seed)`` drives every draw,
        so the samples are reproducible while still differing between
        repetitions. ``None`` (default) keeps the previous behaviour: pandas
        falls back to NumPy's global random state. Only the corpus sampling is
        controlled here; ``transfer_model_test`` is called unchanged.

    Returns
    -------
    dict
        ``{run: {size: {domain: {'kappa': ..., 'balanced acc': ...}}}}`` with
        ``run`` in ``range(n)``, ``size`` in ``'1M', '2_4M', '5_7M', '100M'``
        and ``domain`` in ``'fruit', 'tool', 'music', 'furniture', 'garments'``.

    Notes
    -----
    * Side effect: the columns ``bnc_100m_sum``, ``bnc_5_7m_sum``,
      ``bnc_2_4m_sum`` and ``bnc_1m_sum`` of the global ``GS_adopt`` are
      overwritten on every repetition.
    * The draws are made without replacement (pandas default), so pandas raises
      ``ValueError`` if ``bnc_df`` holds fewer tokens than a requested size.
    """
    # (GS_adopt column, word-count column, sample size), in the order the samples are drawn
    sample_plan = [
        ('bnc_100m_sum', 'BNC_100m_Count', 100000000),
        ('bnc_5_7m_sum', 'BNC_5_7m_Count', 5700000),
        ('bnc_2_4m_sum', 'BNC_2_4m_Count', 2400000),
        ('bnc_1m_sum', 'BNC_1m_Count', 1000000),
    ]
    # (result key, GS_adopt column), in the order the models are evaluated
    eval_plan = [
        ('1M', 'bnc_1m_sum'),
        ('2_4M', 'bnc_2_4m_sum'),
        ('5_7M', 'bnc_5_7m_sum'),
        ('100M', 'bnc_100m_sum'),
    ]
    # (result key, label used in GS_adopt['domain_x'])
    domain_plan = [
        ('fruit', 'fruit'),
        ('tool', 'tool'),
        ('music', 'music'),
        ('furniture', 'furn'),
        ('garments', 'garm'),
    ]

    # One generator shared by all draws: reproducible, yet every draw differs.
    sampler = None if seed is None else np.random.RandomState(seed)

    result_dict = {}
    for run in range(n):
        # Sum of all instances of each lemma of a norm in the sampled BNC
        # corpus (raw counts, not normalised by corpus size).
        for sum_col, count_col, sample_size in sample_plan:
            sample = bnc_df.sample(n=sample_size, random_state=sampler)
            word_count_df = _bnc_word_counts(sample, count_col)
            GS_adopt[sum_col] = GS_adopt['norm'].apply(
                lambda norm, corpora=word_count_df: sum_lemmas(norm, corpora))

        # Per domain: rows of that domain (local) and rows of every other domain (unseen).
        # Sliced after all four columns exist so every frame carries all of them.
        splits = []
        for domain, label in domain_plan:
            local = GS_adopt.loc[GS_adopt['domain_x'] == label]
            unseen = GS_adopt.loc[GS_adopt['domain_x'] != label]
            splits.append((domain, unseen, local))

        # GlobalModel test
        run_scores = {}
        for size_key, sized_corpora in eval_plan:
            run_scores[size_key] = {}
            for domain, unseen, local in splits:
                result_kappa_acc, _ = transfer_model_test(unseen, local, features, sized_corpora, target)
                run_scores[size_key][domain] = {
                    'kappa': result_kappa_acc['cohen kappa'][0],
                    'balanced acc': result_kappa_acc['balanced acc'][0],
                }
        result_dict[run] = run_scores

    return result_dict

In [None]:
# Run the BNC experiment 50 times and save the raw per-domain results before
# they are averaged in the next cell; the last line displays the nested dict.
multiResult = multiSample_BNC(50)
np.save('TransferModel_50_times_sampling_BNC.npy', multiResult, allow_pickle=True)
multiResult

In [None]:
# integrate kappa / balanced acc
# For every run and corpus size, average the per-domain scores into a single
# kappa and a single balanced accuracy.

domains = ['fruit', 'tool', 'music', 'furniture', 'garments']
corpus_sizes = ['1M', '2_4M', '5_7M', '100M']
score_names = ['kappa', 'balanced acc']

# Work on a shallow copy of each run's dict: the per-domain results stored in
# multiResult stay intact, so this cell can be re-run without a KeyError.
multiResult_integration = {k: dict(v) for k, v in multiResult.items()}

for k, v in multiResult.items():
    for corpus_size in corpus_sizes:
        # mean over the five domains, one value per score
        multiResult_integration[k][corpus_size] = {
            score_name: np.mean([v[corpus_size][domain][score_name] for domain in domains])
            for score_name in score_names
        }

multiResult_integration

In [None]:
# Persist the integrated (per-size averaged) BNC scores; the dict is pickled by np.save.
np.save('TransferModel_inte_50_times_sampling_BNC.npy', multiResult_integration, allow_pickle=True)