In [144]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [145]:
# read the pre-processed data all agreed
data = pd.read_csv('GS_All_Agreed.csv', index_col=0)

# generate the local dataframe for different domains
local_fruit = data.loc[data['domain_x']=='fruit'].reset_index()
local_tool = data.loc[data['domain_x']=='tool'].reset_index()
local_music = data.loc[data['domain_x']=='music'].reset_index()
local_furniture = data.loc[data['domain_x']=='furn'].reset_index()
local_garments = data.loc[data['domain_x']=='garm'].reset_index()

In [146]:
# define features and target
features = ['ngrams_last_mean',
            'nrdirhypers_x',
            'nrhypos_x',
            'nrpartrels_normalised_x',
            'depthfromtopsynset_normalised_x',
            'glosslength_normalised_x',
            'minwordlength_x',
            'nroflemmas_x',
            'polyscore_max_x']
target = ['vote_x'] # nb / b

In [147]:
# specific corpora features
# written general
# features.append('BNC_sum')
# BNCSumFeatures = features
# features.append('CABNC_per100,000')
# CABNC_per100KFeatures = features
# features.append('KBNC_Sum')
# KBNCSumFeatures = features
features.append('CHILDES_Sum_Rel')
CHILDESSumRelFeatures = features

In [164]:
# split training set and testing set using K-Flod
K = 10
random_seed = 7 # R
local = local_fruit
X = local[CHILDESSumRelFeatures]
y = local[target]

K_Flod = StratifiedKFold(n_splits=K, shuffle=True, random_state=random_seed)
K_Flod.get_n_splits(X, y)
cohen_kappa = []
balanced_acc = []
for train_index, test_index in K_Flod.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # SMOTE algorithm
    smote = SMOTE(random_state=random_seed, k_neighbors=2)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # define random forest model
    rf = RandomForestClassifier(random_state=random_seed, max_features='sqrt', n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_depth=50, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)

    # predict and make score
    pipeline = make_pipeline(smote, rf)
    y_pred = pipeline.predict(X_test)

    kappa = cohen_kappa_score(y_test, y_pred)
    cohen_kappa.append(kappa)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    balanced_acc.append(balanced_accuracy)

results = classification_report(y_test, y_pred, output_dict=True)
results = pd.DataFrame(results).transpose()

results['cohen kappa / 10'] = np.mean(cohen_kappa)
results['balanced acc / 10'] = np.mean(balanced_acc)
results['local'] = list(local['domain_x'])[0]

In [165]:
results

Unnamed: 0,precision,recall,f1-score,support,cohen kappa / 10,balanced acc / 10,local
b,1.0,1.0,1.0,6.0,0.896353,0.943333,fruit
nb,1.0,1.0,1.0,9.0,0.896353,0.943333,fruit
accuracy,1.0,1.0,1.0,1.0,0.896353,0.943333,fruit
macro avg,1.0,1.0,1.0,15.0,0.896353,0.943333,fruit
weighted avg,1.0,1.0,1.0,15.0,0.896353,0.943333,fruit


In [166]:
# importance of features
importance = rf.feature_importances_

importance = pd.DataFrame([features, importance]).transpose()
importance = importance.rename(columns={0:'feature', 1:'importance'}).sort_values('importance', ascending=False)

In [167]:
importance

Unnamed: 0,feature,importance
4,depthfromtopsynset_normalised_x,0.440666
3,nrpartrels_normalised_x,0.178546
5,glosslength_normalised_x,0.103472
6,minwordlength_x,0.072002
0,ngrams_last_mean,0.065721
9,CHILDES_Sum_Rel,0.049904
8,polyscore_max_x,0.049832
2,nrhypos_x,0.025508
7,nroflemmas_x,0.011539
1,nrdirhypers_x,0.00281
