In [125]:
import pandas as pd

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score

In [126]:
# Load the pre-processed dataset; the first CSV column becomes the index.
data = pd.read_csv('EX1_Input_AllAgreed.csv', index_col=0)

# Partition the rows by the 'Domain' column. For every domain, `local_*` holds
# only the rows of that domain and `unseen_*` holds the rows of all the others.
domain = data['Domain']

local_fruit, unseen_fruit = data[domain == 'fruit'], data[domain != 'fruit']
local_tool, unseen_tool = data[domain == 'tool'], data[domain != 'tool']
local_music, unseen_music = data[domain == 'music'], data[domain != 'music']
local_furniture, unseen_furniture = data[domain == 'furn'], data[domain != 'furn']
local_garments, unseen_garments = data[domain == 'garm'], data[domain != 'garm']

In [127]:
# Columns fed to the model as predictors.
features = [
    'ngrams_last_mean',
    'nrdirhypers',
    'nrhypos',
    'nrpartrels_normalised',
    'depthfromtopsynset_normalised',
    'glosslength_normalised',
    'minwordlength',
    'nroflemmas',
    'polyscore_max',
]

# Label column (classes: 'nb' / 'b'). Kept as a one-element list, so indexing
# a frame with it yields a single-column DataFrame rather than a Series.
target = ['vote']

In [129]:
# Cross-domain split: no random split is made. The model is trained on every
# domain except garments and evaluated on the garments domain only.
random_seed = 7  # shared seed for SMOTE and the random forest

# NOTE: `unseen` is the TRAINING data (all domains other than the held-out one)
# and `local` is the held-out TEST domain.
unseen = unseen_garments
local = local_garments

X_train = unseen[features]
y_train = unseen[target]
X_test = local[features]
y_test = local[target]

In [130]:
# Oversample the training set with SMOTE (2 nearest neighbours per synthetic
# sample). This rebinds X_train / y_train to the resampled data, so re-run the
# split cell first if the original training set is needed again.
smote = SMOTE(k_neighbors=2, random_state=random_seed)
X_train, y_train = smote.fit_resample(X_train, y_train)

oversampled_shape, label_shape = X_train.shape, y_train.shape
print('Shape of oversampled data: {}'.format(oversampled_shape))
print('Shape of Y: {}'.format(label_shape))

Shape of oversampled data: (898, 9)
Shape of Y: (898, 1)


In [131]:
# Define and fit the random forest (500 trees, sqrt(n_features) candidates per
# split, Gini impurity, bootstrap sampling with the out-of-bag score enabled).
rf = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    max_features="sqrt",
    bootstrap=True,
    oob_score=True,
    random_state=random_seed,
)

# y_train is a single-column DataFrame (target is a list); scikit-learn expects
# a 1-D label vector and otherwise emits a DataConversionWarning. Flattening
# here silences the warning without changing the fitted model.
# `fit` returns the estimator itself, so `rf` is the fitted forest.
rf.fit(X_train, y_train.to_numpy().ravel())

  rf = RandomForestClassifier(random_state=random_seed, max_features="sqrt", n_estimators=500, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)


In [132]:
# Predict on the held-out domain and collect the scores.
# The forest was already fitted on the SMOTE-resampled training data, and
# resampling must never be applied to test data, so predict with `rf` directly.
# (Wrapping `smote` and `rf` in an unfitted pipeline only to call `predict`
# gave the same predictions, because samplers are skipped at predict time,
# but suggested a pipeline had been trained.)
y_pred = rf.predict(X_test)

# Per-class precision / recall / F1 / support, one row per class or average.
report = classification_report(y_test, y_pred, output_dict=True)
results = pd.DataFrame(report).transpose()

# Scalar scores are broadcast to every row of the report table.
kappa = cohen_kappa_score(y_test, y_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
results['cohen kappa'] = kappa
results['balanced acc'] = balanced_accuracy

# 'unseen' records the held-out test domain (the one not seen in training),
# read from the first row of `local` without materialising the whole column.
results['unseen'] = local['Domain'].iloc[0]

In [133]:
results

Unnamed: 0,precision,recall,f1-score,support,cohen kappa,balanced acc,unseen
b,0.279412,0.730769,0.404255,26.0,0.294068,0.751431,garm
nb,0.959538,0.772093,0.85567,215.0,0.294068,0.751431,garm
accuracy,0.767635,0.767635,0.767635,0.767635,0.294068,0.751431,garm
macro avg,0.619475,0.751431,0.629963,241.0,0.294068,0.751431,garm
weighted avg,0.886163,0.767635,0.80697,241.0,0.294068,0.751431,garm


In [134]:
# Feature importances of the fitted forest, most important first.
# Building the frame from a dict keeps 'importance' as a float column; stacking
# the two sequences as rows and transposing would leave both columns as
# generic Python objects. The index keeps each feature's position in `features`.
importance = pd.DataFrame(
    {'feature': features, 'importance': rf.feature_importances_}
).sort_values('importance', ascending=False)

In [135]:
importance

Unnamed: 0,feature,importance
4,depthfromtopsynset_normalised,0.406564
5,glosslength_normalised,0.147833
0,ngrams_last_mean,0.138011
6,minwordlength,0.134541
3,nrpartrels_normalised,0.061559
2,nrhypos,0.042766
8,polyscore_max,0.041067
7,nroflemmas,0.025817
1,nrdirhypers,0.001842
