In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score

In [2]:
# read the pre-processed data all agreed
data = pd.read_csv('EX1_Input_AllAgreed.csv', index_col=0)

# generate the local dataframe for different domains
local_fruit = data.loc[data['Domain']=='fruit']
local_tool = data.loc[data['Domain']=='tool']
local_music = data.loc[data['Domain']=='music']
local_furniture = data.loc[data['Domain']=='furn']
local_garments = data.loc[data['Domain']=='garm']

In [3]:
# define features and target
features = ['ngrams_last_mean',
            'nrdirhypers',
            'nrhypos',
            'nrpartrels_normalised',
            'depthfromtopsynset_normalised',
            'glosslength_normalised',
            'minwordlength',
            'nroflemmas',
            'polyscore_max']
target = ['vote'] # nb / b

In [4]:
# split training set and testing set
split_ratio = 0.8
random_seed = 7 # R
local = local_garments

X_train, X_test, y_train, y_test = train_test_split(local[features], local[target], train_size=split_ratio, random_state=random_seed)

In [5]:
# SMOTE algorithm
smote = SMOTE(random_state=random_seed, k_neighbors=2)
X_train, y_train = smote.fit_resample(X_train, y_train)
print ('Shape of oversampled data: {}'.format(X_train.shape))
print ('Shape of Y: {}'.format(y_train.shape))

Shape of oversampled data: (340, 8)
Shape of Y: (340, 1)


In [6]:
# define random forest model
rf = RandomForestClassifier(random_state=random_seed, max_features="sqrt", n_estimators=500, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)

  rf = RandomForestClassifier(random_state=random_seed, max_features="sqrt", n_estimators=500, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)


In [7]:
# predict and make score
pipeline = make_pipeline(smote, rf)

y_pred = pipeline.predict(X_test)
results = classification_report(y_test, y_pred, output_dict=True)
results = pd.DataFrame(results).transpose()

kappa = cohen_kappa_score(y_test, y_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
results['cohen kappa'] = kappa
results['balanced acc'] = balanced_accuracy
results['local'] = list(local['Domain'])[0]

In [8]:
results

Unnamed: 0,precision,recall,f1-score,support,cohen kappa,balanced acc,local
b,0.5,0.75,0.6,4.0,0.556561,0.841667,garm
nb,0.976744,0.933333,0.954545,45.0,0.556561,0.841667,garm
accuracy,0.918367,0.918367,0.918367,0.918367,0.556561,0.841667,garm
macro avg,0.738372,0.841667,0.777273,49.0,0.556561,0.841667,garm
weighted avg,0.937826,0.918367,0.925603,49.0,0.556561,0.841667,garm


In [29]:
# importance of features
importance = rf.feature_importances_

importance = pd.DataFrame([features, importance]).transpose()
importance = importance.rename(columns={0:'feature', 1:'importance'}).sort_values('importance', ascending=False)

In [30]:
importance

Unnamed: 0,feature,importance
2,nrhypos,0.278247
0,ngrams_last_mean,0.183312
4,depthfromtopsynset_normalised,0.167373
6,minwordlength,0.146328
5,glosslength_normalised,0.133383
8,polyscore_max,0.049269
7,nroflemmas,0.025633
3,nrpartrels_normalised,0.012891
1,nrdirhypers,0.003564
