In [16]:
import pandas as pd

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, cohen_kappa_score, balanced_accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [17]:
# Load the pre-processed "all agreed" input table; the first CSV column is the index.
data = pd.read_csv('EX1_Input_AllAgreed.csv', index_col=0)

# One sub-frame per domain, selected on the 'Domain' column.
# Note that furniture and garments are stored under the codes 'furn' and 'garm'.
local_fruit, local_tool, local_music, local_furniture, local_garments = (
    data[data['Domain'] == domain_code]
    for domain_code in ('fruit', 'tool', 'music', 'furn', 'garm')
)

In [18]:
# Predictor columns used to build X. The order here fixes the column order of
# the training matrix.
features = [
    'ngrams_last_mean',
    'nrdirhypers',
    'nrhypos',
    'nrpartrels_normalised',
    'depthfromtopsynset_normalised',
    'glosslength_normalised',
    'minwordlength',
    'nroflemmas',
    'polyscore_max',
]

# Label column (classes 'nb' / 'b'). Kept as a one-element list, so
# data[target] selects a one-column DataFrame rather than a Series.
target = ['vote']

In [19]:
# Hold-out split configuration.
split_ratio = 0.8  # fraction of the rows that goes into the training set
random_seed = 7    # seed reused by the split, SMOTE and the random forest

# Input rows for the experiment: the full table (all domains).
# To restrict the run to selected domains, use for example:
# data_in = pd.concat([local_fruit, local_tool, local_music])
data_in = data

# Random (non-stratified) split into training and testing parts.
X_train, X_test, y_train, y_test = train_test_split(
    data_in[features],
    data_in[target],
    train_size=split_ratio,
    random_state=random_seed,
)

In [20]:
# Oversample the training split with SMOTE (2 nearest neighbours); the test
# split is not touched. X_train / y_train are overwritten with the resampled
# versions, so later cells see the oversampled training data.
smote = SMOTE(k_neighbors=2, random_state=random_seed)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Report the shapes after resampling.
for template, resampled in (('Shape of oversampled data: {}', X_train),
                            ('Shape of Y: {}', y_train)):
    print(template.format(resampled.shape))

Shape of oversampled data: (1074, 9)
Shape of Y: (1074, 1)


In [21]:
# Random forest classifier: 500 gini trees, sqrt(n_features) candidate
# features per split, bootstrap sampling with out-of-bag scoring enabled.
rf = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    max_features="sqrt",
    bootstrap=True,
    oob_score=True,
    random_state=random_seed,
)

# y_train is a one-column DataFrame (shape (n, 1)). Flatten it to a 1-D label
# array so scikit-learn receives the target shape it expects instead of a
# column vector (which triggers a DataConversionWarning).
rf.fit(X_train, y_train.to_numpy().ravel())

  rf = RandomForestClassifier(random_state=random_seed, max_features="sqrt", n_estimators=500, oob_score=True, criterion='gini', bootstrap=True).fit(X_train, y_train)


In [22]:
# Score the fitted forest on the held-out test split.
#
# `pipeline` is built from the already-used sampler and the already-fitted
# forest but is itself never fitted. Samplers in an imblearn pipeline are
# skipped at prediction time, so pipeline.predict(X_test) only ever delegated
# to `rf`. Predicting with `rf` directly yields the same labels without
# calling predict on an unfitted pipeline.
pipeline = make_pipeline(smote, rf)  # kept so the name stays defined for any later cell

# y_test is a one-column DataFrame; the metric functions expect 1-D labels.
y_true = y_test.to_numpy().ravel()
y_pred = rf.predict(X_test)

# Per-class precision / recall / f1 / support, one row per class or average.
results = classification_report(y_true, y_pred, output_dict=True)
results = pd.DataFrame(results).transpose()

# Scalar metrics are broadcast into every row of the report table.
kappa = cohen_kappa_score(y_true, y_pred)
balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
results['cohen kappa'] = kappa
results['balanced acc'] = balanced_accuracy
# NOTE(review): the meaning of the constant 5 is not documented in this
# notebook; presumably it tags the run as trained on all five domains. Confirm.
results['global'] = 5

In [23]:
results

Unnamed: 0,precision,recall,f1-score,support,cohen kappa,balanced acc,global
b,0.756098,0.756098,0.756098,41.0,0.677357,0.838679,5
nb,0.92126,0.92126,0.92126,127.0,0.677357,0.838679,5
accuracy,0.880952,0.880952,0.880952,0.880952,0.677357,0.838679,5
macro avg,0.838679,0.838679,0.838679,168.0,0.677357,0.838679,5
weighted avg,0.880952,0.880952,0.880952,168.0,0.677357,0.838679,5


In [24]:
# Rank the features by the fitted forest's feature importances.
# Building the frame column-wise keeps 'importance' as a float column; the
# previous list-of-rows + transpose construction produced object-dtype columns.
# The original positional index is preserved, so each row still shows the
# feature's position in `features`.
importance = (
    pd.DataFrame({'feature': features, 'importance': rf.feature_importances_})
    .sort_values('importance', ascending=False)
)

In [25]:
importance

Unnamed: 0,feature,importance
4,depthfromtopsynset_normalised,0.357654
0,ngrams_last_mean,0.164617
5,glosslength_normalised,0.127739
6,minwordlength,0.11503
3,nrpartrels_normalised,0.109267
2,nrhypos,0.053442
8,polyscore_max,0.043895
7,nroflemmas,0.026327
1,nrdirhypers,0.002028
