In [10]:
import pandas as pd

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline

import warnings
# Install a blanket "ignore" filter: no category or module restriction is
# given, so every warning raised after this point in the session is hidden.
# NOTE(review): this also hides library warnings about the inputs passed in
# later cells -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Load the pre-processed training and test tables; in both files the first
# CSV column is used as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('training_data_final.csv', 'test_data_final.csv')
)

In [3]:
# Columns used as model inputs, in the order they are selected from the
# data frames.
features = [
    'ngrams_last_mean',
    'nrdirhypers',
    'nrhypos',
    'nrpartrels_normalised',
    'depthfromtopsynset_normalised',
    'glosslength_normalised',
    'minwordlength',
    'nroflemmas',
    'polyscore_max',
]

# Label column, kept as a one-element list so that frame[target] selects a
# single-column DataFrame rather than a Series.
target = ['vote']  # nb / b

In [7]:
random_seed = 7 # R
# Oversample the training rows with SMOTE (2 nearest neighbours, seeded for
# reproducibility) and keep the resampled features / labels for model fitting.
smote = SMOTE(k_neighbors=2, random_state=random_seed)
X_train, y_train = smote.fit_resample(train[features], train[target])

# Report the shapes of the resampled inputs and labels.
for template, frame in (('Shape of oversampled data: {}', X_train),
                        ('Shape of Y: {}', y_train)):
    print(template.format(frame.shape))

Shape of oversampled data: (1384, 9)
Shape of Y: (1384, 1)


In [9]:
# Define and fit the random forest model on the oversampled training data.
# The resampled labels are a single-column 2-D structure (shape (n, 1));
# scikit-learn expects a 1-D target of shape (n,) and otherwise emits a
# DataConversionWarning, so the labels are flattened before fitting.
rf = RandomForestClassifier(
    random_state=random_seed,  # same seed as the resampling step
    max_features="sqrt",
    n_estimators=500,
    oob_score=True,
    criterion='gini',
    bootstrap=True,  # required for oob_score=True
).fit(X_train, pd.DataFrame(y_train).to_numpy().ravel())

In [11]:
# Importance of features, sorted from most to least important.
# The table is built from named columns so that 'importance' keeps a numeric
# (float) dtype; stacking the feature names and the scores as two rows and
# transposing would leave both columns with object dtype.
# The row index is each feature's position in `features`.
importance = (
    pd.DataFrame({'feature': features, 'importance': rf.feature_importances_})
    .sort_values('importance', ascending=False)
)

importance

Unnamed: 0,feature,importance
4,depthfromtopsynset_normalised,0.367898
0,ngrams_last_mean,0.152633
5,glosslength_normalised,0.150407
6,minwordlength,0.113265
3,nrpartrels_normalised,0.079112
2,nrhypos,0.066548
8,polyscore_max,0.038926
7,nroflemmas,0.028913
1,nrdirhypers,0.0023


In [12]:
# Predict the label of every test row.
# NOTE: this pipeline object is never fitted; it only wraps the already
# fitted `smote` and `rf` objects. It is kept so the name `pipeline` stays
# defined for any later cell that refers to it.
pipeline = make_pipeline(smote, rf)
# Predict with the fitted forest directly: resampling applies only while
# fitting, so going through the unfitted pipeline adds nothing at prediction
# time and relies on calling predict() on an estimator that was never fitted.
y_pred = rf.predict(test[features])

In [13]:
# Attach the predictions to the synset list and keep the rows predicted 'b'.
results = pd.read_csv('results_syns_list.csv', index_col=0)

# Assign predictions by position rather than by index label: assigning a
# freshly built Series would align on labels and silently produce NaN (or
# shifted) predictions whenever the CSV index is not exactly 0..n-1.
predicted = pd.DataFrame(y_pred)[0].to_numpy()
if len(predicted) != len(results):
    raise ValueError(
        'Number of predictions ({}) does not match number of rows in '
        'results_syns_list.csv ({})'.format(len(predicted), len(results))
    )
results['Predicted'] = predicted

basic_level = results[results['Predicted']=='b']

basic_level

Unnamed: 0,Synset,Name,Predicted
30,Synset('eidos.n.01'),eidos.n.01,b
60,Synset('voice.n.02'),voice.n.02,b
80,Synset('rare_earth.n.01'),rare_earth.n.01,b
88,Synset('varna.n.02'),varna.n.02,b
96,Synset('playing_period.n.01'),playing_period.n.01,b
...,...,...,...
74128,Synset('hart.n.03'),hart.n.03,b
74129,Synset('hind.n.02'),hind.n.02,b
74140,Synset('mustang.n.01'),mustang.n.01,b
74162,Synset('hack.n.08'),hack.n.08,b
