In [1]:
import os, sys
import pickle
import numpy as np
from sklearn import tree

## Add '../src' (joined onto sys.path[0]) to the module search path at
## position 1, so the project-local module imported below can be found --
## presumably it lives in ../src (TODO confirm).
sys.path.insert(1, os.path.join(sys.path[0], '../src'))
import document_encoder

In [2]:
## Input artifacts used by this notebook (paths are relative).

## .npz archive holding the 'X' and 'y' arrays
DATASET_PATH = '../generated_datasets/by-article_stats-and-counts.npz'

## Pickled document encoder (exposes the fitted counter)
DOC_ENCODER = '../generated_datasets/DocEncoder_by-article_stats-and-counts.pickle'

## Pickled classifier
CLASSIFIER_PATH = '../classifiers/RF_by-article_stats-and-counts.pickle'

In [3]:
## Load the trained classifier and the document encoder from disk.
## Files are opened with context managers so the handles are closed right
## after unpickling instead of being left open.
## NOTE: pickle.load can execute arbitrary code stored in the file -- only
## load artifacts that come from a trusted source.
with open(CLASSIFIER_PATH, 'rb') as clf_file:
    rf_clf = pickle.load(clf_file)  ## Random Forest Classifier
with open(DOC_ENCODER, 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)
counter = encoder.counter  ## CountVectorizer

In [4]:
## Names of the hand-crafted statistical features
statistical_feature_names = [
    'num_sentences',         ## sentence count
    'avg_sent_word_len',     ## mean sentence length, measured in words
    'avg_sent_char_len',     ## mean sentence length, measured in characters
    'var_sent_char_len',     ## variance of sentence length in characters
    'avg_word_len',          ## mean word length
    'var_word_len',          ## variance of word length
    'punct_freq',            ## punctuation frequency
    'capital_freq',          ## capital-letter frequency
    'ratio_atoms_to_types',  ## ratio between atoms and types (direction: TODO confirm)
]

## Names of the word-count features: flip the counter's word->idx vocabulary
## into idx->word, then list the words by ascending index.
idx_to_word = dict(zip(counter.vocabulary_.values(), counter.vocabulary_.keys()))
word_feature_names = list(map(idx_to_word.get, sorted(idx_to_word)))

## Full list of names: statistical features first, word features after.
## Assumes the dataset columns follow the same order -- TODO confirm.
feature_names = [*statistical_feature_names, *word_feature_names]

In [5]:
## Read the 'X' and 'y' arrays from the .npz archive (presumably the feature
## matrix and the labels -- TODO confirm).
## np.load returns an NpzFile that keeps the archive open; the context
## manager closes it once both arrays have been read into memory.
with np.load(DATASET_PATH) as data:
    X, y = data['X'], data['y']

In [6]:
## Rank feature indices from the highest to the lowest importance reported
## by the classifier (ascending argsort, then reversed).
importances = rf_clf.feature_importances_
features_by_importance = list(np.argsort(importances)[::-1])

## Show the raw importances, followed by the feature names in ranked order
print(importances)
ranked_feature_names = [feature_names[idx] for idx in features_by_importance]
print(ranked_feature_names)

[0.07098779 0.03089282 0.03175361 0.03549271 0.03012492 0.03813169
 0.03928453 0.0453966  0.04400098 0.02051239 0.03893597 0.00742571
 0.01225441 0.00461132 0.01265352 0.00978777 0.00850962 0.00760899
 0.00943164 0.00852886 0.00765101 0.00630694 0.00381572 0.00464075
 0.01347229 0.00917908 0.00669274 0.00522424 0.01901321 0.0104065
 0.01493992 0.02267638 0.00512259 0.00948857 0.00877956 0.01144004
 0.01083058 0.0115706  0.00659671 0.01233621 0.00641018 0.03302811
 0.03064863 0.01109274 0.00887792 0.01149064 0.01460757 0.01150031
 0.0067093  0.01234446 0.01576297 0.00787356 0.00808254 0.0355903
 0.03026035 0.01584296 0.00913141 0.01011466 0.01412192]
['num_sentences', 'capital_freq', 'ratio_atoms_to_types', 'punct_freq', 'american', 'var_word_len', 'trump', 'var_sent_char_len', 'polit', 'avg_sent_char_len', 'avg_sent_word_len', 'presid', 'use', 'avg_word_len', 'make', 'america', 'just', 'want', 'thing', 'like', 'said', 'year', 'hillari', 'countri', 'support', 'peopl', 'clinton', 'obama'

In [7]:
## Pick the two features to vary: positions 2 and 4 of the importance
## ranking (i.e. the 3rd and the 5th most important features).
featureA = features_by_importance[2]
featureA_name = feature_names[featureA]
featureB = features_by_importance[4]
featureB_name = feature_names[featureB]

## Observed value range of each selected feature
a_min, a_max = X[:, featureA].min(), X[:, featureA].max()
b_min, b_max = X[:, featureB].min(), X[:, featureB].max()
a_diff, b_diff = a_max - a_min, b_max - b_min

## 2-D grid over both features: each axis is padded by 1/5 of its range on
## both sides and sampled every 1/20 of the range, i.e. (1 + 2/5) * 20 = 28
## points per axis with the upper bound excluded.
## np.linspace with a fixed point count is used instead of np.arange with a
## float step: the number of points no longer depends on rounding, and a
## constant feature (zero range, hence zero step) no longer breaks the cell.
GRID_POINTS = 28
a_pad, b_pad = a_diff / 5, b_diff / 5
aa, bb = np.meshgrid(
    np.linspace(a_min - a_pad, a_max + a_pad, GRID_POINTS, endpoint=False),
    np.linspace(b_min - b_pad, b_max + b_pad, GRID_POINTS, endpoint=False),
)

In [8]:
## Baseline sample: the per-feature median over the dataset
## (despite its name, X_avg holds medians, not means).
X_avg = np.median(X, axis=0)

## One synthetic row per grid point: every feature is fixed at its median
## except featureA / featureB, which take the grid coordinates.
## Built with array operations rather than a per-element Python loop.
grid_a, grid_b = aa.ravel(), bb.ravel()
## float64 rows so the grid coordinates are stored without down-casting
X_synthetic = np.tile(X_avg.astype(np.float64), (grid_a.size, 1))
X_synthetic[:, featureB] = grid_b
## featureA is written last so it takes precedence should both indices coincide
X_synthetic[:, featureA] = grid_a

In [9]:
## Run the loaded classifier on every synthetic sample
y_pred = rf_clf.predict(X_synthetic)

In [10]:
## Display the raw predictions obtained for the synthetic samples
y_pred

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.