In [1]:
import ISee
import Profiler

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer

import numpy as np

import Code_vin
import NullHypothesis

import re #REGEX sera utilizado para implementar as heuristicas de pre-processamento

Using TensorFlow backend.


In [2]:
def multisub(patterns, strns, text_pos):
    """Apply a sequence of regex substitutions to one column of every row.

    Parameters
    ----------
    patterns : iterable of (pattern, replacement) pairs
        Each pair is handed to ``re.sub`` in the order given; the replacement
        may be a string or a callable, as accepted by ``re.sub``.
    strns : list of mutable sequences
        The rows to process. They are modified in place.
    text_pos : int
        Index, inside each row, of the text field to rewrite.

    Returns
    -------
    The same ``strns`` object that was passed in (rows updated in place).
    """
    for row in strns:
        for regex, replacement in patterns:
            # Each substitution works on the result of the previous one.
            current_text = row[text_pos]
            row[text_pos] = re.sub(regex, replacement, current_text)
    return strns

In [3]:
# Pre-processing heuristics, keyed by a single letter. Each value is a list of
# (regex pattern, replacement) pairs that multisub applies in order via re.sub.
#A: bigrams
#B: all characters to lowercase
#C: accentuation removal
#D: special character treatment
#E: stop-words removal
#F: twitter user names removal
#G: twitter topics removal
#H: Reduction of laugh expressions to a common token
laugh_token = '_laugh_'
heuristics = {
    # Raw strings are required here: in a normal string '\1' is the octal escape
    # for chr(1), not a backreference. The space is kept outside the group so the
    # result is 'not_word' rather than 'not_ word'.
    'A': [(r'not ([^ ]+)', r'not_\1')],
    # '+' instead of '*': same result, but avoids an empty match at every position.
    'B': [(r'[A-Z]+', lambda x: x.group(0).lower())],
    'C': [('á', 'a'), ('é', 'e'), ('í', 'i'), ('ó', 'o'), ('ú', 'u'), ('à', 'a'), ('ã', 'a'), ('ẽ', 'e'), ('ĩ', 'i'), ('õ', 'o'), ('ũ', 'u'),  ('â', 'a'), ('ê', 'e'), ('ñ', 'n')],
    # Raw strings avoid invalid-escape warnings for '\)', '\(' and '\^'.
    'D': [(r':\)', ''), (r':\(', ''), (r':\^\)', ''), (':p', ''), (':3', ''), (':c', ''), ('c:', ''), (':o', '')],
    'E': [(' the ', ' '), (' a ', ' '), (' e ', ' '), (' o ', ' ')],
    # The variant with a trailing space must run first; otherwise the bare
    # pattern has already removed every match and the second one is dead.
    'F': [(r'@[^ ]+ ', ''), (r'@[^ ]+', '')],
    'G': [(r'#[^ ]+ ', ''), (r'#[^ ]+', '')],
    # NOTE(review): these character-class runs also match ordinary words made
    # only of those letters (e.g. 'has' matches '[ashu]{3,}'); confirm this is
    # acceptable before enabling 'H' on English text.
    'H': [('[ashu]{3,}', laugh_token), ('[ah]{3,}', laugh_token), ('[eh]{3,}', laugh_token), ('k{4,}', laugh_token), ('[rs]{3,}', laugh_token), ('[ehu]{3,}', laugh_token)],
}

In [4]:
# Settings

# Letters of the heuristics to apply, in the order written; any subset of
# 'ABCDEFGH'. The empty string disables pre-processing.
heuristics_in_use = ''

# Vocabulary size, embedding width and sequence length shared by the tokenizer
# and the RNC model in the cells below.
max_features, embedding_dims, maxlen = 1000, 50, 50

# Cross-validation / shuffling settings.
# NOTE(review): none of these three are referenced by the cells visible here.
kfold, will_shuffle, shuffle_seed = 5, True, 1234

# Number of estimators passed to XGBClassifier below.
n_est = 1000

In [5]:
# Create the project's milestone profiler; the following cells call
# prf.add_milestone(...) after each step to record a timestamped entry.
prf = Profiler.Milestones()

[Profiler starting...]


In [6]:
# Instantiate the ISee ensemble object that the remaining cells configure,
# train, test and evaluate.
print("Creating ensemble")
ensemble = ISee.Ensemble()
prf.add_milestone("Created ensemble object")

Creating ensemble


[[94mProfiler[0m] @ 17:21:35: Created ensemble object


In [7]:
# Attach a scikit-learn LabelEncoder (for the class labels) and a Keras
# Tokenizer to the ensemble. max_features is passed as the Tokenizer's first
# positional argument (num_words in Keras).
print("Setting encoder & tokenizer")
ensemble.set_encoder(LabelEncoder())
ensemble.set_tokenizer(Tokenizer(max_features))
prf.add_milestone("Set encoder & tokenizer")

Setting encoder & tokenizer


[[94mProfiler[0m] @ 17:21:36: Set encoder & tokenizer


In [8]:
print("Reading file")
# Column positions (0-based) of the text and of the class label in each row.
text_position = 4
class_position = 5
file_name = "brexit_blog_corpus.csv"

# The corpus is tab-separated despite its .csv extension; grab the raw lines
# first so the file is closed before any processing happens.
with open("datasets/" + file_name) as f:
    raw_lines = f.readlines()

# Drop the header row, then strip the newline and split every row on tabs.
lines = [raw.rstrip('\n').split('\t') for raw in raw_lines[1:]]
print("Read {} lines".format(len(lines)))

# Apply the selected heuristics, one letter at a time, to the text column.
print("Applying heuristics " + heuristics_in_use + " (" + str(len(heuristics_in_use)) + ")")
for h in heuristics_in_use:
    lines = multisub(heuristics[h], lines, text_position)

# Tag each row with its position so it can be traced back after splitting.
for row_number, row in enumerate(lines):
    row.append(row_number)
prf.add_milestone("Read file")

# Pull the text, class and row-number columns apart in a single pass.
textos, classes, indexes = [], [], []
for row in lines:
    textos.append(str(row[text_position]))
    classes.append(str(row[class_position]))
    indexes.append(row[-1])
n_classes = len(set(classes))
prf.add_milestone("Aggregated data")

Reading file
Read 1682 lines
Applying heuristics  (0)


[[94mProfiler[0m] @ 17:21:36: Read file
[[94mProfiler[0m] @ 17:21:36: Aggregated data


In [9]:
# Hand the parallel lists built above (texts, class labels, row numbers)
# to the ensemble.
print("Setting data")
ensemble.set_datasets(textos, classes, indexes)
prf.add_milestone("Set datasets")

Setting data


[[94mProfiler[0m] @ 17:21:36: Set datasets


In [10]:
# Fit the previously attached encoder and tokenizer; this must run after
# set_datasets, presumably because they are fitted on that data — TODO confirm
# against ISee.
print("Fitting encoder & tokenizer")
ensemble.fit_encoder()
ensemble.fit_tokenizer()
prf.add_milestone("Fitted encoder & tokenizer")

Fitting encoder & tokenizer


[[94mProfiler[0m] @ 17:21:36: Fitted encoder & tokenizer


In [11]:
# Split the data set into folds (the recorded outputs below show 5 of them).
# NOTE(review): the seed is hard-coded to 1337 here, while the settings cell
# defines shuffle_seed = 1234 (as well as will_shuffle and kfold), none of which
# this call uses — confirm which seed is intended before changing either.
print("Splitting data")
ensemble.split_data(shuffle_seed=1337, purge_duplicates=False)
prf.add_milestone("Split data")

Splitting data


[[94mProfiler[0m] @ 17:21:36: Split data


In [12]:
# Register the member models of the ensemble under short display names
# (these names are the column headers of the evaluation table).
print("Adding models")
# Random baseline, currently disabled:
##ensemble.add('Random', NullHypothesis.RandomClassifier())
ensemble.add(
    'XGB',
    XGBClassifier(
        n_estimators=n_est,
        learning_rate=0.01,
        max_depth=6,
        subsample=0.65,
        colsample_bytree=0.25,
        gamma=5,
    ),
)
# random_state is fixed so the forest is reproducible between runs; without it
# scikit-learn draws a fresh seed on every fit.
ensemble.add('Forest', RandomForestClassifier(n_estimators=90, random_state=shuffle_seed))
ensemble.add('Naive-B',  GaussianNB())
ensemble.add('SVM_T', svm.SVC(gamma='scale'))
# The RNC model is registered with different input/output handling flags than
# the scikit-learn style models above (which use the defaults of ensemble.add).
ensemble.add(
    'RNC',
    Code_vin.RNC_vin(
        max_features=max_features,
        embedding_dims=embedding_dims,
        maxlen=maxlen,
        filters=100,
        kernel_size=3,
        hidden_dims=250,
        output_dims=n_classes,
    ),
    uses_one_hot=False,
    uses_categorical=True,
    uses_argmax=True,
)
prf.add_milestone("Added models")

Adding models


[[94mProfiler[0m] @ 17:21:36: Added models


In [13]:
# Train every registered model on every fold; the profiler is passed in so the
# ensemble can log a milestone before and after each model.
# Long-running: the recorded run took roughly 37 minutes, most of it in XGB
# (about 4-10 minutes per fold).
print("Training ensemble")
ensemble.train(max_features, maxlen, profiler=prf)
prf.add_milestone("Trained ensemble")

[[94mProfiler[0m] @ 17:21:36: Starting training
[[94mProfiler[0m] @ 17:21:36: Fold 1 - Training model XGB


Training ensemble
Training with the 1-st set


[[94mProfiler[0m] @ 17:25:38: Fold 1 - Trained model XGB
[[94mProfiler[0m] @ 17:25:38: Fold 1 - Training model Forest
[[94mProfiler[0m] @ 17:25:40: Fold 1 - Trained model Forest
[[94mProfiler[0m] @ 17:25:40: Fold 1 - Training model Naive-B
[[94mProfiler[0m] @ 17:25:40: Fold 1 - Trained model Naive-B
[[94mProfiler[0m] @ 17:25:40: Fold 1 - Training model SVM_T
[[94mProfiler[0m] @ 17:25:45: Fold 1 - Trained model SVM_T
[[94mProfiler[0m] @ 17:25:45: Fold 1 - Training model RNC
[[94mProfiler[0m] @ 17:25:57: Fold 1 - Trained model RNC
[[94mProfiler[0m] @ 17:25:57: Fold 2 - Training model XGB


Training with the 2-nd set


[[94mProfiler[0m] @ 17:30:09: Fold 2 - Trained model XGB
[[94mProfiler[0m] @ 17:30:09: Fold 2 - Training model Forest
[[94mProfiler[0m] @ 17:30:11: Fold 2 - Trained model Forest
[[94mProfiler[0m] @ 17:30:11: Fold 2 - Training model Naive-B
[[94mProfiler[0m] @ 17:30:11: Fold 2 - Trained model Naive-B
[[94mProfiler[0m] @ 17:30:11: Fold 2 - Training model SVM_T
[[94mProfiler[0m] @ 17:30:16: Fold 2 - Trained model SVM_T
[[94mProfiler[0m] @ 17:30:16: Fold 2 - Training model RNC
[[94mProfiler[0m] @ 17:30:27: Fold 2 - Trained model RNC
[[94mProfiler[0m] @ 17:30:28: Fold 3 - Training model XGB


Training with the 3-rd set


[[94mProfiler[0m] @ 17:37:57: Fold 3 - Trained model XGB
[[94mProfiler[0m] @ 17:37:57: Fold 3 - Training model Forest
[[94mProfiler[0m] @ 17:38:02: Fold 3 - Trained model Forest
[[94mProfiler[0m] @ 17:38:02: Fold 3 - Training model Naive-B
[[94mProfiler[0m] @ 17:38:03: Fold 3 - Trained model Naive-B
[[94mProfiler[0m] @ 17:38:03: Fold 3 - Training model SVM_T
[[94mProfiler[0m] @ 17:38:16: Fold 3 - Trained model SVM_T
[[94mProfiler[0m] @ 17:38:16: Fold 3 - Training model RNC
[[94mProfiler[0m] @ 17:38:51: Fold 3 - Trained model RNC


Training with the 4-th set


[[94mProfiler[0m] @ 17:38:51: Fold 4 - Training model XGB
[[94mProfiler[0m] @ 17:48:30: Fold 4 - Trained model XGB
[[94mProfiler[0m] @ 17:48:30: Fold 4 - Training model Forest
[[94mProfiler[0m] @ 17:48:34: Fold 4 - Trained model Forest
[[94mProfiler[0m] @ 17:48:34: Fold 4 - Training model Naive-B
[[94mProfiler[0m] @ 17:48:34: Fold 4 - Trained model Naive-B
[[94mProfiler[0m] @ 17:48:34: Fold 4 - Training model SVM_T
[[94mProfiler[0m] @ 17:48:45: Fold 4 - Trained model SVM_T
[[94mProfiler[0m] @ 17:48:45: Fold 4 - Training model RNC
[[94mProfiler[0m] @ 17:49:05: Fold 4 - Trained model RNC


Training with the 5-th set


[[94mProfiler[0m] @ 17:49:06: Fold 5 - Training model XGB
[[94mProfiler[0m] @ 17:58:06: Fold 5 - Trained model XGB
[[94mProfiler[0m] @ 17:58:06: Fold 5 - Training model Forest
[[94mProfiler[0m] @ 17:58:10: Fold 5 - Trained model Forest
[[94mProfiler[0m] @ 17:58:10: Fold 5 - Training model Naive-B
[[94mProfiler[0m] @ 17:58:10: Fold 5 - Trained model Naive-B
[[94mProfiler[0m] @ 17:58:10: Fold 5 - Training model SVM_T
[[94mProfiler[0m] @ 17:58:22: Fold 5 - Trained model SVM_T
[[94mProfiler[0m] @ 17:58:22: Fold 5 - Training model RNC


Finished training


[[94mProfiler[0m] @ 17:58:43: Fold 5 - Trained model RNC
[[94mProfiler[0m] @ 17:58:43: Trained ensemble


In [14]:
# Run every trained model against every fold's test set, again logging per-model
# milestones through the profiler.
# NOTE(review): the return value is stored in `res` but not used by the cells
# visible here.
print("Testing ensemble")
res = ensemble.test(max_features, maxlen, profiler=prf)
prf.add_milestone("Tested ensemble")

Testing ensemble
Testing with the 1-st set


[[94mProfiler[0m] @ 17:58:43: Starting testing
[[94mProfiler[0m] @ 17:58:43: Fold 1 - Testing model XGB
[[94mProfiler[0m] @ 17:58:44: Fold 1 - Tested model XGB
[[94mProfiler[0m] @ 17:58:44: Fold 1 - Testing model Forest
[[94mProfiler[0m] @ 17:58:44: Fold 1 - Tested model Forest
[[94mProfiler[0m] @ 17:58:44: Fold 1 - Testing model Naive-B
[[94mProfiler[0m] @ 17:58:44: Fold 1 - Tested model Naive-B
[[94mProfiler[0m] @ 17:58:44: Fold 1 - Testing model SVM_T
[[94mProfiler[0m] @ 17:58:46: Fold 1 - Tested model SVM_T
[[94mProfiler[0m] @ 17:58:46: Fold 1 - Testing model RNC
[[94mProfiler[0m] @ 17:58:47: Fold 1 - Tested model RNC
[[94mProfiler[0m] @ 17:58:47: Fold 2 - Testing model XGB


Testing with the 2-nd set


[[94mProfiler[0m] @ 17:58:48: Fold 2 - Tested model XGB
[[94mProfiler[0m] @ 17:58:48: Fold 2 - Testing model Forest
[[94mProfiler[0m] @ 17:58:48: Fold 2 - Tested model Forest
[[94mProfiler[0m] @ 17:58:48: Fold 2 - Testing model Naive-B
[[94mProfiler[0m] @ 17:58:48: Fold 2 - Tested model Naive-B
[[94mProfiler[0m] @ 17:58:48: Fold 2 - Testing model SVM_T
[[94mProfiler[0m] @ 17:58:49: Fold 2 - Tested model SVM_T
[[94mProfiler[0m] @ 17:58:49: Fold 2 - Testing model RNC
[[94mProfiler[0m] @ 17:58:49: Fold 2 - Tested model RNC
[[94mProfiler[0m] @ 17:58:50: Fold 3 - Testing model XGB


Testing with the 3-rd set


[[94mProfiler[0m] @ 17:58:50: Fold 3 - Tested model XGB
[[94mProfiler[0m] @ 17:58:50: Fold 3 - Testing model Forest
[[94mProfiler[0m] @ 17:58:50: Fold 3 - Tested model Forest
[[94mProfiler[0m] @ 17:58:50: Fold 3 - Testing model Naive-B
[[94mProfiler[0m] @ 17:58:50: Fold 3 - Tested model Naive-B
[[94mProfiler[0m] @ 17:58:50: Fold 3 - Testing model SVM_T
[[94mProfiler[0m] @ 17:58:52: Fold 3 - Tested model SVM_T
[[94mProfiler[0m] @ 17:58:52: Fold 3 - Testing model RNC
[[94mProfiler[0m] @ 17:58:52: Fold 3 - Tested model RNC


Testing with the 4-th set


[[94mProfiler[0m] @ 17:58:52: Fold 4 - Testing model XGB
[[94mProfiler[0m] @ 17:58:53: Fold 4 - Tested model XGB
[[94mProfiler[0m] @ 17:58:53: Fold 4 - Testing model Forest
[[94mProfiler[0m] @ 17:58:53: Fold 4 - Tested model Forest
[[94mProfiler[0m] @ 17:58:53: Fold 4 - Testing model Naive-B
[[94mProfiler[0m] @ 17:58:53: Fold 4 - Tested model Naive-B
[[94mProfiler[0m] @ 17:58:53: Fold 4 - Testing model SVM_T
[[94mProfiler[0m] @ 17:58:55: Fold 4 - Tested model SVM_T
[[94mProfiler[0m] @ 17:58:55: Fold 4 - Testing model RNC
[[94mProfiler[0m] @ 17:58:55: Fold 4 - Tested model RNC
[[94mProfiler[0m] @ 17:58:55: Fold 5 - Testing model XGB


Testing with the 5-th set


[[94mProfiler[0m] @ 17:58:56: Fold 5 - Tested model XGB
[[94mProfiler[0m] @ 17:58:56: Fold 5 - Testing model Forest
[[94mProfiler[0m] @ 17:58:56: Fold 5 - Tested model Forest
[[94mProfiler[0m] @ 17:58:56: Fold 5 - Testing model Naive-B
[[94mProfiler[0m] @ 17:58:56: Fold 5 - Tested model Naive-B
[[94mProfiler[0m] @ 17:58:56: Fold 5 - Testing model SVM_T


Finished testing


[[94mProfiler[0m] @ 17:58:57: Fold 5 - Tested model SVM_T
[[94mProfiler[0m] @ 17:58:57: Fold 5 - Testing model RNC
[[94mProfiler[0m] @ 17:58:57: Fold 5 - Tested model RNC
[[94mProfiler[0m] @ 17:58:57: Tested ensemble


In [15]:
# Evaluate the test results. With verbose=True this prints, per fold, one row
# per sample: the correct class, each model's prediction, the majority vote and
# the phrase; a '*' marks predictions equal to the correct class.
ensemble.evaluate(verbose=True)
prf.add_milestone("Finished evaluating")

Evaluating 1-th fold
	     -  Correct -      XGB -   Forest -  Naive-B -    SVM_T -      RNC -   Major.- Phrase
	   0 -        0 -       6  -       0* -       0* -       4  -       0* -       0* I know what you mean.
	   1 -        0 -       2  -       2  -       0* -       2  -       2  -       2  I don't disagree that the Yes campaign made mistakes, but they must be contextualised within the truly immense opposition they had.
	   2 -        0 -       6  -       0* -       0* -       6  -       0* -       0* Of course, this is far from the dominant narrative we hear in the media.
	   3 -        0 -       2  -       0* -       0* -       2  -       2  -       2  In principle I agree with what he is striving for, but the in practice it is not so simple.
	   4 -        0 -       2  -       0* -       0* -       2  -       0* -       0* However, I still remain flummoxed as to why expats get to vote, it is no longer their concern or future.
	   5 -        0 -       2  -       5  -       0*

	 228 -        6 -       6* -       6* -       6* -       6* -       6* -       6* There is about $800 billion in investor money in index funds and ETFs that track Russell indexes, according to Rolf Agather, managing director of North America research for FTSE Russell.
	 229 -        6 -       6* -       6* -       6* -       6* -       6* -       6* Figures published today show the average age of the population now stands at 40 – the highest ever estimated.
	 230 -        6 -       6* -       6* -       3  -       6* -       6* -       6* As you can see, we had been in the European Union for just over two years and we were already holding a referendum as to whether to leave!
	 231 -        6 -       6* -       6* -       3  -       6* -       6* -       6* The original report, which was drafted mainly by President of the European Council Van Rompuy, painted a picture of banking, fiscal and political union.
	 232 -        6 -       6* -       6* -       6* -       6* -       6* -      

[[94mProfiler[0m] @ 17:58:58: Finished evaluating


In [16]:
# Print the list of recorded milestones with their timestamps and durations.
# NOTE(review): the meaning of max_d is not visible here — check
# Profiler.Milestones.exhibit.
print("Profiler data:")
prf.exhibit(max_d=10)

Profiler data:
  0: Start - 15/04/2019 - 17:21:35
  1: Created ensemble object - 15/04/2019 - 17:21:35 - 0.09s
  2: Set encoder & tokenizer - 15/04/2019 - 17:21:36 - 0.13s
  3: Read file - 15/04/2019 - 17:21:36 - 0.18s
  4: Aggregated data - 15/04/2019 - 17:21:36 - 0.00s
  5: Set datasets - 15/04/2019 - 17:21:36 - 0.07s
  6: Fitted encoder & tokenizer - 15/04/2019 - 17:21:36 - 0.18s
  7: Split data - 15/04/2019 - 17:21:36 - 0.09s
  8: Added models - 15/04/2019 - 17:21:36 - 0.32s
  9: Starting training - 15/04/2019 - 17:21:36 - 0.01s
 10: Fold 1 - Training model XGB - 15/04/2019 - 17:21:36 - 0.12s
 11: Fold 1 - Trained model XGB - 15/04/2019 - 17:25:38 - 241.39s
 12: Fold 1 - Training model Forest - 15/04/2019 - 17:25:38 - 0.00s
 13: Fold 1 - Trained model Forest - 15/04/2019 - 17:25:40 - 1.85s
 14: Fold 1 - Training model Naive-B - 15/04/2019 - 17:25:40 - 0.00s
 15: Fold 1 - Trained model Naive-B - 15/04/2019 - 17:25:40 - 0.03s
 16: Fold 1 - Training model SVM_T - 15/04/2019 - 17:25:40