### Import Libraries

In [14]:
from collections import OrderedDict, Counter
import torch
import numpy as np
import tensorboardX
import dill as pickle

In [2]:
import sys

# Make the modules under sopa_master/ importable. Guarded so that re-running
# this cell does not keep appending duplicate entries to sys.path.
if "sopa_master/" not in sys.path:
    sys.path.append("sopa_master/")

In [34]:
from data import read_embeddings, read_docs, read_labels
from soft_patterns import ProbSemiring, MaxPlusSemiring, LogSpaceMaxTimesSemiring, SoftPatternClassifier, train, Batch, evaluate_accuracy
from util import to_cuda
from interpret_classification_results import interpret_documents
from visualize import visualize_patterns
from baseline.pattern_extractor import train, Word, add_patterns, build_trie, lil_matrix

### Files

In [8]:
# Locations of the train / dev / test splits: one ".data" file and one
# ".labels" file per split.
train_data_file = "data/time_data_clean/train.data"
train_label_file = "data/time_data_clean/train.labels"

dev_data_file = "data/time_data_clean/dev.data"
dev_label_file = "data/time_data_clean/dev.labels"

test_file = "data/time_data_clean/test.data"
test_label = "data/time_data_clean/test.labels"

### Embeddings

In [9]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Pre-computed artefacts read from the working directory.
vocab = _load_pickle("vocab.p")
embeddings = _load_pickle("embeddings.p")
word_dim = _load_pickle("word_dim.p")

### Data Preprocessing

In [12]:
# Occurrence count of every whitespace-separated token in the training file.
wordcount = Counter()
with open(train_data_file, encoding="ISO-8859-1") as ifh:
    wordcount.update(ifh.read().split())

In [18]:
# Total of all values currently in `wordcount`. Deliberately not called `sum`,
# which would shadow the builtin for the rest of the kernel session. It is kept
# as a float (no int() truncation) so that re-running this cell, when
# `wordcount` already holds fractions, cannot end up dividing by 0.
total_count = float(np.sum(list(wordcount.values())))

# Turn each count into its relative frequency (count / total).
wordcount = {k: float(count) / total_count for k, count in wordcount.items()}

# One Word object per token, built from the token and its relative frequency.
# NOTE(review): the meaning of the 0.0001 and 0.01 arguments is defined by
# Word in baseline.pattern_extractor and is not visible here. TODO confirm.
words = {k: Word(k, freq, 0.0001, 0.01) for k, freq in wordcount.items()}

In [20]:
# Starts empty; passed to add_patterns() in the cells below.
patterns = {}


def _load_docs(path):
    """Return one whitespace-split token list per line of the file at `path`."""
    with open(path, encoding='ISO-8859-1') as input_file:
        return [line.rstrip().split() for line in input_file]


train_docs = _load_docs(train_data_file)
dev_docs = _load_docs(dev_data_file)
test_docs = _load_docs(test_file)

In [22]:
# Read the label file of each split, in train / dev / test order.
train_labels, dev_labels, test_labels = [
    read_labels(label_path)
    for label_path in (train_label_file, dev_label_file, test_label)
]

In [26]:
# First pass over the training documents: call add_patterns() without a trie or
# feature matrix. Presumably this accumulates per-pattern counts into `patterns`
# (the later cells read patterns[k] as a number). Verify against add_patterns.
# NOTE(review): the positional arguments 6, True, 1 are defined by add_patterns
# in baseline.pattern_extractor; their meaning is not visible here.
for doc in train_docs:
    add_patterns(doc, words, patterns, 6, True, 1)

In [27]:
# Cut-off equal to 0.1% of the number of training documents.
thr = 0.001*len(train_docs)

In [28]:
# Keep only the patterns whose stored value reaches the threshold.
patterns = {
    pattern: value
    for pattern, value in patterns.items()
    if value >= thr
}

In [30]:
# Store each retained pattern's value on the pattern object itself and keep a
# running total of those values in `s`.
s = 0
for p, freq in patterns.items():
    p.set_freq(freq)
    s += freq

In [31]:
# Retained patterns as a list, in the dict's iteration order.
pattern_keys = list(patterns)

In [33]:
# Build the trie over the retained patterns; it is passed to add_patterns()
# in the feature-extraction cell below.
trie = build_trie(pattern_keys)

In [35]:
# One sparse matrix per split: a row per document, a column per retained pattern.
# NOTE(review): only the training matrix is given dtype=np.int8; dev and test
# use lil_matrix's default dtype. Confirm the mismatch is intentional.
n_patterns = len(patterns)
train_features = lil_matrix((len(train_docs), n_patterns), dtype=np.int8)
dev_features = lil_matrix((len(dev_docs), n_patterns))
test_features = lil_matrix((len(test_docs), n_patterns))

In [36]:
# Second pass: call add_patterns() on every document of every split, this time
# with the trie, the split's feature matrix and the document's row index.
for docs, features in (
    (train_docs, train_features),
    (dev_docs, dev_features),
    (test_docs, test_features),
):
    for i, doc in enumerate(docs):
        add_patterns(doc, words, patterns, 6, True, 1, trie, features, i)

### Classifier

In [37]:
# NOTE: `train` here is baseline.pattern_extractor.train. The notebook imports a
# `train` from soft_patterns first and then from baseline.pattern_extractor, so
# the later import rebinds the name.
# The call prints train/dev accuracy for each value it tests (see the cell
# output) and returns the classifier used for prediction below.
clf = train(train_features, train_labels, dev_features, dev_labels)

Testing 1
Train: 0.9935483870967742, dev: 0.8127147766323024
Testing 0.5
Train: 0.9853763440860215, dev: 0.8092783505154639
Testing 0.1
Train: 0.9113978494623656, dev: 0.7783505154639175
Testing 0.05




Train: 0.8576344086021506, dev: 0.7474226804123711
Testing 0.01
Train: 0.7281720430107527, dev: 0.6718213058419243
Testing 0.005
Train: 0.6946236559139785, dev: 0.6460481099656358
Testing 0.001
Train: 0.68, dev: 0.6460481099656358
Num of params =  (7688,)


### Results

In [38]:
# Predict a label for every test document from its pattern features.
pred = clf.predict(test_features)

In [40]:
# Test accuracy: fraction of predictions equal to the gold labels.
# Assumes `pred` is an array whose == compares element-wise against
# test_labels. TODO confirm both have the same length.
(pred == test_labels).mean()

0.8321870701513068