## Import packages

In [1]:
from pathlib import Path
from collections import Counter
from random import shuffle

In [2]:
import numpy as np

## Prepare dataset

In [3]:
counter = Counter()
sequences = Path('bare/')

In [4]:
dataset = [file for sequence in sorted(sequences.iterdir()) for file in sorted(sequence.glob('*.txt'))]

In [5]:
shuffle(dataset)
trainset = dataset[:int(len(dataset) * .7)]
testset = dataset[int(len(dataset) * .7):]

In [6]:
def read(file):
    with open(file) as f:
        for i, line in enumerate(f):
            if i == 2:
                return line.split()

In [7]:
for file in trainset:
    counter.update(read(str(file)))

### remove unused

In [8]:
removes = list(filter(lambda w: not w.isalpha() or len(w) == 1, counter))

In [9]:
for word in removes:
    del counter[word]

## Feature extraction

In [10]:
feature_size = 3000

In [11]:
dictionary = counter.most_common(feature_size)

In [12]:
ID = {word: i for i, (word, _) in enumerate(dictionary)}

In [13]:
def extract(words, dictionary):
    feature = np.zeros(len(dictionary), dtype=np.int32)
    
    for word in filter(lambda w: w in ID, set(words)):
        feature[ID[word]] = words.count(word)
    
    return feature

In [14]:
train_features = np.array([extract(read(str(file)), dictionary) for file in trainset])
train_labels = np.array([file.name.startswith('spm') for file in trainset], dtype=np.bool)

## Fit classifier

In [15]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import confusion_matrix

In [16]:
# model = MultinomialNB()
model = LinearSVC()

In [17]:
model.fit(train_features, train_labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

## Test classifier

In [18]:
from sklearn.metrics import confusion_matrix

In [19]:
test_features = np.array([extract(read(str(file)), dictionary) for file in testset])
test_labels = np.array([file.name.startswith('spm') for file in testset], dtype=np.bool)

In [20]:
prediction = model.predict(test_features)

In [21]:
acc = (prediction == test_labels).sum() / test_labels.size

In [25]:
print(confusion_matrix(test_labels, prediction))

[[704   6]
 [  4 154]]


In [23]:
print(f'Accuracy: {acc}')

Accuracy: 0.988479262672811
