## Create and serialize scaler and final model

----

In [23]:
import sys
import dill
import itertools
import os
import json
import csv
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine, braycurtis, canberra, cityblock, chebyshev, minkowski
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

if '..' not in sys.path:
    sys.path.append('..')

from src.document import Document    
from src.text_chunk import TextChunk

# 1. Training a model and saving it:

Load the train / validation feature vectors

In [2]:
# Load the feature tables for the train and validation splits (per the file names,
# already standard-scaled — TODO confirm). The first CSV column becomes the row index.
# Later cells read columns 'A_<i>', 'B_<i>' and 'different_author' from these frames.
train = pd.read_csv('../data/train_features_scaled_standard.csv', index_col=0)
val = pd.read_csv('../data/validation_features_scaled_standard.csv', index_col=0)

In [7]:
# Each row holds two feature vectors (columns A_<i> and B_<i>) plus one extra
# column (the 'different_author' label), so the per-vector feature count is
# half of the remaining columns.
NUMBER_OF_FEATURES = (train.shape[1] - 1) // 2
# Sanity check against the expected size of the feature set.
assert NUMBER_OF_FEATURES == 938

Define similarity/distance measures

In [8]:
def minmax(a, b):
    """Return the min-max similarity of two vectors.

    The score is the sum of the element-wise minima divided by the sum of
    the element-wise maxima.

    a, b: array-likes accepted by ``np.minimum`` / ``np.maximum``.
    """
    lower = np.minimum(a, b)
    upper = np.maximum(a, b)
    return sum(lower) / sum(upper)

def similarities(vectors):
    """Compute the five similarity/distance measures for one row of paired features.

    vectors: a row-like object indexable by the keys 'A_0', 'A_1', ... and
        'B_0', 'B_1', ... for every index below NUMBER_OF_FEATURES.

    Returns a 5-tuple in the order
    (minmax, cosine, braycurtis, canberra, cityblock).
    """
    feature_ids = range(NUMBER_OF_FEATURES)
    first = [vectors['A_{}'.format(idx)] for idx in feature_ids]
    second = [vectors['B_{}'.format(idx)] for idx in feature_ids]

    measures = (minmax, cosine, braycurtis, canberra, cityblock)
    return tuple(measure(first, second) for measure in measures)

## Model - SVC based on all similarity measures, all features, no weights, standard scaling + final standard scaling of calculated similarities

In [9]:
# Column names, listed in the same order as the tuple returned by `similarities`.
similarity_measures = [
    'minmax_similarity',
    'cosine_distance',
    'braycurtis_distance',
    'canberra_distance',
    'cityblock_distance',
]

# Train split: one row of measures per feature-vector pair, plus the label column.
train_similarities = train.apply(similarities, axis=1).apply(pd.Series)
train_similarities.columns = similarity_measures
train_similarities['different_author'] = train['different_author']

# Validation split: same computation.
val_similarities = val.apply(similarities, axis=1).apply(pd.Series)
val_similarities.columns = similarity_measures
val_similarities['different_author'] = val['different_author']

In [10]:
# Preview the first rows of the computed training similarities.
train_similarities.head()

Unnamed: 0,minmax_similarity,cosine_distance,braycurtis_distance,canberra_distance,cityblock_distance,different_author
0,-1.055474,0.912823,0.71696,398.480153,605.604589,True
1,-1.053584,0.916162,0.721877,400.251424,608.193625,True
2,-1.205837,0.937646,0.647488,289.809889,619.982495,True
3,-1.049284,0.763965,0.597759,326.068801,566.108841,False
4,-0.71436,0.733612,0.650287,393.313379,560.358946,False


In [11]:
# Split each similarity table into model inputs (the five measures) and the
# target column 'different_author'.
t_x = train_similarities[similarity_measures]
t_y = train_similarities['different_author']
v_x = val_similarities[similarity_measures]
v_y = val_similarities['different_author']

In [12]:
# Fit the scaler on the training similarities only, then apply that same
# transformation to both splits so validation statistics never leak into it.
similarities_scaler = StandardScaler().fit(t_x)
t_x = similarities_scaler.transform(t_x)
v_x = similarities_scaler.transform(v_x)

In [13]:
# Train an SVC with default hyper-parameters on the scaled similarity measures.
svc = SVC()
svc.fit(t_x, t_y)

# Accuracy = share of pairs whose predicted label equals the true label.
print('train accuracy: ', (svc.predict(t_x) == t_y).mean())
print('val accuracy: ', (svc.predict(v_x) == v_y).mean())

train accuracy:  0.783779207586
val accuracy:  0.779416809605


In [28]:
# Display the fitted estimator (its repr lists the hyper-parameters in use).
svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

Save scaler:

In [33]:
# Persist the fitted similarities scaler so it can be reloaded for the
# document-level evaluation.
with open('../data/similarities_scaler.pk', 'wb') as f:
    dill.dump(similarities_scaler, f)

Save SVC model:

In [29]:
# Persist the trained classifier next to the scaler.
with open('../data/svc.pk', 'wb') as f:
    dill.dump(svc, f)

# 2. Evaluate model on final task

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# can hide real problems (e.g. deprecations raised by the scalers/model below).
warnings.filterwarnings('ignore')

Load needed models and scalers

In [39]:
# NOTE(review): dill.load can execute arbitrary code stored in the file — only
# load artifacts from a trusted source.

# Scaler applied to raw chunk feature vectors. This file is not written by this
# notebook — presumably produced by an earlier preprocessing step (TODO confirm).
with open('../data/standard_scaler.pk', 'rb') as f:
    VECTORS_SCALER = dill.load(f)
    
# Scaler for the five similarity measures.
with open('../data/similarities_scaler.pk', 'rb') as f:
    SIMILARITIES_SCALER = dill.load(f)
    
# NOTE(review): this rebinds the name SVC from the sklearn class (imported at the
# top) to the loaded model instance, so re-running `svc = SVC()` afterwards fails.
with open('../data/svc.pk', 'rb') as f:
    SVC = dill.load(f)

Import raw docs

In [31]:
def get_data(path):
    """Load consecutively numbered problems from a directory as Document objects.

    Reads 'problem-<i>.txt' (raw text) and 'problem-<i>.truth' (JSON) for
    i = 1, 2, ... and stops at the first index for which either file is missing.

    path: directory containing the problem files.

    Returns a list of Document instances, each built from the text and with the
    JSON object's keys passed as keyword arguments.
    """
    data = []

    for i in itertools.count(start=1):
        base = os.path.join(path, 'problem-' + str(i))
        try:
            # Context managers close both files promptly instead of leaking handles.
            with open(base + '.txt', 'r') as text_file:
                text = text_file.read()
            with open(base + '.truth') as truth_file:
                changes = json.load(truth_file)
        except FileNotFoundError:
            # A missing file marks the end of the numbered sequence.
            break
        else:
            # Kept outside the try body so errors raised by Document are not
            # mistaken for "no more files".
            data.append(Document(text, **changes))

    return data

# Load the raw documents (text plus ground truth) for both splits.
train_docs = get_data('../data/train_raw')
validation_docs = get_data('../data/validation_raw')

In [32]:
def similarities(a, b):
    """Compute the five similarity/distance measures between two vectors.

    Replaces the earlier row-based definition of the same name: here the two
    vectors are passed directly.

    Returns a 5-tuple in the order
    (minmax, cosine, braycurtis, canberra, cityblock).
    """
    measures = (minmax, cosine, braycurtis, canberra, cityblock)
    return tuple(measure(a, b) for measure in measures)

In [40]:
def classify_document(document):
    """
    Predict whether a document contains a style change, using 3 sentence-based chunks.

    The sentences are split into 3 consecutive chunks of roughly equal sentence
    count (the last chunk also takes any remainder). For each of the 3 chunk pairs
    the similarity measures are computed, scaled and classified by the loaded model.

    document: object exposing ``sentences``, a sliceable sequence of strings.

    Returns True if any pair is predicted to have a different author.
    """
    third = len(document.sentences) // 3
    splits = [None, third, third * 2, None]
    chunks = [TextChunk(' '.join(document.sentences[start:end]))
              for start, end in zip(splits[:-1], splits[1:])]
    vectors = [VECTORS_SCALER.transform(chunk.to_vector()) for chunk in chunks]

    # itertools.combinations yields every unordered pair of chunk vectors once;
    # one DataFrame row of measures is built per pair.
    chunk_similarities = pd.DataFrame(
        [similarities(first_vector, second_vector)
         for first_vector, second_vector in itertools.combinations(vectors, 2)],
        columns=similarity_measures)

    chunk_similarities = SIMILARITIES_SCALER.transform(chunk_similarities)

    # SVC is the loaded model instance here, not the sklearn class.
    return any(SVC.predict(chunk_similarities))

In [42]:
# Store the document-level prediction on each validation document.
for document in validation_docs:
    document.predicted = classify_document(document)

Accuracy:

In [43]:
# Ground-truth flags and predictions, aligned by position in validation_docs.
actual = pd.Series([document.has_changes for document in validation_docs])
predicted = pd.Series([document.predicted for document in validation_docs])

In [44]:
# Accuracy: fraction of validation documents whose prediction equals the ground truth.
(actual == predicted).mean()

0.63806970509383376

## Same approach, but classify a document as having a style change only if at least two pairs of chunks are predicted to be by different authors

### Rationale: a genuine style change cannot yield only 1 different-author pair out of 3 — if two chunks differ, the third chunk must differ from at least one of them

In [50]:
def classify_document(document):
    """
    Predict whether a document contains a style change, using 3 sentence-based chunks
    and a two-pair threshold.

    The sentences are split into 3 consecutive chunks of roughly equal sentence
    count (the last chunk also takes any remainder). For each of the 3 chunk pairs
    the similarity measures are computed, scaled and classified by the loaded model.

    document: object exposing ``sentences``, a sliceable sequence of strings.

    Returns a truthy value if at least two pairs are predicted to have a different author.
    """
    third = len(document.sentences) // 3
    splits = [None, third, third * 2, None]
    chunks = [TextChunk(' '.join(document.sentences[start:end]))
              for start, end in zip(splits[:-1], splits[1:])]
    vectors = [VECTORS_SCALER.transform(chunk.to_vector()) for chunk in chunks]

    # itertools.combinations yields every unordered pair of chunk vectors once;
    # one DataFrame row of measures is built per pair.
    chunk_similarities = pd.DataFrame(
        [similarities(first_vector, second_vector)
         for first_vector, second_vector in itertools.combinations(vectors, 2)],
        columns=similarity_measures)

    chunk_similarities = SIMILARITIES_SCALER.transform(chunk_similarities)

    # SVC is the loaded model instance here; summing its predictions counts
    # the pairs flagged as "different author".
    return sum(SVC.predict(chunk_similarities)) >= 2

In [51]:
# Re-run prediction with the most recently defined classify_document,
# overwriting each document's previous prediction.
for document in validation_docs:
    document.predicted = classify_document(document)

Accuracy:

In [52]:
# Ground-truth flags and predictions, aligned by position in validation_docs.
actual = pd.Series([document.has_changes for document in validation_docs])
predicted = pd.Series([document.predicted for document in validation_docs])

In [53]:
# Accuracy: fraction of validation documents whose prediction equals the ground truth.
(actual == predicted).mean()

0.65683646112600536

## Try splitting the document into parts of equal length (instead of an equal number of sentences)

In [None]:
# Leftover scratch cell: builds an empty Series whose value is never assigned.
pd.Series()

In [61]:
def classify_document(document):
    """
    Predict whether a document contains a style change, using 3 character-based chunks
    and a two-pair threshold.

    The text is cut into 3 consecutive chunks of roughly equal character length
    (the last chunk also takes any remainder). For each of the 3 chunk pairs the
    similarity measures are computed, scaled and classified by the loaded model.

    document: object exposing ``text``, the document's full text as a string.

    Returns a truthy value if at least 2 pairs are predicted to have a different author.
    """
    third = len(document.text) // 3
    chunks = [TextChunk(document.text[:third]),
              TextChunk(document.text[third:third * 2]),
              TextChunk(document.text[third * 2:])]

    vectors = [VECTORS_SCALER.transform(chunk.to_vector()) for chunk in chunks]

    # itertools.combinations yields every unordered pair of chunk vectors once;
    # one DataFrame row of measures is built per pair.
    chunk_similarities = pd.DataFrame(
        [similarities(first_vector, second_vector)
         for first_vector, second_vector in itertools.combinations(vectors, 2)],
        columns=similarity_measures)

    chunk_similarities = SIMILARITIES_SCALER.transform(chunk_similarities)

    # SVC is the loaded model instance here; summing its predictions counts
    # the pairs flagged as "different author".
    return sum(SVC.predict(chunk_similarities)) >= 2

In [62]:
# Re-run prediction with the most recently defined classify_document,
# overwriting each document's previous prediction.
for document in validation_docs:
    document.predicted = classify_document(document)

Accuracy:

In [63]:
# Ground-truth flags and predictions, aligned by position in validation_docs.
actual = pd.Series([document.has_changes for document in validation_docs])
predicted = pd.Series([document.predicted for document in validation_docs])

In [64]:
# Accuracy: fraction of validation documents whose prediction equals the ground truth.
(actual == predicted).mean()

0.63538873994638068

## Split into two parts of equal length (instead of three)

In [69]:
def classify_document(document):
    """
    Predict whether a document contains a style change, using 2 character-based chunks.

    The text is cut at its midpoint into 2 chunks of roughly equal character
    length (the second chunk takes the extra character for odd lengths). The
    similarity measures between the two chunks are computed, scaled and
    classified by the loaded model.

    document: object exposing ``text``, the document's full text as a string.

    Returns True if the pair is predicted to have a different author.
    """
    middle = len(document.text) // 2
    chunks = [TextChunk(document.text[:middle]),
              TextChunk(document.text[middle:])]

    vectors = [VECTORS_SCALER.transform(chunk.to_vector()) for chunk in chunks]

    # With two chunks itertools.combinations yields exactly one pair, so the
    # DataFrame has a single row of measures.
    chunk_similarities = pd.DataFrame(
        [similarities(first_vector, second_vector)
         for first_vector, second_vector in itertools.combinations(vectors, 2)],
        columns=similarity_measures)

    chunk_similarities = SIMILARITIES_SCALER.transform(chunk_similarities)

    # SVC is the loaded model instance here, not the sklearn class.
    return any(SVC.predict(chunk_similarities))

In [None]:
# Re-run prediction with the most recently defined classify_document,
# overwriting each document's previous prediction.
for document in validation_docs:
    document.predicted = classify_document(document)

Accuracy:

In [None]:
# Ground-truth flags and predictions, aligned by position in validation_docs.
actual = pd.Series([document.has_changes for document in validation_docs])
predicted = pd.Series([document.predicted for document in validation_docs])

In [None]:
# Accuracy: fraction of validation documents whose prediction equals the ground truth.
(actual == predicted).mean()

## Split to two parts with equal number of sentences

In [None]:
def classify_document(document):
    """
    Predict whether a document contains a style change, using 2 sentence-based chunks.

    The sentences are split at the midpoint into 2 chunks of roughly equal
    sentence count (the second chunk takes the extra sentence for odd counts).
    The similarity measures between the two chunks are computed, scaled and
    classified by the loaded model.

    document: object exposing ``sentences``, a sliceable sequence of strings.

    Returns True if the pair is predicted to have a different author.
    """
    middle = len(document.sentences) // 2
    chunks = [TextChunk(' '.join(document.sentences[:middle])),
              TextChunk(' '.join(document.sentences[middle:]))]

    vectors = [VECTORS_SCALER.transform(chunk.to_vector()) for chunk in chunks]

    # With two chunks itertools.combinations yields exactly one pair, so the
    # DataFrame has a single row of measures.
    chunk_similarities = pd.DataFrame(
        [similarities(first_vector, second_vector)
         for first_vector, second_vector in itertools.combinations(vectors, 2)],
        columns=similarity_measures)

    chunk_similarities = SIMILARITIES_SCALER.transform(chunk_similarities)

    # SVC is the loaded model instance here, not the sklearn class.
    return any(SVC.predict(chunk_similarities))

In [None]:
# Re-run prediction with the most recently defined classify_document,
# overwriting each document's previous prediction.
for document in validation_docs:
    document.predicted = classify_document(document)

Accuracy:

In [None]:
# Ground-truth flags and predictions, aligned by position in validation_docs.
actual = pd.Series([document.has_changes for document in validation_docs])
predicted = pd.Series([document.predicted for document in validation_docs])

In [None]:
# Accuracy: fraction of validation documents whose prediction equals the ground truth.
(actual == predicted).mean()