In [20]:
import pandas as pd
import numpy as np
import json
import io

In [21]:
def load_file(filename):
    """Load a JSON-lines file, i.e. one JSON document per line.

    Args:
        filename: Path of a UTF-8 encoded text file to read.

    Returns:
        list: The decoded objects, in the order they appear in the file.

    Raises:
        ValueError: If a non-blank line is not valid JSON (from ``json.loads``).
    """
    data = []
    with io.open(filename, 'r', encoding='utf8') as stream:
        # Iterate the stream lazily instead of materialising every line first.
        for line in stream:
            # Blank lines (e.g. a trailing newline at EOF) carry no record and
            # would otherwise make json.loads raise.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

In [22]:
# Read both datasets fully into memory (one decoded JSON object per line).
train_data, test_data = load_file('train_data.json'), load_file('test_data.json')

print('Train and test loaded in memory')

Train and test loaded in memory


In [23]:
# Distinct final characters of every annotated training sentence.
end_of_sentences = {
    sentence[-1]
    for paragraph in train_data
    for sentence in paragraph['Sentences']
}

In [24]:
# Show the observed sentence-ending characters on one line, '|'-separated.
separator = '|'
print(separator.join(end_of_sentences))

!|"|…|.|»|?


In [25]:
from collections import defaultdict

# Characters treated as candidate sentence terminators. The position of a
# character in this list is its slot in the one-hot part of the feature vector.
# Open question from the author: why did the initial list not contain every
# sentence ending observed in the training data?
good_ends = [u'!', u'…', u'.', u'?', u'"', u'»']

# Reverse lookup: terminator -> list position; any other character yields -1.
good_end_index = defaultdict(
    lambda: -1,
    ((end, position) for position, end in enumerate(good_ends)),
)


In [26]:
from collections import namedtuple

# Padding characters used when the context window runs past the start
# (BEFORE_SYMBOL) or the end (AFTER_SYMBOL) of a paragraph.
BEFORE_SYMBOL = '^'
AFTER_SYMBOL = '_'

# Number of characters of context kept on each side of a candidate symbol.
window_size = 3

# One candidate position: left context, the symbol itself, right context and
# a flag telling whether the symbol ends a sentence.
TrainItem = namedtuple(
    'TrainItem',
    ('text_before', 'symbol', 'text_after', 'is_end'),
)

def get_before_text(paragraph, i):
    """Return the ``window_size`` characters that precede position ``i``.

    When fewer characters are available, the result is left-padded with
    ``BEFORE_SYMBOL`` up to ``window_size`` characters.
    """
    start = max(0, i - window_size)
    window = paragraph[start:i]
    padding = BEFORE_SYMBOL * (window_size - len(window))
    return padding + window

def get_after_text(paragraph, i):
    """Return the ``window_size`` characters that follow position ``i``.

    When fewer characters are available, the result is right-padded with
    ``AFTER_SYMBOL`` up to ``window_size`` characters.
    """
    first = i + 1
    window = paragraph[first:first + window_size]
    missing = window_size - len(window)
    return window + AFTER_SYMBOL * missing
        
def parse_train_instance(paragraph, sentences):
    """Yield a ``TrainItem`` for every candidate terminator in ``paragraph``.

    Sentence end positions are derived by accumulating the sentence lengths,
    advancing one extra position after each sentence (presumably the single
    separator between sentences -- verify against the data). Accumulation
    stops as soon as the running offset exceeds the paragraph length.

    Args:
        paragraph: The paragraph text.
        sentences: The paragraph's sentences, in order.

    Yields:
        TrainItem: One item per character of ``paragraph`` found in
        ``good_ends``, flagged with whether that position ends a sentence.
    """
    total = len(paragraph)
    ends_sentence = [False] * total
    offset = 0
    for sentence in sentences:
        offset += len(sentence)
        if offset > total:
            break
        ends_sentence[offset - 1] = True
        # Step over the position between this sentence and the next one.
        offset += 1

    for position, char in enumerate(paragraph):
        if char in good_ends:
            left = get_before_text(paragraph, position)
            right = get_after_text(paragraph, position)
            yield TrainItem(left, char, right, ends_sentence[position])

In [27]:
# Flatten the per-paragraph candidates into one list of training items.
trains = [
    item
    for train in train_data
    for item in parse_train_instance(train['Paragraph'], train['Sentences'])
]

In [28]:
# Alphabet of every character that occurs in any training window, plus the
# two padding characters. It is built incrementally rather than by joining
# all windows into one throwaway string.
window_alphabet = set(BEFORE_SYMBOL + AFTER_SYMBOL)
for t in trains:
    window_alphabet.update(t.text_before, t.symbol, t.text_after)

# Sorted so that the character -> feature-column mapping is identical on every
# run. Iteration order of a set of strings depends on the per-process hash
# seed, so list(set(...)) would shuffle the columns between kernel restarts.
all_windows_symbols = sorted(window_alphabet)
print(len(all_windows_symbols))

# Character -> column in the per-window count vector; unknown characters -> -1.
window_symbol_index = defaultdict(lambda: -1)
for i, symbol in enumerate(all_windows_symbols):
    window_symbol_index[symbol] = i

340


In [29]:
def calculate_features(item):
    """Convert a ``TrainItem`` into a flat list of integer features.

    Layout of the returned list:
      1. a one-hot vector over ``good_ends`` for ``item.symbol``;
      2. for ``text_before`` and then ``text_after``: three flags per
         character (equals its own upper-case form, is alphabetic, is in
         ``good_ends``), followed by a count vector over
         ``all_windows_symbols``.

    Raises:
        AssertionError: If ``item.symbol`` has no slot in ``good_end_index``.
    """
    symbol_slot = good_end_index[item.symbol]
    assert symbol_slot != -1

    features = [0] * len(good_ends)
    features[symbol_slot] = 1

    for context in (item.text_before, item.text_after):
        counts = [0] * len(all_windows_symbols)
        for char in context:
            # This is also true for characters without case (digits, spaces,
            # punctuation), because upper() leaves them unchanged.
            features.append(int(char.upper() == char))
            features.append(int(char.isalpha()))
            features.append(int(char in good_ends))
            # Characters outside the known alphabet are not counted.
            if char in window_symbol_index:
                counts[window_symbol_index[char]] += 1
        features += counts
    return features

In [30]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
 
# Fixed seed: SGDClassifier shuffles the training data between epochs, so
# without random_state every run would fit a slightly different model.
RANDOM_SEED = 42

# One feature vector and one 0/1 label (1 = ends a sentence) per candidate.
X = [calculate_features(t) for t in trains]
Y = [1 if t.is_end else 0 for t in trains]
print('len(X) = %d, len(Y) = %d' % (len(X), len(Y)))
print('Features and labels calculated, starting training...')
clf = SGDClassifier(random_state=RANDOM_SEED)
clf.fit(X, Y)
print('Training finished!')

len(X) = 91811, len(Y) = 91811
Features and labels calculated, starting training...
Training finished!


In [31]:
def parse_test_instance(paragraph, index):
    """Build the ``TrainItem`` for the candidate at ``paragraph[index]``.

    The label is unknown for test candidates, so ``is_end`` is filled with a
    placeholder ``True``.
    """
    return TrainItem(
        text_before=get_before_text(paragraph, index),
        symbol=paragraph[index],
        text_after=get_after_text(paragraph, index),
        is_end=True,
    )

In [32]:
# Number of rows in the submission; candidate 'Index' values are 1-based and
# are written to row Index - 1 of out_data.
test_size = 26476

# One prediction per row, initialised to 0.
out_data = np.zeros(shape=(test_size, 1))

In [33]:
# Collect the feature vector and target row of every candidate first, then
# classify them all with one predict call instead of one call per candidate.
mark_rows = []
mark_features = []
for p in test_data:
    paragraph = p['Paragraph']
    for cand in p['Marks']:
        item = parse_test_instance(paragraph, cand['Pos'])
        # 'Index' is 1-based; out_data rows are 0-based.
        mark_rows.append(cand['Index'] - 1)
        mark_features.append(calculate_features(item))

# predict() rejects an empty sample list, so skip it when there are no marks.
if mark_features:
    out_data[mark_rows, 0] = clf.predict(mark_features)

In [34]:
# Submission table: a single 'Mark' column, rows labelled 1..test_size as 'Id'.
row_ids = range(1, test_size + 1)
df = pd.DataFrame(data=out_data, index=row_ids, columns=['Mark'])
df.index.name = 'Id'

In [35]:
# Write the predictions to disk; the 'Id' index becomes the first CSV column.
submission_path = "submission.csv"
df.to_csv(submission_path)
print('Submission is ready')

Submission is ready
