In [None]:
!pip install tqdm

In [18]:
import pandas as pd
import numpy as np
import re

from tqdm import tqdm
from collections import defaultdict
from sklearn.utils.extmath import softmax
from matplotlib import pyplot as plt
from collections import Counter

%matplotlib inline

In [2]:
train_topic_vectors = np.load('topics_train_ngramm.npy')
test_topic_vectors = np.load('topics_test_ngramm.npy')

train_topic_vectors[train_topic_vectors < 0.0001] = 0
test_topic_vectors[test_topic_vectors < 0.0001] = 0

topics_count = train_topic_vectors.shape[1]
topics_count

355

In [3]:
train_df = pd.read_csv('train_data_complaints_repeats_doctors.csv').fillna('')
test_df = pd.read_csv('test_data_complaints_repeats_doctors.csv').fillna('')

In [65]:
active_topics = 250
topic_names = ['topic_{}'.format(i) for i in range(active_topics)]

train_df[topic_names] = pd.DataFrame(train_topic_vectors[:, :active_topics])
test_df[topic_names] = pd.DataFrame(train_topic_vectors[:, :active_topics])

In [66]:
freq_diagnoses = train_df.groupby(['Id_Пациента']).apply(lambda x: np.argmax(x.Код_диагноза.value_counts()))

In [67]:
MAX_AGE = 98

train_df.Возраст = np.maximum(train_df.Возраст.values, MAX_AGE)
test_df.Возраст = np.maximum(test_df.Возраст.values, MAX_AGE)

In [73]:
min_freq_threshold = 5
classes_count = (train_df.Код_диагноза.value_counts() > min_freq_threshold).sum() + 1
classes_count

767

In [74]:
classes = list((train_df.Код_диагноза.value_counts() > min_freq_threshold)[
    (train_df.Код_диагноза.value_counts() > min_freq_threshold)].keys())

In [75]:
class_to_idx = {c: i for i, c in enumerate(classes, start=1)}
idx_to_class = {i: c for i, c in enumerate(classes, start=1)}

In [76]:
def df2vw(features_extractor, df, out_filename, train=True):
    file = open(out_filename, 'w', encoding='utf8')
    for row in tqdm(df.iterrows()):
        features = features_extractor(row)
        if train:
            label = class_to_idx.get(row[1].Код_диагноза, len(class_to_idx) + 1)
        else:
            label = ''
        file.write('{} {}\n'.format(label, features))
    file.close()

In [77]:
def extract_ngram_body(row, ngram=3):
    body_text = row[1]['Жалобы']
    body_text = re.sub('[\|\:]', '䷀', body_text)
    body_text = re.sub('\n', '䷚', body_text)
    
    def compute_word_ngrams(counter, word, ngram):
        for index in range(1, len(word) - ngram + 1):
            counter[word[index:(index + ngram)]] += 1
            
    ngrams_dict = Counter()
    for word in body_text.split(' '):
        compute_word_ngrams(ngrams_dict, word, ngram)
        
    return ' '.join('{}:{}'.format(*item) for item in ngrams_dict.items())

In [78]:
def extract_words(row):
    return '|doctor ' + str(row[1].Врач) + ' ' + \
           '|clinic ' + str(row[1].Клиника) + ' ' + \
           '|gender ' + str(row[1].Пол) + ' ' + \
           '|age_feature:' + str(MAX_AGE) + ' age:' + str(row[1].Возраст) + ' ' + \
           '|topics ' + ' '.join(topic_name + ':' + str(row[1][topic_name])
                                  for topic_name in topic_names) + ' ' + \
           '|char_ngrams ' + extract_ngram_body(row) + ' ' + \
           '|complaint_ngram ' + str(row[1]['Жалобы (ngramm)']) + ' ' + \
           '|complaint_uni ' + str(row[1]['Жалобы (unigramm)'])

In [79]:
df2vw(extract_words, train_df, 'train.vw')

61976it [01:45, 586.51it/s]


In [80]:
df2vw(extract_words, test_df, 'test.vw', train=False)

30000it [00:50, 597.49it/s]


In [81]:
! rm cache_file.vw

In [82]:
! vw -d train.vw --loss_function logistic --oaa $classes_count -f model --threads -b 22 --passes 5 --cache_file cache_file.vw

final_regressor = model
Num weight bits = 22
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = cache_file.vw
Reading datafile = train.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0        9        1       48
1.000000 1.000000            2            2.0      156        9       32
1.000000 1.000000            4            4.0       15      156       43
1.000000 1.000000            8            8.0       13      156       22
1.000000 1.000000           16           16.0      126      767       50
0.937500 0.875000           32           32.0       16       26       61
0.937500 0.937500           64           64.0      191      767       16
0.898438 0.859375          128          128.0        1        1      100
0.890625 0.882812          256          256.0       45       13       84


In [83]:
! vw -i model -t test.vw -r pred.out --threads

only testing
raw predictions = pred.out
Num weight bits = 22
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = test.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
    n.a.     n.a.            1            1.0  unknown      767       61
    n.a.     n.a.            2            2.0  unknown      767       32
    n.a.     n.a.            4            4.0  unknown      767       22
    n.a.     n.a.            8            8.0  unknown      767       58
    n.a.     n.a.           16           16.0  unknown      767       21
    n.a.     n.a.           32           32.0  unknown      767       83
    n.a.     n.a.           64           64.0  unknown        1       91
    n.a.     n.a.          128          128.0  unknown      767       30
    n.a.     n.a.          256          256.0  unknown      767       38
    n.a.     n.a.          512

In [87]:
def vw2probs(pred_name):
    predictions = []
    with open(pred_name) as file:
        for line in file.readlines():
            pred = [float(token.split(':')[1]) for token in line.split()]
            predictions.append(softmax(np.array([pred]))[0].argmax() + 1) # Predictions are from 1 to classes_count
    return np.array(predictions)

In [88]:
result = vw2probs('pred.out')

In [90]:
nontrivial_diagnosis = (result != classes_count).sum()
nontrivial_diagnosis

4553

In [91]:
test_df['Код_диагноза'] = [idx_to_class.get(i, '') for i in result]

In [92]:
test_df['Код_диагноза'][test_df['Код_диагноза'] == ''] = test_df[test_df['Код_диагноза'] == '']\
                            .apply(lambda row: freq_diagnoses.get(row.Id_Пациента, 'J06.9'), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [93]:
test_df[['Id_Записи', 'Код_диагноза']].to_csv('./submission_vw.csv', index=False, encoding='utf8')