In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import defaultdict
from sklearn.utils.extmath import softmax

In [2]:
train_df = pd.read_csv('train_data.csv', sep=';')
test_df = pd.read_csv('test_data.csv', sep=';')
train_df.head(3)

Unnamed: 0,Id_Записи,Id_Пациента,Возраст,Диагноз,Жалобы,Источник_рекламы,Клиника,Код_диагноза,Пол,Услуга
0,0,115819,54,Гипертензивная болезнь сердца [гипертоническая...,"на повышение ад утром до 140/90 мм.рт.ст., пер...",Другое,5,I11,2,"Прием врача-кардиолога повторный, амбулаторный"
1,1,399973,32,Доброкачественное новообразование молочной железы,На наличие опухоли в левой молочной железе,Другое,3,D24,2,"Прием врача-онколога (маммолога), повторный, а..."
2,2,427563,72,Простой хронический бронхит,Активных жалоб нет.,Интернет,6,J41.0,2,Прием первичный врача-пульмонолога


In [3]:
freq_diagnoses = train_df.groupby(['Id_Пациента']).apply(lambda x: np.argmax(x.Код_диагноза.value_counts()))

In [4]:
min_freq_threshold = 5
classes_count = (train_df.Код_диагноза.value_counts() > min_freq_threshold).sum() + 1
classes_count

767

In [5]:
classes = list((train_df.Код_диагноза.value_counts() > min_freq_threshold)[
    (train_df.Код_диагноза.value_counts() > min_freq_threshold)].keys())

In [6]:
class_to_idx = {c: i for i, c in enumerate(classes, start=1)}
idx_to_class = {i: c for i, c in enumerate(classes, start=1)}

In [7]:
def df2vw(features_extractor, df, out_filename, train=True):
    file = open(out_filename, 'w', encoding='utf8')
    for row in tqdm(df.iterrows()):
        features = features_extractor(row)
        if train:
            label = class_to_idx.get(row[1].Код_диагноза, len(class_to_idx) + 1)
        else:
            label = ''
        file.write('{} {}\n'.format(label, features))
    file.close()

In [8]:
extract_words = lambda row: '|complaint ' + row[1].Жалобы.replace(':', '').lower() + \
                            '|doctor ' + row[1].Услуга.replace(':', '').lower() + \
                            '|age ' + str(row[1].Возраст)

In [9]:
df2vw(extract_words, train_df, 'train.vw')

61976it [00:04, 13216.97it/s]


In [10]:
df2vw(extract_words, test_df, 'test.vw', train=False)

30000it [00:01, 15034.63it/s]


In [11]:
! vw -d train.vw --loss_function logistic --oaa $classes_count -f model

final_regressor = model
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = train.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0        9        1       15
1.000000 1.000000            2            2.0      156        9       14
1.000000 1.000000            4            4.0       15      156       16
1.000000 1.000000            8            8.0       13      156       14
1.000000 1.000000           16           16.0        2       23       12
0.968750 0.937500           32           32.0      688       60       11
0.984375 1.000000           64           64.0        1       60       32
0.929688 0.875000          128          128.0      136        2        9
0.917969 0.906250          256          256.0        1        1       13
0.888672 0.859375          512          512.0

In [12]:
! vw -i model -t test.vw -r pred.out

only testing
raw predictions = pred.out
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = test.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
    n.a.     n.a.            1            1.0  unknown      767       17
    n.a.     n.a.            2            2.0  unknown      767       12
    n.a.     n.a.            4            4.0  unknown      767       10
    n.a.     n.a.            8            8.0  unknown      767       20
    n.a.     n.a.           16           16.0  unknown      767       10
    n.a.     n.a.           32           32.0  unknown      767       24
    n.a.     n.a.           64           64.0  unknown      767       28
    n.a.     n.a.          128          128.0  unknown      767       12
    n.a.     n.a.          256          256.0  unknown      767       13
    n.a.     n.a.          512

In [13]:
def vw2probs(pred_name):
    predictions = []
    with open(pred_name) as file:
        for line in file.readlines():
            pred = [float(token.split(':')[1]) for token in line.split()]
            predictions.append(softmax(np.array([pred]))[0].argmax() + 1) # Predictions are from 1 to classes_count
    return np.array(predictions)

In [14]:
result = vw2probs('pred.out')

In [15]:
nontrivial_diagnosis = (result != classes_count).sum()
nontrivial_diagnosis

2680

In [16]:
test_df['Код_диагноза'] = [idx_to_class.get(i, '') for i in result]

In [17]:
test_df['Код_диагноза'][test_df['Код_диагноза'] == ''] = test_df[test_df['Код_диагноза'] == '']\
                            .apply(lambda row: freq_diagnoses.get(row.Id_Пациента, 'J06.9'), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [18]:
test_df[['Id_Записи', 'Код_диагноза']].to_csv('./submission_vw.csv', index=False, encoding='utf8')