In [2]:
import utils

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import jaccard_similarity_score

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import scipy

In [3]:
def load_data(path):
    data = pd.read_csv(path)
    data['Жалобы (ngramm)'] = data['Жалобы (ngramm)'].fillna('')
    
    return data

In [4]:
train = load_data('data/train_data_complaints_repeats_doctors.csv')

In [5]:
train.head(3)

Unnamed: 0.1,Unnamed: 0,Id_Записи,Id_Пациента,Возраст,Диагноз,Жалобы,Источник_рекламы,Клиника,Код_диагноза,Пол,Услуга,Жалобы (ngramm),Жалобы (unigramm),Врач,Повторный приём
0,0,0,115819,54,Гипертензивная болезнь сердца [гипертоническая...,"на повышение ад утром до 140/90 мм.рт.ст., пер...",Другое,5,I11,2,"Прием врача-кардиолога повторный, амбулаторный",повышение_ада,повышение ада утром мм рт ст периодич головокр...,кардиолог,1
1,1,1,399973,32,Доброкачественное новообразование молочной железы,На наличие опухоли в левой молочной железе,Другое,3,D24,2,"Прием врача-онколога (маммолога), повторный, а...",наличие_опухоль левый_молочный_железо,наличие опухоль левый молочный железо,онколог,1
2,2,2,427563,72,Простой хронический бронхит,Активных жалоб нет.,Интернет,6,J41.0,2,Прием первичный врача-пульмонолога,активный_жалоба,активный жалоба,пульмонолог,0


In [6]:
complaints = train['Жалобы (unigramm)']
tfidf_complaints = TfidfVectorizer(ngram_range=(1,1), min_df=10, stop_words=stopwords.words('russian'))
tfidf_complaints.fit_transform(complaints)

<61976x1446 sparse matrix of type '<class 'numpy.float64'>'
	with 365263 stored elements in Compressed Sparse Row format>

In [7]:
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

class DoctorsPopularityTransformator(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        doctors = x.fillna('sss')
        doctors_voc, counts = np.unique(doctors, return_counts=True)
        self.pop_doctor = doctors_voc[np.argsort(counts)[::-1][0]]
        
        return self

    def transform(self, x):
        x = x.fillna('sss')
        x[x == 'sss'] = self.pop_doctor
        
        return x
    
class GenderTransformator(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        x = x.copy()
        x[x == 1] = 0
        x[x == 2] = 1
        
        return np.expand_dims(x, axis=1)
    
class AgeTransformator(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        return np.expand_dims(x, axis=1)

In [8]:
def preproc_target_train(data, reduce_classes=False):
    diagnoses = data['Код_диагноза'].copy()
    
    if reduce_classes:
        pop_diagnoses = set(utils.get_most_popular_diagnoses(diagnoses, percent=.80))
        most_pop_diagnose = scipy.stats.mode(diagnoses)[0][0]
    else:
        pop_diagnoses = set(diagnoses)
        most_pop_diagnose = scipy.stats.mode(diagnoses)[0][0]
    
    diagnoses = diagnoses.apply(
        lambda diag: diag if diag in pop_diagnoses else most_pop_diagnose
    )
    
    return diagnoses, pop_diagnoses, most_pop_diagnose

def preproc_target_test(data, pop_diagnoses, most_pop_diagnose):
    diagnoses = data['Код_диагноза'].copy()
    
    diagnoses = diagnoses.apply(
        lambda diag: diag if diag in pop_diagnoses else most_pop_diagnose
    )
    
    return diagnoses, pop_diagnoses, most_pop_diagnose

In [33]:
pipe = Pipeline([
    ('union', FeatureUnion(
        transformer_list = [
            ('complaints_pipe', Pipeline([
                ('complaint_selector', ItemSelector(key='Жалобы (unigramm)')),
                ('tfidf', TfidfVectorizer(ngram_range=(1,1), min_df=10, stop_words=stopwords.words('russian')))
            ])),
#             ('complaints_n_pipe', Pipeline([
#                 ('complaint_n_selector', ItemSelector(key='Жалобы (ngramm)')),
#                 ('tfidf', TfidfVectorizer(ngram_range=(1,1), min_df=1, stop_words=stopwords.words('russian')))
#             ])),
#             ('doctor_pipe', Pipeline([
#                 ('doctor_selector', ItemSelector(key='Врач')),
#                 ('doc_pop', DoctorsPopularityTransformator()),
#                 ('count_vect', CountVectorizer())
#             ])),
#             ('gender_pipe', Pipeline([
#                 ('gender_selector', ItemSelector(key='Пол')),
#                 ('gender_transform', GenderTransformator())
#             ])),
#             ('age_pipe', Pipeline([
#                 ('age_selector', ItemSelector(key='Возраст')),
#                 ('age_transformator', AgeTransformator())
#             ]))
        ]
    )),
    ('clf', RandomForestClassifier(n_estimators=1000, n_jobs=-1, max_depth=20))
])

In [62]:
train = load_data('data/train_data_complaints_repeats_doctors.csv')
train, valid = train_test_split(train, test_size=0.5)

In [63]:
train_y, pop_diagnoses, most_pop_diagnose = preproc_target_train(train, reduce_classes=True)
valid_y, _, _ = preproc_target_test(valid, pop_diagnoses, most_pop_diagnose)



In [64]:
np.unique(train_y).shape

(195,)

In [65]:
pipe.fit(train, train_y)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('complaints_pipe', Pipeline(memory=None,
     steps=[('complaint_selector', ItemSelector(key='Жалобы (unigramm)')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, e..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [66]:
# len(pipe.steps[-2][-1].transformer_list[1][1].steps[1][1].vocabulary_)

IndexError: list index out of range

In [77]:
pred = pipe.predict(valid)

In [78]:
jaccard_similarity_score(np.array(valid_y), pred)

0.32838518136052663

In [79]:
np.unique(pred, return_counts=True)

(array(['B07', 'B37.3+', 'D23.9', 'D24', 'D25', 'H52.1', 'H60', 'H65',
        'H68.1', 'I11', 'I83.9', 'J00', 'J01.0', 'J01.8', 'J06.9', 'J30',
        'J30.1', 'J30.3', 'J31.0', 'J31.1', 'J34.2', 'J35.0', 'J35.8',
        'K21.0', 'K29.5', 'K29.9', 'K30', 'M41', 'M42', 'M42.1', 'M51.3',
        'M53.0', 'M54', 'M54.2', 'M54.4', 'M54.5', 'M65', 'N41.1', 'N60.1',
        'N72', 'N76.0', 'N76.1', 'N77.1*', 'N86', 'Z00.0', 'Z01.4',
        'Z01.8', 'Z32.1', 'Z34.0'], dtype=object),
 array([   38,    66,    55,     1,     1,   218,     2,     2,     3,
          374,    49,   248,   360,     1, 23375,     1,    19,    80,
            4,    21,     7,   262,     9,     4,    95,     2,   585,
            5,     1,  2126,    22,     5,    18,     7,    91,    34,
           62,   108,   300,    23,   760,    53,   251,     2,     2,
            3,   477,   739,    17]))

## Submit

In [48]:
train = load_data('data/train_data_complaints_repeats_doctors.csv')
train_y, pop_diagnoses, most_pop_diagnose = preproc_target_train(train, reduce_classes=True)



In [49]:
pipe.fit(train, train_y)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('complaints_pipe', Pipeline(memory=None,
     steps=[('complaint_selector', ItemSelector(key='Жалобы (unigramm)')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, e..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [71]:
test = load_data('data/test_data_complaints_repeats_doctors.csv')

In [73]:
test_pred = pipe.predict(test)

In [74]:
submit = pd.DataFrame({'Id_Записи': test['Id_Записи'], 'Код_диагноза': test_pred})

In [75]:
submit['Код_диагноза'].value_counts()

J06.9     22263
M42.1      2101
N76.0       835
Z32.1       810
K30         539
Z01.8       490
J01.0       358
I11         328
J35.0       293
N60.1       281
J00         265
N77.1*      241
H52.1       220
N41.1       114
M54.4       114
K29.5        92
B37.3+       73
J30.3        66
M65          63
I83.9        61
D23.9        58
N76.1        57
B07          39
M54.5        33
N72          28
M51.3        24
J30.1        21
J31.1        21
J34.2        16
M54          15
Z34.0        13
J35.8        10
J31.0         6
K29.9         5
K21.0         5
M54.2         4
M41           4
H68.1         4
J30           3
H65           3
M53.0         3
H52.0         2
E04.1         2
M42           2
Z01.4         2
J31.2         2
N84.0         2
K86.1         1
J03           1
J01.8         1
D24           1
G44.2         1
K29.3         1
H61.2         1
M53.8         1
H60           1
Name: Код_диагноза, dtype: int64

In [76]:
submit.to_csv('submit/bow_rfc_diag_200_compl_uni_w-o_age_gender.csv', header=True, index=False)