In [1]:
import utils

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import jaccard_similarity_score

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import scipy

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train = utils.load_data('data/train_data_complaints_repeats_doctors.csv')

In [3]:
train.head(3)

Unnamed: 0.1,Unnamed: 0,Id_Записи,Id_Пациента,Возраст,Диагноз,Жалобы,Источник_рекламы,Клиника,Код_диагноза,Пол,Услуга,Жалобы (ngramm),Жалобы (unigramm),Врач,Повторный приём
0,0,0,115819,54,Гипертензивная болезнь сердца [гипертоническая...,"на повышение ад утром до 140/90 мм.рт.ст., пер...",Другое,5,I11,2,"Прием врача-кардиолога повторный, амбулаторный",повышение_ада,повышение ада утром мм рт ст периодич головокр...,кардиолог,1
1,1,1,399973,32,Доброкачественное новообразование молочной железы,На наличие опухоли в левой молочной железе,Другое,3,D24,2,"Прием врача-онколога (маммолога), повторный, а...",наличие_опухоль левый_молочный_железо,наличие опухоль левый молочный железо,маммолог онколог,1
2,2,2,427563,72,Простой хронический бронхит,Активных жалоб нет.,Интернет,6,J41.0,2,Прием первичный врача-пульмонолога,активный_жалоба,активный жалоба,пульмонолог,0


In [4]:
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

class DoctorsPopularityTransformator(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        doctors = x.fillna('sss')
        doctors_voc, counts = np.unique(doctors, return_counts=True)
        self.pop_doctor = doctors_voc[np.argsort(counts)[::-1][0]]
        
        return self

    def transform(self, x):
        x = x.fillna('sss')
        x[x == 'sss'] = self.pop_doctor
        
        return x
    
class GenderTransformator(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        x = x.copy()
        x[x == 1] = 0
        x[x == 2] = 1
        
        return np.expand_dims(x, axis=1)
    
class AgeTransformator(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        return np.expand_dims(x, axis=1)
    
class TopicsTransformator(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        return x

class ClinicTransformator(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        clinics, counts = np.unique(x, return_counts=True)
        self.clinics_to_idx = dict(zip(clinics, range(len(clinics))))
        
        self.most_pop_clinic_idx = np.argmax(counts)
        
        return self
    
    def transform(self, x):
        x_vec = np.zeros((x.shape[0], len(self.clinics_to_idx)), dtype=np.float32)
        for i, clinic in enumerate(x):
            if clinic in self.clinics_to_idx:
                x_vec[i, self.clinics_to_idx[clinic]] = 1.
            else:
                x_vec[i, self.most_pop_clinic_idx] = 1.
        
        return x_vec
    
class RepeatsTransformator(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        return np.expand_dims(x, axis=1)

In [5]:
def preproc_target_train(data, reduce_classes=False):
    diagnoses = data['Код_диагноза'].copy()
    
    if reduce_classes:
        pop_diagnoses = set(utils.get_most_popular_diagnoses(diagnoses, percent=.80))
        most_pop_diagnose = scipy.stats.mode(diagnoses)[0][0]
    else:
        pop_diagnoses = set(diagnoses)
        most_pop_diagnose = scipy.stats.mode(diagnoses)[0][0]
    
    diagnoses = diagnoses.apply(
        lambda diag: diag if diag in pop_diagnoses else most_pop_diagnose
    )
    
    return diagnoses, pop_diagnoses, most_pop_diagnose

def preproc_target_test(data, pop_diagnoses, most_pop_diagnose):
    diagnoses = data['Код_диагноза'].copy()
    
    diagnoses = diagnoses.apply(
        lambda diag: diag if diag in pop_diagnoses else most_pop_diagnose
    )
    
    return diagnoses, pop_diagnoses, most_pop_diagnose

In [11]:
pipe = Pipeline([
    ('union', FeatureUnion(
        transformer_list = [
            ('complaints_pipe', Pipeline([
                ('complaint_selector', ItemSelector(key='Жалобы (unigramm)')),
                ('tfidf', TfidfVectorizer(ngram_range=(1,1), min_df=10, stop_words=stopwords.words('russian')))
            ])),
            ('complaints_n_pipe', Pipeline([
                ('complaint_n_selector', ItemSelector(key='Жалобы (ngramm)')),
                ('tfidf', TfidfVectorizer(ngram_range=(1,1), min_df=1, stop_words=stopwords.words('russian')))
            ])),
            ('doctor_pipe', Pipeline([
                ('doctor_selector', ItemSelector(key='Врач')),
                ('doc_pop', DoctorsPopularityTransformator()),
                ('count_vect', CountVectorizer())
            ])),
            ('gender_pipe', Pipeline([
                ('gender_selector', ItemSelector(key='Пол')),
                ('gender_transform', GenderTransformator())
            ])),
            ('age_pipe', Pipeline([
                ('age_selector', ItemSelector(key='Возраст')),
                ('age_transformator', AgeTransformator())
            ])),
            ('topics_pipe', Pipeline([
                ('topic_selector', ItemSelector(key=['topic' + str(i) for i in range(355)])),
                ('topics_transform', TopicsTransformator())
            ])),
            ('clinic_pipe', Pipeline([
                ('clinic_selector', ItemSelector(key='Клиника')),
                ('clinic_transform', ClinicTransformator())
            ])),
            ('repeats_pipe', Pipeline([
                ('repeats_selector', ItemSelector(key='Повторный приём')),
                ('repeats_transform', RepeatsTransformator())
            ]))
        ]
    )),
    ('clf', RandomForestClassifier(n_estimators=1000, n_jobs=-1, max_depth=20))
])

In [12]:
train = utils.load_data('data/train_data_complaints_repeats_doctors.csv')
train_topics = np.load('data/topics_train_ngramm.npy')
train = utils.join_topics(train, train_topics)
train, valid = train_test_split(train, test_size=0.3)

In [13]:
train_y, pop_diagnoses, most_pop_diagnose = preproc_target_train(train, reduce_classes=True)
valid_y, _, _ = preproc_target_test(valid, pop_diagnoses, most_pop_diagnose)



In [14]:
np.unique(train_y).shape

(198,)

In [15]:
pipe.fit(train, train_y)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('complaints_pipe', Pipeline(memory=None,
     steps=[('complaint_selector', ItemSelector(key='Жалобы (unigramm)')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, e..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [16]:
# len(pipe.steps[-2][-1].transformer_list[1][1].steps[1][1].vocabulary_)

In [17]:
pred = pipe.predict(valid)

In [18]:
jaccard_similarity_score(np.array(valid_y), pred)

0.3459904265045985

In [19]:
np.unique(pred, return_counts=True)

(array(['B07', 'B37.3+', 'B97.7', 'D23.9', 'D24', 'E03.8', 'E04.1',
        'E04.2', 'E89.0', 'F45.3', 'G90', 'G90.9', 'G93.4', 'H35.0',
        'H52.1', 'H52.2', 'H65', 'I10', 'I11', 'I25.1', 'I34.1', 'I67.4',
        'I83.9', 'I84.3', 'I84.5', 'J00', 'J01', 'J01.0', 'J01.8', 'J06.9',
        'J18', 'J30', 'J30.1', 'J30.3', 'J31.0', 'J31.1', 'J31.2', 'J34.2',
        'J35.0', 'J35.8', 'K21.0', 'K29.5', 'K29.9', 'K30', 'K86.1',
        'L02.0', 'L02.2', 'L02.4', 'L21', 'L30.8', 'L60.0', 'M15-M19',
        'M41', 'M42', 'M42.1', 'M43.9', 'M50', 'M51', 'M51.3', 'M51.8',
        'M53.1', 'M53.8', 'M53.9', 'M54', 'M54.2', 'M54.4', 'M54.5',
        'M54.6', 'M54.8', 'M54.9', 'M65', 'M71.8', 'M79.1', 'M79.6',
        'M81.0', 'N40', 'N41.1', 'N60.1', 'N72', 'N76.0', 'N76.1',
        'N77.1*', 'N83.0', 'N86', 'N88.0', 'N95.2', 'Z00.0', 'Z00.8',
        'Z01.0', 'Z01.4', 'Z01.8', 'Z04.8', 'Z32.1', 'Z34', 'Z34.0',
        'Z34.8'], dtype=object),
 array([   35,    20,     1,    52,     3,     1

## Submit

In [28]:
train = utils.load_data('data/train_data_complaints_repeats_doctors.csv')
train_topics = np.load('data/topics_train_ngramm.npy')
train = utils.join_topics(train, train_topics)
train_y, pop_diagnoses, most_pop_diagnose = preproc_target_train(train, reduce_classes=True)



In [29]:
pipe.fit(train, train_y)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('complaints_pipe', Pipeline(memory=None,
     steps=[('complaint_selector', ItemSelector(key='Жалобы (unigramm)')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, e..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [30]:
test = utils.load_data('data/test_data_complaints_repeats_doctors.csv')
test_topics = np.load('data/topics_test_ngramm.npy')
test = utils.join_topics(test, test_topics)

In [31]:
test_pred = pipe.predict(test)

In [32]:
submit = pd.DataFrame({'Id_Записи': test['Id_Записи'], 'Код_диагноза': test_pred})

In [33]:
submit['Код_диагноза'].value_counts()

J06.9      21386
M42.1       2128
N76.0       1172
Z32.1        751
K30          605
J35.0        447
J00          395
N60.1        372
I11          333
Z01.8        281
N41.1        237
H52.1        218
J01.0        173
M65          108
M51           92
N77.1*        83
J30.3         79
I83.9         78
Z34.0         75
Z00.0         74
M54.4         73
M54.5         70
B07           66
N76.1         63
D23.9         53
J35.8         44
E04.2         42
M41           35
Z01.0         32
M54.2         31
           ...  
H65            2
J41.0          2
M53.0          2
N40            2
D24            2
J30            2
M15-M19        1
N20.0          1
H10.2          1
N84.0          1
K86.1          1
B97.7          1
L02.0          1
K83.9          1
M12.5          1
M21.4          1
L30.8          1
N86            1
B02            1
I67.4          1
M71.8          1
N95.1          1
I10            1
N70            1
M51.1          1
E66.0          1
J31.0          1
H52.0         

In [34]:
submit.to_csv('submit/bow_rfc_diag_200_compl_uni-n_gram_age_gender_doctor_topics_clinic_repeats_full_train.csv', header=True, index=False)