In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats
matplotlib.style.use('ggplot')
import os
%matplotlib inline

os.chdir("E:\\work\\data-science\\epam-course\\classification1")
df = pd.read_excel("dataset_57_hypothyroid.xlsx")
df = df.replace('?', np.nan)
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class
0,41.0,F,f,f,f,f,f,f,f,f,...,t,125.0,t,1.14,t,109.0,f,,SVHC,negative
1,23.0,F,f,f,f,f,f,f,f,f,...,t,102.0,f,,f,,f,,other,negative
2,46.0,M,f,f,f,f,f,f,f,f,...,t,109.0,t,0.91,t,120.0,f,,other,negative
3,70.0,F,t,f,f,f,f,f,f,f,...,t,175.0,f,,f,,f,,other,negative
4,70.0,F,f,f,f,f,f,f,f,f,...,t,61.0,t,0.87,t,70.0,f,,SVI,negative


Проведем кодирование категориальных данных:

1. f/t заменим на 0/1
2. для referral_source, Class, sex будем использовать One Hot Encoding 


In [2]:
C = np.logspace(0, 4, 10) # 1 / lambda
print(C)

boolean = ["on_thyroxine", "query_on_thyroxine"
           , "on_antithyroid_medication"
           , "sick", "pregnant", "thyroid_surgery"
           , "I131_treatment", "query_hypothyroid"
           , "query_hyperthyroid", "lithium", "goitre"
           , "tumor", "hypopituitary", "psych", "TSH_measured"
           , "T3_measured", "TT4_measured", "T4U_measured"
           , "FTI_measured", "TBG_measured"]

for col in boolean:
    df[col] = df[col].replace("f", 0)
    df[col] = df[col].replace("t", 1)
df.head()

[1.00000000e+00 2.78255940e+00 7.74263683e+00 2.15443469e+01
 5.99484250e+01 1.66810054e+02 4.64158883e+02 1.29154967e+03
 3.59381366e+03 1.00000000e+04]


Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class
0,41.0,F,0,0,0,0,0,0,0,0,...,1,125.0,1,1.14,1,109.0,0,,SVHC,negative
1,23.0,F,0,0,0,0,0,0,0,0,...,1,102.0,0,,0,,0,,other,negative
2,46.0,M,0,0,0,0,0,0,0,0,...,1,109.0,1,0.91,1,120.0,0,,other,negative
3,70.0,F,1,0,0,0,0,0,0,0,...,1,175.0,0,,0,,0,,other,negative
4,70.0,F,0,0,0,0,0,0,0,0,...,1,61.0,1,0.87,1,70.0,0,,SVI,negative


### Посмотрим на пропуски

In [3]:
df.info()
df.apply(lambda x: sum(x.isnull()), axis=0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 30 columns):
age                          3771 non-null float64
sex                          3622 non-null object
on_thyroxine                 3772 non-null int64
query_on_thyroxine           3772 non-null int64
on_antithyroid_medication    3772 non-null int64
sick                         3772 non-null int64
pregnant                     3772 non-null int64
thyroid_surgery              3772 non-null int64
I131_treatment               3772 non-null int64
query_hypothyroid            3772 non-null int64
query_hyperthyroid           3772 non-null int64
lithium                      3772 non-null int64
goitre                       3772 non-null int64
tumor                        3772 non-null int64
hypopituitary                3772 non-null int64
psych                        3772 non-null int64
TSH_measured                 3772 non-null int64
TSH                          3403 non-null float64
T3_mea

age                             1
sex                           150
on_thyroxine                    0
query_on_thyroxine              0
on_antithyroid_medication       0
sick                            0
pregnant                        0
thyroid_surgery                 0
I131_treatment                  0
query_hypothyroid               0
query_hyperthyroid              0
lithium                         0
goitre                          0
tumor                           0
hypopituitary                   0
psych                           0
TSH_measured                    0
TSH                           369
T3_measured                     0
T3                            769
TT4_measured                    0
TT4                           231
T4U_measured                    0
T4U                           387
FTI_measured                    0
FTI                           385
TBG_measured                    0
TBG                          3772
referral_source                 0
Class         

Колонку TBG можно смело удалять

In [4]:
del df['TBG']

In [5]:
def missed_ids(data):
    ids = set()
    for i in range(0, len(data)):
        if pd.isnull(data[i]):
            ids.add(i)
    return ids

def common_missings(df, col_names):
    for col1 in col_names:
        ids1 = missed_ids(df[col1])
        for col2 in col_names:
            if (col1 == col2):
                continue
            ids2 = missed_ids(df[col2])
            print(col1 + ", " + col2 + ": " + str(len(ids1.intersection(ids2))))
        
# with_missings = ['age', 'sex', 'TSH', 'T3', 'TT4', 'FTI']
# common_missings(df, with_missings)

Посмотрим на распределение в колонках с пропусками, и произведем вставку пропущенных значений

In [6]:
from scipy.stats import shapiro

with_missings = ['age', 'sex', 'TSH', 'T3', 'TT4', 'FTI', 'T4U']
for col in with_missings:
    try:
        stat, p = shapiro(df[col])
        print(col + ' p=%.3f' % (p))
    except TypeError:
        continue

age p=1.000
TSH p=1.000
T3 p=1.000
TT4 p=1.000
FTI p=1.000
T4U p=1.000


Данные с пропусками имеют нормальное распределение. Заменим пропуски средним значением

In [7]:
for col in with_missings:
    try:
        df[col] = df[col].replace(np.nan, np.mean(df[col]))
    except TypeError:
        continue

In [8]:
df.apply(lambda x: sum(x.isnull()), axis=0)

age                            0
sex                          150
on_thyroxine                   0
query_on_thyroxine             0
on_antithyroid_medication      0
sick                           0
pregnant                       0
thyroid_surgery                0
I131_treatment                 0
query_hypothyroid              0
query_hyperthyroid             0
lithium                        0
goitre                         0
tumor                          0
hypopituitary                  0
psych                          0
TSH_measured                   0
TSH                            0
T3_measured                    0
T3                             0
TT4_measured                   0
TT4                            0
T4U_measured                   0
T4U                            0
FTI_measured                   0
FTI                            0
TBG_measured                   0
referral_source                0
Class                          0
dtype: int64

150 пропусков для колонки sex. Это порядка 4% данных. Удалим эти данные

In [9]:
df_raw = df.copy()
df = df.dropna(how='any')

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer

class OheTransform(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self._column_transformer = make_column_transformer(
            (OneHotEncoder(), cols)
        )
        self._cols = cols
        
    def transform(self, X, *_):
        tmp = pd.DataFrame(self._column_transformer.fit_transform(X).toarray()
                           , columns = self._column_transformer.get_feature_names())
        return pd.concat([X.drop(self._cols, axis=1), tmp], axis=1)

    def fit(self, X, *_):
        return self._column_transformer.fit(X)

class StandardScalerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, df, cols):
        self._column_transformer = make_column_transformer(
            (StandardScaler(), cols)
        )
        self._cols = cols
        self._df = df
        
    def transform(self, X, *_):
        tmp = pd.DataFrame(self._column_transformer.fit_transform(X)
                           , columns = self._cols)

        for c in tmp:
            X[c] = tmp[c]
        
        return X
    
    def fit(self, *_):
        return self._column_transformer.fit(self._df)
    
    
def ohe_transormf(df, cols):
    column_trans = make_column_transformer(
        (OneHotEncoder(), columns2encode)
    )
    column_trans.fit(df)
    column_trans.get_feature_names()
    df_trans = pd.DataFrame(column_trans.fit_transform(df).toarray(), columns = column_trans.get_feature_names())
    return df_trans

df = df_raw.copy()
df = df.dropna(how='any')

cat_columns = list(df.select_dtypes(include=['category','object']))
num_columns = list(df.select_dtypes(include=['float64']))

#pipe = make_pipeline(
#    OheTransform(cat_columns)
    #, StandardScalerTransform(df, num_columns)
#    , LogisticRegression()
#)
#train_test_split(X, y, test_size=0.33, random_state=1)
#preprocessor = ColumnTransformer(
#    transformers=[
#        ('num', StandardScalerTransform(df, num_columns), num_columns),
#        ('cat', OheTransform(cat_columns), cat_columns)])
#pipe = Pipeline(steps=[('preprocessor', preprocessor)])

#pipe = Pipeline(steps=[('preprocessor', preprocessor)
#                       , ('classifier', LogisticRegression())])

X = df.drop('sick', axis='columns')
y = df['sick']

continuous_transformer = Pipeline(steps = [('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps = [('lab_enc', OneHotEncoder(handle_unknown='ignore'))])

continuous_features = list(X.select_dtypes(include=['float64']))
categorical_features = list(X.select_dtypes(include=['object']))

preprocessor = ColumnTransformer(
   transformers=[
       ('cont', continuous_transformer, continuous_features),
       ('cat', categorical_transformer, categorical_features)])

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE

logistic = LogisticRegression(solver='lbfgs', max_iter=10000)
selector = RFE(logistic)

penalty = [# 'l1' # Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
            'l2'
          #  , 'none' # UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
          # , 'elasticnet' # Only 'saga' solver supports elasticnet penalty, got solver=liblinear.
          ]
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

pipe = Pipeline(steps=[('preprocessor', preprocessor)
                       , ('RFE', selector)
                       , ('logistic', logistic)])
param_grid = {
    'logistic__penalty': penalty
    , 'logistic__C': C
}

clf = GridSearchCV(pipe, param_grid, cv=5, verbose=0, scoring='accuracy')
clf.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cont',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('scaler',
                                                                                          StandardScaler(copy=True,
                                                                                                         with_mean=True,
                       

In [14]:
print(clf.best_score_)
#print(clf.cv_results_)
#print(clf.cv_results_)
print(clf.best_estimator_['logistic'].coef_)
print(clf.best_estimator_['logistic'].intercept_)

coefs = clf.best_estimator_['logistic'].coef_[0]
idx = []
for i in range(0, len(coefs)):
    c = coefs[i]
    if (c < -1) or (c > 1):
        idx.append(i)

print(X.columns[idx])
#print(clf.cv_results_)
#clf.best_estimator_.
#clf.best_estimator_.grid_scores_
#clf.best_estimator_.ranking_

0.9618995030369961
[[-0.84436265 -0.45332509 -2.51380408 -2.71758709  3.21122436 -0.55890961
   0.53585041 -2.06337741]]
[-3.07329454]
Index(['on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_medication',
       'I131_treatment'],
      dtype='object')


Построена довольно точная моель логистической регрессии, по которой наиболее важные характеристики - это
'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_medication', 'I131_treatment'

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE

k_range = list(range(1, 10))
param_grid_knn = {
    'knn__n_neighbors': k_range
}

knn = KNeighborsClassifier()
pipe_knn = Pipeline(steps=[('preprocessor', preprocessor)
                       , ('knn', knn)])
#param_grid = {
#    'logistic__penalty': penalty
#    , 'logistic__C': C
#}

clf_knn = GridSearchCV(pipe_knn, param_grid_knn, cv=5, verbose=0, scoring='accuracy')
clf_knn.fit(X, y)
#clf = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', verbose=0)
#clf.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cont',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('scaler',
                                                                                          StandardScaler(copy=True,
                                                                                                         with_mean=True,
                       

In [16]:
best = clf_knn.best_score_
print(best)

0.9607951408061844


In [17]:
important_cols = []
for c in list(X.columns[idx]):
    X_ = X.copy()
    del X_[c]
    clf_knn.fit(X_, y)
    if (clf_knn.best_score_ < best):
        important_cols.append(c)
    else:
        print("Del " + c + ": " + str(clf_knn.best_score_))
    
    
print(important_cols)
print(list(X.columns[idx]))

Del on_thyroxine: 0.9607951408061844
Del query_on_thyroxine: 0.9607951408061844
Del on_antithyroid_medication: 0.9607951408061844
Del I131_treatment: 0.9607951408061844
[]
['on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_medication', 'I131_treatment']


In [60]:
X_ = X.copy()
for c in list(X.columns[idx]):
    del X_[c]
    
clf_knn.fit(X_, y)
print(clf_knn.best_score_)

0.9607951408061844


Модель knn дает результат, на одну тысячную хуже, чем логистическая регрессия по метрике accuracy. Причем характеристики с наибольшим весов для логистической регрессии не влияют на результат knn 