## Форматирование и обогащение данных
### Форматирование, удобное для изучения и машинного обучения

In [47]:
import numpy  as np
import pandas as pd

In [48]:
# Display settings for data shown in the Jupyter notebook.
pd.set_option('display.max_columns', 70)
pd.set_option('display.max_rows', 100)
# Use the fully qualified option name: on pandas >= 1.4 the bare 'precision'
# pattern also matches 'styler.format.precision' and raises OptionError.
pd.set_option('display.precision', 3)

In [49]:
# Visualization and graphics setup.
# NOTE(review): the bare name `matplotlib` used at the end of this cell is not
# imported explicitly here; presumably `%pylab inline` puts it into the
# interactive namespace - confirm before removing either magic.
%pylab inline
%matplotlib inline
import matplotlib.pyplot as plt
# !conda install seaborn
import seaborn as sns
plt.rcParams['figure.figsize'] = (7,7)   # default figure size; alternative tried: (8,6)

#!pip install ggplot
matplotlib.style.use('ggplot')       # Use ggplot style plots

Populating the interactive namespace from numpy and matplotlib


## Обработка данных
### Первичная очистка
Первичная очистка и форматирование данных сделаны с помощью preprocess_data.R:
<br> [`preprocess_data.R`](https://github.com/AlexanderArtemyev/Wilson-disease/blob/master/preprocess_data.R) в кодировке cp1251. 
<br> [`preprocess_data.utf8.R`](https://github.com/AlexanderArtemyev/Wilson-disease/blob/master/preprocess_data.utf8.R) в кодировке utf-8.

### Чтение очищенных данных

In [50]:
# Directory holding the CSV files. Both names are bound to the same path:
# `data_csv_dir` is used for reading here, `data_dir` for writing/reading later.
data_csv_dir = '../data_csv/'
data_dir = '../data_csv/'

# Load 'Wilson_anonym.csv': ';'-separated, cp1251-encoded.
df = pd.read_csv(data_csv_dir + 'Wilson_anonym.csv', sep=';', encoding='cp1251')
# Preview: the last 3 of the first 5 rows.
df.head(5).tail(3)

Unnamed: 0,Target,FamilyID,TargetRelativeMax,Sex,Height,Mass,BMI,DebutAge,Cirrhosis,ChildPugh,Advanced,Activity,KKF,F2,F5,F7,F13,ITGA2,ITGB3,PAI_1,FGB,MTHFR_677,MTHFR_1298,DebutLiver,DebutNeuro,DebutKidney,DebutEndocr,DebutSibs,DebutVasku,DebutGemAnem,DebutSelez,DebutOther,TargetHead
2,1,3,,1,1.8,70.0,21.605,14,2,2,2,1,1.0,GG,GG,GA,GG,CC,TT,4G4G,GG,CT,AC,1,0,0,0,0,0,0,0,0,0
3,3,4,1.0,2,1.66,50.0,18.145,23,2,2,2,1,1.0,GG,GG,GG,GT,CT,TC,5G4G,GA,CT,AA,1,1,0,0,0,0,0,0,0,1
4,1,4,3.0,1,1.78,69.0,21.778,9,1,0,0,1,0.0,GG,GG,GG,GT,TT,TC,5G4G,GA,CC,AC,1,0,0,0,1,0,0,0,0,0


### Группировка и комбинирование признаков
#### Группы признаков
Группы выделены в соответствии со смыслом данных, <br> логикой их использования и обработки, <br> форматом данных:
- `target`
- `relatives`
- `sex`, `sex_cat`
- `bmi`, `bmi_scaled`
- `symptom`
- `cirrhosis`
    - `childpugh_dummy`
- `activity`
- `debut_age`, `debut_age_scaled`
- `debut_organ`
- `genetic`
- `genetic_dummy`
- `genetic__1`, `genetic__2`
- `genetic_risk__1`, `genetic_risk__2`
- `genetic_risk__1_scaled`, `genetic_risk__2_scaled`
- `data`
- `exclude`, `exclude_model`

<br> Вспомогательные признаки
- `num_to_scale`
- `num_scaled`
- `genetic_risk`
- `genetic_risk_scaled`

In [51]:
target        = ['TargetHead']                                               # TargetHead = ifelse(Target == 3, 1, 0)
relatives     = ['TargetHeadRelativeMax']                                    # Worst diagnosis among relatives: 0, 1, NaN
sex           = ['Sex']                                                      # Sex
bmi           = ['BMI', 'Height', 'Mass']                                    # Body mass index, height, mass
symptom       = ['KKF']                                                      # Symptom: Kayser-Fleischer rings
cirrhosis     = ['Cirrhosis', 'ChildPugh', 'Advanced']                       # Cirrhosis: stage
activity      = ['Activity']                                                 # Rate of progression (activity) of cirrhosis
debut_age     = ['DebutAge']                                                 # Disease onset: age
debut_organ   = ['DebutLiver', 'DebutNeuro', 'DebutKidney', 'DebutEndocr',   # Disease onset: what was found
                          'DebutSibs', 'DebutVasku', 'DebutGemAnem', 'DebutSelez', 'DebutOther']
genetic       = ['F2', 'F5', 'F7', 'F13', 'ITGA2', 'ITGB3',                  # Genetic features
                          'PAI_1', 'FGB', 'MTHFR_677', 'MTHFR_1298']
exclude       = ['Target', 'TargetRelativeMax']                              # Exclude from the data
exclude_model = ['FamilyID', 'DebutNeuro'] + exclude                         # Exclude from the model (superset of `exclude`)

In [52]:
# Ordered registry of (group name, list of column names) pairs.
# A list of pairs (rather than a dict) keeps the insertion order explicit;
# further groups are appended to it as they are created.
features = [
    ('target',          target),
    ('relatives',       relatives),
    ('sex',             sex),
    ('bmi',             bmi),
    ('symptom',         symptom),
    ('cirrhosis',       cirrhosis),
    ('activity',        activity),
    ('debut_age',       debut_age),
    ('debut_organ',     debut_organ),
    ('genetic',         genetic),
    ('exclude',         exclude),
    ('exclude_model',   exclude_model)
]

#### Комбинирование групп признаков: объединение и исключение

In [53]:
def combine_features(combined_features_list,
                     exclude_features):
    """Flatten several feature groups into one list, dropping excluded names.

    combined_features_list -- iterable of feature groups (each an iterable
        of names); the groups are concatenated in the given order
    exclude_features -- container of names to leave out of the result

    Returns a new list. Order is preserved and duplicates are kept.
    """
    return [name
            for group in combined_features_list
            for name in group
            if name not in exclude_features]

### Приведение данных к виду, удобному для выбранного алгоритма машинного обучения или для визуализации
1. Преобразование и масштабирование числовых признаков
2. Формат пола - числовой и категориальный: M, F
3. Обогащение, группировка, масштабирование генетических признаков

#### Преобразование и масштабирование числовых признаков

In [54]:
# Recode the relatives' worst diagnosis into a binary flag:
#   TargetRelativeMax == 1 -> 0   (no complications among relatives)
#   TargetRelativeMax == 3 -> 1   (complications among relatives)
#   anything else          -> NaN (no data about relatives)
relatives_worst = df['TargetRelativeMax']
df['TargetHeadRelativeMax'] = np.where(relatives_worst == 1, 0.0,
                                       np.where(relatives_worst == 3, 1.0, np.nan))
#df.loc[list(df['TargetRelativeMax'] != 3)]

In [55]:
# Centre and rescale the numeric columns as (value - centre) / spread.
# The centres and spreads are fixed constants, not recomputed from the data.
for column, centre, spread in [
    ('BMI',       23,   4),
    ('Height',    1.75, 0.1),
    ('Mass',      70,   15),
    ('DebutAge',  18,   8),
    ('ChildPugh', 1.43, 1.),
]:
    df[column + '_scaled'] = (df[column] - centre) / spread

#### Группы признаков
При выборе признаков хотим объединять их в группы

In [56]:
# Raw numeric columns and the groups naming their '_scaled' counterparts.
num_to_scale     = ['BMI','Height','Mass','DebutAge','ChildPugh']
bmi_scaled       = ['BMI_scaled','Height_scaled','Mass_scaled']
debut_age_scaled = ['DebutAge_scaled']
# All scaled numeric columns: the two groups above plus 'ChildPugh_scaled'.
num_scaled       = bmi_scaled + debut_age_scaled + ['ChildPugh_scaled']
#num_scaled       = ['BMI_scaled','Height_scaled','Mass_scaled','DebutAge_scaled']

In [57]:
# Register the scaling-related groups after the groups already in `features`.
features = features + [
    ('num_to_scale',     num_to_scale),
    ('num_scaled',       num_scaled),
    ('bmi_scaled',       bmi_scaled),
    ('debut_age_scaled', debut_age_scaled),
]

In [58]:
print( df[num_to_scale].describe() )
print( df[num_scaled].describe() )

          BMI  Height     Mass  DebutAge  ChildPugh
count  83.000  81.000   81.000    84.000     84.000
mean   22.898   1.748   70.191    18.000      1.452
std     4.196   0.096   14.952     8.329      0.999
min    16.872   1.580   45.000     2.000      0.000
25%    20.049   1.680   60.000    13.000      1.000
50%    22.052   1.730   69.000    17.000      1.000
75%    24.914   1.800   77.000    22.000      2.000
max    36.332   2.000  120.000    46.000      3.000
       BMI_scaled  Height_scaled  Mass_scaled  DebutAge_scaled  \
count      83.000         81.000       81.000           84.000   
mean       -0.025         -0.016        0.013            0.000   
std         1.049          0.957        0.997            1.041   
min        -1.532         -1.700       -1.667           -2.000   
25%        -0.738         -0.700       -0.667           -0.625   
50%        -0.237         -0.200       -0.067           -0.125   
75%         0.479          0.500        0.467            0.500   
max 

#### Формат пола: M, F

In [59]:
def convert_sex(x):
    """Translate a numeric sex code into a letter.

    Returns 'M' when x equals 1 and 'F' for every other value
    (including missing values).
    """
    return 'M' if x == 1 else 'F'

# Add a letter-coded 'Sex M/F' column derived element-wise from the 'Sex' column.
df['Sex M/F'] = df.Sex.apply(func=convert_sex)

In [60]:
# Group holding the letter-coded sex column; appended to the `features` registry.
sex_cat = ['Sex M/F']

features = features + [('sex_cat', sex_cat)]

### Формирование таблицы `df_ext` для feature engineering

In [61]:
# Column list for the working table: the listed groups concatenated in this
# order, minus the names in `exclude`.
data_features_ext = combine_features(
                       [target, relatives, 
                        sex, sex_cat, 
                        bmi, bmi_scaled,
                        symptom, 
                        cirrhosis, activity,
                        debut_age, debut_age_scaled,
                        debut_organ,
                        genetic ],
                       exclude)

print(data_features_ext)

# `df_ext` starts as the selected columns of `df`; later cells widen it.
df_ext = df[data_features_ext]
df_ext.head(4)

['TargetHead', 'TargetHeadRelativeMax', 'Sex', 'Sex M/F', 'BMI', 'Height', 'Mass', 'BMI_scaled', 'Height_scaled', 'Mass_scaled', 'KKF', 'Cirrhosis', 'ChildPugh', 'Advanced', 'Activity', 'DebutAge', 'DebutAge_scaled', 'DebutLiver', 'DebutNeuro', 'DebutKidney', 'DebutEndocr', 'DebutSibs', 'DebutVasku', 'DebutGemAnem', 'DebutSelez', 'DebutOther', 'F2', 'F5', 'F7', 'F13', 'ITGA2', 'ITGB3', 'PAI_1', 'FGB', 'MTHFR_677', 'MTHFR_1298']


Unnamed: 0,TargetHead,TargetHeadRelativeMax,Sex,Sex M/F,BMI,Height,Mass,BMI_scaled,Height_scaled,Mass_scaled,KKF,Cirrhosis,ChildPugh,Advanced,Activity,DebutAge,DebutAge_scaled,DebutLiver,DebutNeuro,DebutKidney,DebutEndocr,DebutSibs,DebutVasku,DebutGemAnem,DebutSelez,DebutOther,F2,F5,F7,F13,ITGA2,ITGB3,PAI_1,FGB,MTHFR_677,MTHFR_1298
0,0,,1,M,24.22,1.84,82.0,0.305,0.9,0.8,0.0,2,1,1,1,25,0.875,1,0,0,0,0,0,0,0,1,GG,GG,GA,GG,CT,TT,5G5G,GG,TT,AA
1,0,,2,F,19.493,1.71,57.0,-0.877,-0.4,-0.867,1.0,2,3,2,1,19,0.125,0,0,0,0,0,1,0,0,0,GG,GG,GG,GG,CC,TT,5G4G,GA,CC,AC
2,0,,1,M,21.605,1.8,70.0,-0.349,0.5,0.0,1.0,2,2,2,1,14,-0.5,1,0,0,0,0,0,0,0,0,GG,GG,GA,GG,CC,TT,4G4G,GG,CT,AC
3,1,0.0,2,F,18.145,1.66,50.0,-1.214,-0.9,-1.333,1.0,2,2,2,1,23,0.625,1,1,0,0,0,0,0,0,0,GG,GG,GG,GT,CT,TC,5G4G,GA,CT,AA


### Группа `cirrhosis`
#### Dummy encoding для 'ChildPugh' 

In [62]:
# One-hot encode 'ChildPugh': one indicator column per distinct value,
# named 'ChildPugh_<value>'.
df_dummies = pd.get_dummies(
    df[['ChildPugh']],
    columns=['ChildPugh'],
    prefix=None,
    prefix_sep='_',
    dummy_na=False,
    sparse=False,
    drop_first=False,
)

# Append the indicator columns to the working table (column-wise concat).
# https://pandas.pydata.org/pandas-docs/stable/merging.html
df_ext = pd.concat([df_ext, df_dummies], axis=1)

# Register the new column names as their own feature group.
childpugh_dummy = list(df_dummies.columns)
features = features + [('childpugh_dummy', childpugh_dummy)]

print(df_dummies.head(3))

df_ext.head(3)

   ChildPugh_0  ChildPugh_1  ChildPugh_2  ChildPugh_3
0            0            1            0            0
1            0            0            0            1
2            0            0            1            0


Unnamed: 0,TargetHead,TargetHeadRelativeMax,Sex,Sex M/F,BMI,Height,Mass,BMI_scaled,Height_scaled,Mass_scaled,KKF,Cirrhosis,ChildPugh,Advanced,Activity,DebutAge,DebutAge_scaled,DebutLiver,DebutNeuro,DebutKidney,DebutEndocr,DebutSibs,DebutVasku,DebutGemAnem,DebutSelez,DebutOther,F2,F5,F7,F13,ITGA2,ITGB3,PAI_1,FGB,MTHFR_677,MTHFR_1298,ChildPugh_0,ChildPugh_1,ChildPugh_2,ChildPugh_3
0,0,,1,M,24.22,1.84,82.0,0.305,0.9,0.8,0.0,2,1,1,1,25,0.875,1,0,0,0,0,0,0,0,1,GG,GG,GA,GG,CT,TT,5G5G,GG,TT,AA,0,1,0,0
1,0,,2,F,19.493,1.71,57.0,-0.877,-0.4,-0.867,1.0,2,3,2,1,19,0.125,0,0,0,0,0,1,0,0,0,GG,GG,GG,GG,CC,TT,5G4G,GA,CC,AC,0,0,0,1
2,0,,1,M,21.605,1.8,70.0,-0.349,0.5,0.0,1.0,2,2,2,1,14,-0.5,1,0,0,0,0,0,0,0,0,GG,GG,GA,GG,CC,TT,4G4G,GG,CT,AC,0,0,1,0


### Генетические признаки
#### Обогащение, группировка, масштабирование генетических признаков

##### Dummy encoding для генетических признаков.

In [63]:
# One-hot encode every column of the `genetic` group:
# one '<gene>_<genotype>' indicator column per observed genotype.
df_dummies = pd.get_dummies(
    df[genetic],
    columns=genetic,
    prefix=None,
    prefix_sep='_',
    dummy_na=False,
    sparse=False,
    drop_first=False,
)

# Append the indicator columns to the working table (column-wise concat).
# https://pandas.pydata.org/pandas-docs/stable/merging.html
df_ext = pd.concat([df_ext, df_dummies], axis=1)
df_dummies.head(3)

Unnamed: 0,F2_GA,F2_GG,F5_GA,F5_GG,F7_GA,F7_GG,F13_GG,F13_GT,F13_TT,ITGA2_CC,ITGA2_CT,ITGA2_TT,ITGB3_TC,ITGB3_TT,PAI_1_4G4G,PAI_1_5G4G,PAI_1_5G5G,FGB_AA,FGB_GA,FGB_GG,MTHFR_677_CC,MTHFR_677_CT,MTHFR_677_TT,MTHFR_1298_AA,MTHFR_1298_AC,MTHFR_1298_CC
0,0,1,0,1,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,1,0,0
1,0,1,0,1,0,1,1,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0
2,0,1,0,1,1,0,1,0,0,1,0,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0


In [64]:
genetic_dummy = list( df_dummies.columns )

print(genetic)
print(genetic_dummy)

['F2', 'F5', 'F7', 'F13', 'ITGA2', 'ITGB3', 'PAI_1', 'FGB', 'MTHFR_677', 'MTHFR_1298']
['F2_GA', 'F2_GG', 'F5_GA', 'F5_GG', 'F7_GA', 'F7_GG', 'F13_GG', 'F13_GT', 'F13_TT', 'ITGA2_CC', 'ITGA2_CT', 'ITGA2_TT', 'ITGB3_TC', 'ITGB3_TT', 'PAI_1_4G4G', 'PAI_1_5G4G', 'PAI_1_5G5G', 'FGB_AA', 'FGB_GA', 'FGB_GG', 'MTHFR_677_CC', 'MTHFR_677_CT', 'MTHFR_677_TT', 'MTHFR_1298_AA', 'MTHFR_1298_AC', 'MTHFR_1298_CC']


In [65]:
features = features + [('genetic_dummy', genetic_dummy)]

##### Формальное отображение каждого из генетических признаков в массив [0,1,2]
Наиболее частый гомозиготный аллельный вариант: 0,
<br> гетерозиготный аллельный вариант: 1,
<br> редкий гомозиготный аллельный вариант: 2.

In [66]:
# Formal ordinal coding of each gene's genotype:
# most frequent homozygous variant -> 0, heterozygous -> 1, rare homozygous -> 2.
gen_map_1 = [
    {"F2":         {"GG": 0,   "GA": 1,   "AA": 2}},
    {"F5":         {"GG": 0,   "GA": 1,   "AA": 2}},
    {"F7":         {"GG": 0,   "GA": 1,   "AA": 2}},
    {"F13":        {"GG": 0,   "GT": 1,   "TT": 2}},
    {"ITGA2":      {"CC": 0,   "CT": 1,   "TT": 2}},
    {"ITGB3":      {"TT": 0,   "TC": 1,   "CC": 2}},
    {"PAI_1":      {"5G5G": 0, "5G4G": 1, "4G4G": 2}},
    {"FGB":        {"GG": 0,   "GA": 1,   "AA": 2}},
    {"MTHFR_677":  {"CC": 0,   "CT": 1,   "TT": 2}},
    {"MTHFR_1298": {"AA": 0,   "AC": 1,   "CC": 2}},
]

# Build one '<gene>__1' column per gene by looking each genotype up in its table;
# genotypes absent from the table become NaN.
df_1 = pd.DataFrame()
for gene_entry in gen_map_1:
    for gene, codes in gene_entry.items():
        df_1[gene + '__1'] = df[gene].map(codes)

# Append the coded columns to the working table (column-wise concat).
# https://pandas.pydata.org/pandas-docs/stable/merging.html
df_ext = pd.concat([df_ext, df_1], axis=1)

df_1.head(3)

Unnamed: 0,F2__1,F5__1,F7__1,F13__1,ITGA2__1,ITGB3__1,PAI_1__1,FGB__1,MTHFR_677__1,MTHFR_1298__1
0,0,0,1,0,1,0,0,0,2,0
1,0,0,0,0,0,0,1,1,0,1
2,0,0,1,0,0,0,2,0,1,1


In [67]:
genetic__1 = list( df_1.columns )
print(genetic__1)

features = features + [('genetic__1',     genetic__1)]

['F2__1', 'F5__1', 'F7__1', 'F13__1', 'ITGA2__1', 'ITGB3__1', 'PAI_1__1', 'FGB__1', 'MTHFR_677__1', 'MTHFR_1298__1']


##### Отображение каждого из генетических признаков  в массив [0,1,2] <br> с помощью открытых баз знаний медицинских лабораторий

Использованы открытые базы знаний лабораторий: helix.ru, invitro.ru
<br> В базах знаний описана связь гомозиготной комбинации нуклеотидов с увеличением или снижением риска тромбофилии:

Указано, что **гомозиготная** комбинация влияет на риск тромбофилии: $\pm 2$. 
<br> Указано, что **гетерозиготная** комбинация влияет на риск тромбофилии: $\pm 1$. Это не всегда понятно из описания.
<br> Аллельные варианты нуклеотидов (гетерозиготный, гомозиготный), **не связанные** с риском тромбофилии: $0$. 


In [68]:
# Knowledge-base coding of each gene's genotype, following the laboratories'
# descriptions: positive codes raise the risk, negative codes lower it,
# 0 means no stated association.
gen_map_2 = [
    {"F2":         {"GG": 0,   "GA": 1,   "AA": 2}},
    {"F5":         {"GG": 0,   "GA": 1,   "AA": 2}},
    {"F7":         {"GG": 0,   "GA": -1,  "AA": -2}},
    {"F13":        {"GG": 0,   "GT": 0,   "TT": -2}},
    {"ITGA2":      {"CC": 0,   "CT": 0,   "TT": 2}},
    {"ITGB3":      {"TT": 0,   "TC": 1,   "CC": 2}},
    {"PAI_1":      {"5G5G": 0, "5G4G": 0, "4G4G": 2}},
    {"FGB":        {"GG": 0,   "GA": 1,   "AA": 2}},
    {"MTHFR_677":  {"CC": 0,   "CT": 0,   "TT": 2}},
    {"MTHFR_1298": {"AA": 0,   "AC": 1,   "CC": 2}},
]

# Build one '<gene>__2' column per gene by looking each genotype up in its table;
# genotypes absent from the table become NaN.
df_2 = pd.DataFrame()
for gene_entry in gen_map_2:
    for gene, codes in gene_entry.items():
        df_2[gene + '__2'] = df[gene].map(codes)

# Append the coded columns to the working table (column-wise concat).
# https://pandas.pydata.org/pandas-docs/stable/merging.html
df_ext = pd.concat([df_ext, df_2], axis=1)

df_2.head(3)

Unnamed: 0,F2__2,F5__2,F7__2,F13__2,ITGA2__2,ITGB3__2,PAI_1__2,FGB__2,MTHFR_677__2,MTHFR_1298__2
0,0,0,-1,0,0,0,0,0,2,0
1,0,0,0,0,0,0,0,1,0,1
2,0,0,-1,0,0,0,2,0,0,1


In [69]:
genetic__2 = list( df_2.columns )
print(genetic__2)

features = features + [('genetic__2', genetic__2)]

['F2__2', 'F5__2', 'F7__2', 'F13__2', 'ITGA2__2', 'ITGB3__2', 'PAI_1__2', 'FGB__2', 'MTHFR_677__2', 'MTHFR_1298__2']


#### Комбинации генетических признаков.

Группировка известных факторов либо увеличения, либо снижения риска тромбофилии - согласно открытым базам знаний лабораторий.
<br> Суммы по группам. Масштабирование вклада групп. 

In [70]:
# Groups of genetic factors: those raising vs. those lowering thrombophilia risk.
# Each single-key dict maps the name of a new summary column to the coded
# gene columns that are added up to produce it.
genetic_guess = [
    {'GenRisk__1':['F2__1', 'F5__1', 'ITGA2__1', 'ITGB3__1', 'PAI_1__1', 'FGB__1', 'MTHFR_677__1', 'MTHFR_1298__1']},
    {'GenRisk__2':['F2__2', 'F5__2', 'ITGA2__2', 'ITGB3__2', 'PAI_1__2', 'FGB__2', 'MTHFR_677__2', 'MTHFR_1298__2']},
    {'GenProtect__1':['F7__1', 'F13__1']},
    {'GenProtect__2':['F7__2', 'F13__2']} ]

# Row-wise sums per group
df_gg = pd.DataFrame()
for g in genetic_guess:
    #print(g)       #print( g.items() )         #print( k,v )
    [(k,v)] = g.items()                          # unpack the single (column name, source columns) pair
    df_gg[k] = np.array(df_ext[v].sum(axis=1))   # plain ndarray, so the sums are stored by position

# NOTE(review): concat aligns on the index, while `df_gg` was filled positionally;
# this assumes `df_ext` carries the default 0..n-1 index - confirm.
df_ext = pd.concat([df_ext, df_gg], axis=1)   # https://pandas.pydata.org/pandas-docs/stable/merging.html

In [71]:
# Сгруппированные генетические признаки для использования в модели
# Grouped genetic features for use in the model:
# one group per coding scheme (__1, __2) plus the union of both.
genetic_risk__1 = ['GenRisk__1', 'GenProtect__1']
genetic_risk__2 = ['GenRisk__2', 'GenProtect__2']
genetic_risk    = ['GenRisk__1', 'GenRisk__2', 'GenProtect__1', 'GenProtect__2']

features = features + [('genetic_risk__1', genetic_risk__1)]
features = features + [('genetic_risk__2', genetic_risk__2)]
features = features + [('genetic_risk',    genetic_risk)]

In [72]:
df_ext[genetic_risk].head(3)

Unnamed: 0,GenRisk__1,GenRisk__2,GenProtect__1,GenProtect__2
0,3,2,1,-1
1,3,2,0,0
2,4,3,1,-1


#### Масштабируем генетические признаки

In [73]:
print( df_ext[genetic_risk].describe() )

       GenRisk__1  GenRisk__2  GenProtect__1  GenProtect__2
count      84.000      84.000         84.000         84.000
mean        4.226       2.940          0.810         -0.405
std         1.547       1.852          0.736          0.679
min         2.000       0.000          0.000         -3.000
25%         3.000       2.000          0.000         -1.000
50%         4.000       3.000          1.000          0.000
75%         5.000       4.000          1.000          0.000
max         8.000       7.000          3.000          0.000


In [74]:
# Centre and rescale the grouped genetic sums as (value - centre) / spread.
# The centres and spreads are fixed constants, not recomputed from the data.
for column, centre, spread in [
    ('GenRisk__1',    4.23,   1.55),
    ('GenProtect__1', 0.81,   0.74),
    ('GenRisk__2',    2.94,   1.85),
    ('GenProtect__2', -0.405, 0.68),
]:
    df_ext[column + '_scaled'] = (df_ext[column] - centre) / spread

In [75]:
# Groups naming the scaled genetic summary columns:
# one per coding scheme (__1, __2) plus the union of both.
genetic_risk__1_scaled = ['GenRisk__1_scaled', 'GenProtect__1_scaled']
genetic_risk__2_scaled = ['GenRisk__2_scaled', 'GenProtect__2_scaled']
genetic_risk_scaled    = ['GenRisk__1_scaled',    'GenRisk__2_scaled', 
                          'GenProtect__1_scaled', 'GenProtect__2_scaled']

# Remove last 3 elements from the list
# (kept as a manual undo for re-running this cell, which would append the groups again)
# features = features[:-3]
#[0,1,2,3,4,5,6,7,8,9][:-3]

features = features + [('genetic_risk__1_scaled', genetic_risk__1_scaled)]
features = features + [('genetic_risk__2_scaled', genetic_risk__2_scaled)]
features = features + [('genetic_risk_scaled',    genetic_risk_scaled)]

In [76]:
print( df_ext[genetic_risk_scaled].describe() )

       GenRisk__1_scaled  GenRisk__2_scaled  GenProtect__1_scaled  \
count             84.000          8.400e+01             8.400e+01   
mean              -0.002          2.574e-04            -6.435e-04   
std                0.998          1.001e+00             9.945e-01   
min               -1.439         -1.589e+00            -1.095e+00   
25%               -0.794         -5.081e-01            -1.095e+00   
50%               -0.148          3.243e-02             2.568e-01   
75%                0.497          5.730e-01             2.568e-01   
max                2.432          2.195e+00             2.959e+00   

       GenProtect__2_scaled  
count             8.400e+01  
mean              3.501e-04  
std               9.982e-01  
min              -3.816e+00  
25%              -8.750e-01  
50%               5.956e-01  
75%               5.956e-01  
max               5.956e-01  


### Сохраняем обработанные данные и группы колонок
#### Сохраняем данные

In [77]:
# Final column list for export: the listed groups concatenated in this order,
# minus the names in `exclude`.
data_features_ext = combine_features(
                       [target, relatives, 
                        sex, sex_cat, 
                        bmi, bmi_scaled,
                        symptom, 
                        cirrhosis, activity, childpugh_dummy,
                        debut_age, debut_age_scaled,
                        debut_organ,
                        genetic, genetic__1, genetic__2, 
                        genetic_dummy,
                        genetic_risk__1, genetic_risk__2, 
                        genetic_risk__1_scaled, genetic_risk__2_scaled
                       ],
                       exclude)

In [78]:
print(data_features_ext)

['TargetHead', 'TargetHeadRelativeMax', 'Sex', 'Sex M/F', 'BMI', 'Height', 'Mass', 'BMI_scaled', 'Height_scaled', 'Mass_scaled', 'KKF', 'Cirrhosis', 'ChildPugh', 'Advanced', 'Activity', 'ChildPugh_0', 'ChildPugh_1', 'ChildPugh_2', 'ChildPugh_3', 'DebutAge', 'DebutAge_scaled', 'DebutLiver', 'DebutNeuro', 'DebutKidney', 'DebutEndocr', 'DebutSibs', 'DebutVasku', 'DebutGemAnem', 'DebutSelez', 'DebutOther', 'F2', 'F5', 'F7', 'F13', 'ITGA2', 'ITGB3', 'PAI_1', 'FGB', 'MTHFR_677', 'MTHFR_1298', 'F2__1', 'F5__1', 'F7__1', 'F13__1', 'ITGA2__1', 'ITGB3__1', 'PAI_1__1', 'FGB__1', 'MTHFR_677__1', 'MTHFR_1298__1', 'F2__2', 'F5__2', 'F7__2', 'F13__2', 'ITGA2__2', 'ITGB3__2', 'PAI_1__2', 'FGB__2', 'MTHFR_677__2', 'MTHFR_1298__2', 'F2_GA', 'F2_GG', 'F5_GA', 'F5_GG', 'F7_GA', 'F7_GG', 'F13_GG', 'F13_GT', 'F13_TT', 'ITGA2_CC', 'ITGA2_CT', 'ITGA2_TT', 'ITGB3_TC', 'ITGB3_TT', 'PAI_1_4G4G', 'PAI_1_5G4G', 'PAI_1_5G5G', 'FGB_AA', 'FGB_GA', 'FGB_GG', 'MTHFR_677_CC', 'MTHFR_677_CT', 'MTHFR_677_TT', 'MTHFR_1298_

#### Запись данных в файл: `DataFrame.to_csv`
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
<br> https://stackoverflow.com/questions/27370046/closing-file-after-using-to-csv

In [79]:
data_dir

'../data_csv/'

In [80]:
# Write the selected columns to 'Wilson_ext.csv' (';'-separated, no index column).
# The encoding is set on the handle itself: `to_csv(encoding=...)` is not applied
# to an already-open text handle, so the original call wrote in the platform's
# default encoding. `newline=''` stops text mode from translating the line
# terminators pandas emits (avoids '\r\r\n' on Windows).
# The `with` block closes the file even if the write raises.
# `chunksize=1` is dropped: it only affected how many rows are written at a
# time, not the file contents.
with open(data_dir + 'Wilson_ext.csv', 'w', encoding='utf-8', newline='') as outfile:
    df_ext[data_features_ext].to_csv(outfile, sep=';', index=False)

#### Чтение данных из сохранённого файла

In [81]:
df_ext2 = pd.read_csv(data_dir + 'Wilson_ext.csv', sep=';', encoding='utf-8') 
print(df_ext2.columns)

Index(['TargetHead', 'TargetHeadRelativeMax', 'Sex', 'Sex M/F', 'BMI',
       'Height', 'Mass', 'BMI_scaled', 'Height_scaled', 'Mass_scaled', 'KKF',
       'Cirrhosis', 'ChildPugh', 'Advanced', 'Activity', 'ChildPugh_0',
       'ChildPugh_1', 'ChildPugh_2', 'ChildPugh_3', 'DebutAge',
       'DebutAge_scaled', 'DebutLiver', 'DebutNeuro', 'DebutKidney',
       'DebutEndocr', 'DebutSibs', 'DebutVasku', 'DebutGemAnem', 'DebutSelez',
       'DebutOther', 'F2', 'F5', 'F7', 'F13', 'ITGA2', 'ITGB3', 'PAI_1', 'FGB',
       'MTHFR_677', 'MTHFR_1298', 'F2__1', 'F5__1', 'F7__1', 'F13__1',
       'ITGA2__1', 'ITGB3__1', 'PAI_1__1', 'FGB__1', 'MTHFR_677__1',
       'MTHFR_1298__1', 'F2__2', 'F5__2', 'F7__2', 'F13__2', 'ITGA2__2',
       'ITGB3__2', 'PAI_1__2', 'FGB__2', 'MTHFR_677__2', 'MTHFR_1298__2',
       'F2_GA', 'F2_GG', 'F5_GA', 'F5_GG', 'F7_GA', 'F7_GG', 'F13_GG',
       'F13_GT', 'F13_TT', 'ITGA2_CC', 'ITGA2_CT', 'ITGA2_TT', 'ITGB3_TC',
       'ITGB3_TT', 'PAI_1_4G4G', 'PAI_1_5G4G', 'PAI_1_

#### Сохраняем группы колонок
Сохраняем в формате `json`. Библиотека `simplejson`. 

Адаптировал к Python 3 утилиты `data_to_json` и `json_to_data`  из статьи [Serializing Python Data To Json - Some Edge Cases](http://robotfantastic.org/serializing-python-data-to-json-some-edge-cases.html) by Chris Wagner.

In [82]:
# !conda install simplejson
%run data_to_json

In [83]:
from collections import OrderedDict

In [84]:
json_to_data(data_to_json(OrderedDict(features)))['bmi']  # ['target']

['BMI', 'Height', 'Mass']

In [85]:
# Serialize the feature groups (as an OrderedDict, to keep registration order)
# to JSON. Opening inside the `with` statement ties the handle's lifetime to the
# block; the explicit close() that followed the original `with` was redundant.
with open(data_dir + 'features_file.json', 'w') as features_file:
    features_file.write(data_to_json(OrderedDict(features)))

In [86]:
# json_format = json.dumps(features)
# json.loads(json_format, object_pairs_hook=OrderedDict)

##### Прочтём и проверим, что получилось

In [87]:
# Read the saved feature groups back. The `with` block closes the file even if
# `json_to_data` raises; the original open()/close() pair leaked the handle then.
with open(data_dir + 'features_file.json', 'r') as features_file:
    features_loaded = json_to_data(features_file.read())

print(features_loaded)
type(features_loaded)

OrderedDict([('target', ['TargetHead']), ('relatives', ['TargetHeadRelativeMax']), ('sex', ['Sex']), ('bmi', ['BMI', 'Height', 'Mass']), ('symptom', ['KKF']), ('cirrhosis', ['Cirrhosis', 'ChildPugh', 'Advanced']), ('activity', ['Activity']), ('debut_age', ['DebutAge']), ('debut_organ', ['DebutLiver', 'DebutNeuro', 'DebutKidney', 'DebutEndocr', 'DebutSibs', 'DebutVasku', 'DebutGemAnem', 'DebutSelez', 'DebutOther']), ('genetic', ['F2', 'F5', 'F7', 'F13', 'ITGA2', 'ITGB3', 'PAI_1', 'FGB', 'MTHFR_677', 'MTHFR_1298']), ('exclude', ['Target', 'TargetRelativeMax']), ('exclude_model', ['FamilyID', 'DebutNeuro', 'Target', 'TargetRelativeMax']), ('num_to_scale', ['BMI', 'Height', 'Mass', 'DebutAge', 'ChildPugh']), ('num_scaled', ['BMI_scaled', 'Height_scaled', 'Mass_scaled', 'DebutAge_scaled', 'ChildPugh_scaled']), ('bmi_scaled', ['BMI_scaled', 'Height_scaled', 'Mass_scaled']), ('debut_age_scaled', ['DebutAge_scaled']), ('sex_cat', ['Sex M/F']), ('childpugh_dummy', ['ChildPugh_0', 'ChildPugh_1',

collections.OrderedDict

In [88]:
features_loaded["target"]

['TargetHead']

In [89]:
for (k,v) in features_loaded.items():
    print(k + ": ", v)

target:  ['TargetHead']
relatives:  ['TargetHeadRelativeMax']
sex:  ['Sex']
bmi:  ['BMI', 'Height', 'Mass']
symptom:  ['KKF']
cirrhosis:  ['Cirrhosis', 'ChildPugh', 'Advanced']
activity:  ['Activity']
debut_age:  ['DebutAge']
debut_organ:  ['DebutLiver', 'DebutNeuro', 'DebutKidney', 'DebutEndocr', 'DebutSibs', 'DebutVasku', 'DebutGemAnem', 'DebutSelez', 'DebutOther']
genetic:  ['F2', 'F5', 'F7', 'F13', 'ITGA2', 'ITGB3', 'PAI_1', 'FGB', 'MTHFR_677', 'MTHFR_1298']
exclude:  ['Target', 'TargetRelativeMax']
exclude_model:  ['FamilyID', 'DebutNeuro', 'Target', 'TargetRelativeMax']
num_to_scale:  ['BMI', 'Height', 'Mass', 'DebutAge', 'ChildPugh']
num_scaled:  ['BMI_scaled', 'Height_scaled', 'Mass_scaled', 'DebutAge_scaled', 'ChildPugh_scaled']
bmi_scaled:  ['BMI_scaled', 'Height_scaled', 'Mass_scaled']
debut_age_scaled:  ['DebutAge_scaled']
sex_cat:  ['Sex M/F']
childpugh_dummy:  ['ChildPugh_0', 'ChildPugh_1', 'ChildPugh_2', 'ChildPugh_3']
genetic_dummy:  ['F2_GA', 'F2_GG', 'F5_GA', 'F5_GG',