## Форматирование и обогащение данных
### Форматирование, удобное для изучения и машинного обучения
#### Загрузка библиотек

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Display settings for DataFrames in the Jupyter notebook
pd.set_option('display.max_columns', 70)
pd.set_option('display.max_rows', 100)
# Use the fully qualified option name: the bare pattern 'precision' matches more than
# one option in newer pandas releases (e.g. 'styler.format.precision') and then raises
# OptionError, while 'display.precision' is unambiguous in every version.
pd.set_option('display.precision', 3)

In [3]:
# Visualization and Graphics
# NOTE(review): %pylab inline star-imports numpy and matplotlib names into the notebook
# namespace (see the "Populating the interactive namespace" output below); the bare
# `matplotlib` name used at the end of this cell is presumably provided by it --
# confirm before removing this magic.
%pylab inline
%matplotlib inline
import matplotlib.pyplot as plt
# !conda install seaborn 
import seaborn as sns
plt.rcParams['figure.figsize'] = (7,7)   # (8,6)

#!pip install ggplot
matplotlib.style.use('ggplot')       # Use ggplot style plots

Populating the interactive namespace from numpy and matplotlib


## Обработка данных
### Первичная очистка
Первичная очистка и форматирование данных сделаны с помощью preprocess_data.R:
<br> [`preprocess_data.R`](https://github.com/AlexanderArtemyev/Wilson-disease/blob/master/preprocess_data.R) в кодировке cp1251. 
<br> [`preprocess_data.utf8.R`](https://github.com/AlexanderArtemyev/Wilson-disease/blob/master/preprocess_data.utf8.R) в кодировке utf-8.

### Чтение очищенных данных

In [4]:
# Load the cleaned data set: ';'-separated CSV stored in cp1251 encoding
df = pd.read_csv('../Wilson_2/Wilson_anonym.csv', sep=';', encoding='cp1251')
# Preview: take the first five rows, then show the last four of them
df.head(5).tail(4)

Unnamed: 0,TargetHead,FamilyID,TargetHeadRelativeMax,Sex,Height,Mass,BMI,DebutAge,Cirrhosis,ChildPugh,Advanced,Activity,KKF,F2,F5,F7,F13,ITGA2,ITGB3,PAI_1,FGB,MTHFR_677,MTHFR_1298,DebutLiver,DebutNeuro,DebutKidney,DebutEndocr,DebutSibs,DebutVasku,DebutGemAnem,DebutSelez,DebutOther
1,0,2,,2,1.71,57.0,19.493,19,2,3,2,1,1.0,GG,GG,GG,GG,CC,TT,5G4G,GA,CC,AC,0,0,0,0,0,1,0,0,0
2,0,3,,1,1.8,70.0,21.605,14,2,2,2,1,1.0,GG,GG,GA,GG,CC,TT,4G4G,GG,CT,AC,1,0,0,0,0,0,0,0,0
3,1,4,0.0,2,1.66,50.0,18.145,23,2,2,2,1,1.0,GG,GG,GG,GT,CT,TC,5G4G,GA,CT,AA,1,1,0,0,0,0,0,0,0
4,0,4,1.0,1,1.78,69.0,21.778,9,1,0,0,1,0.0,GG,GG,GG,GT,TT,TC,5G4G,GA,CC,AC,1,0,0,0,1,0,0,0,0


### Группировка и комбинирование признаков


#### Группы признаков
Группы выделены в соответствии со смыслом данных, форматом их представления, <br> планируемой логикой их обработки и использования.
- `target_features`, 
- `relatives_features`, 
- `sex_features`, `sex_features_cat`, 
- `bmi_features`, `bmi_features_scaled`,
- `symptom_features`, `cirrhosis_features`, 
- `debut_age_features`, `debut_age_features_scaled`,
- `debut_organ_features`,
- `genetic_features`, `genetic_features__1`, `genetic_features__2`, `genetic_features__dummy`, `genetic_risk_features`
- `exclude_features`

In [5]:
target_features        = ['TargetHead']                                                    # TargetHead = ifelse(Target == 3, 1, 0)
relatives_features     = ['TargetHeadRelativeMax']                                         # Worst diagnosis among relatives
sex_features           = ['Sex']                                                           # Sex
bmi_features           = ['BMI', 'Height', 'Mass']                                         # Body mass index, height, mass
symptom_features       = ['KKF']                                                           # Symptom: Kayser-Fleischer rings
cirrhosis_features     = ['Cirrhosis', 'ChildPugh', 'Advanced', 'Activity']                # Cirrhosis: stage. Activity (dynamics).
debut_age_features     = ['DebutAge']                                                      # Disease onset: age
debut_organ_features   = ['DebutLiver', 'DebutNeuro', 'DebutKidney', 'DebutEndocr',        # Disease onset: what was detected
                          'DebutSibs', 'DebutVasku', 'DebutGemAnem', 'DebutSelez', 'DebutOther']
genetic_features       = ['F2', 'F5', 'F7', 'F13', 'ITGA2', 'ITGB3',                       # Genetic features
                          'PAI_1', 'FGB', 'MTHFR_677', 'MTHFR_1298']
exclude_features       = ['Target', 'TargetRelativeMax']                                   # Exclude from the data
exclude_model_features = ['FamilyID', 'DebutNeuro'] + exclude_features                     # Exclude from the model

#### Комбинирование признаков: объединение и исключение групп

In [6]:
def combine_features(all_feat_list, exclude_feat):
    """Flatten several feature groups into one list, dropping excluded names.

    Args:
        all_feat_list: iterable of feature-name lists (the feature groups).
        exclude_feat: collection of feature names to leave out of the result;
            ``None`` is treated as "exclude nothing".

    Returns:
        A new list with the names of all groups in their original order
        (duplicates are kept), without any name found in ``exclude_feat``.
    """
    # Filter against the *argument*: the previous version read the global
    # ``exclude_features`` here, so the ``exclude_feat`` parameter was ignored.
    excluded = exclude_feat if exclude_feat is not None else ()
    return [item
            for sublist in all_feat_list
            for item in sublist
            if item not in excluded]

# Base feature list: all feature groups flattened into one list of column names
data_features = combine_features([ target_features, relatives_features, sex_features, bmi_features, symptom_features, 
                                   cirrhosis_features, debut_age_features, debut_organ_features, genetic_features ],
                                 exclude_features)

print(data_features)

['TargetHead', 'TargetHeadRelativeMax', 'Sex', 'BMI', 'Height', 'Mass', 'KKF', 'Cirrhosis', 'ChildPugh', 'Advanced', 'Activity', 'DebutAge', 'DebutLiver', 'DebutNeuro', 'DebutKidney', 'DebutEndocr', 'DebutSibs', 'DebutVasku', 'DebutGemAnem', 'DebutSelez', 'DebutOther', 'F2', 'F5', 'F7', 'F13', 'ITGA2', 'ITGB3', 'PAI_1', 'FGB', 'MTHFR_677', 'MTHFR_1298']


### Приведение данных к виду, удобному для изучения и машинного обучения
1. Масштабирование числовых признаков.
2. Формат пола.
3. Преобразование генетических признаков.

#### Масштабирование числовых признаков

In [7]:
# Numeric columns to rescale; every scaled copy is named '<column>_scaled'
num_features_to_scale     = ['BMI', 'Height', 'Mass', 'DebutAge']
num_features_scaled       = [name + '_scaled' for name in num_features_to_scale]
# Scaled counterparts of the body-measurement group and of the onset-age group
bmi_features_scaled       = [name + '_scaled' for name in ('BMI', 'Height', 'Mass')]
debut_age_features_scaled = [name + '_scaled' for name in ('DebutAge',)]

In [8]:
# Shift and rescale each numeric column with fixed constants: (value - center) / scale.
# The constants are round numbers close to the sample mean / std printed by describe() below.
for column, (center, scale) in [('BMI',      (23,   4)),
                                ('Height',   (1.75, 0.1)),
                                ('Mass',     (70,   15)),
                                ('DebutAge', (18,   8))]:
    df[column + '_scaled'] = (df[column] - center) / scale

In [9]:
# Summary statistics of the raw numeric columns, then of their scaled counterparts
print( df[num_features_to_scale].describe() )
print( df[num_features_scaled].describe() )

          BMI  Height     Mass  DebutAge
count  83.000  81.000   81.000    84.000
mean   22.898   1.748   70.191    18.000
std     4.196   0.096   14.952     8.329
min    16.872   1.580   45.000     2.000
25%    20.049   1.680   60.000    13.000
50%    22.052   1.730   69.000    17.000
75%    24.914   1.800   77.000    22.000
max    36.332   2.000  120.000    46.000
       BMI_scaled  Height_scaled  Mass_scaled  DebutAge_scaled
count      83.000         81.000       81.000           84.000
mean       -0.025         -0.016        0.013            0.000
std         1.049          0.957        0.997            1.041
min        -1.532         -1.700       -1.667           -2.000
25%        -0.738         -0.700       -0.667           -0.625
50%        -0.237         -0.200       -0.067           -0.125
75%         0.479          0.500        0.467            0.500
max         3.333          2.500        3.333            3.500


#### Формат пола: M, F

In [10]:
def convert_sex(x):
    """Return the letter code for a numeric sex value.

    ``1`` maps to ``'M'``; every other value (including a missing one)
    maps to ``'F'``.
    """
    return 'M' if x == 1 else 'F'

# Add a letter-coded copy of the Sex column, converted value by value
df['Sex M/F'] = df.Sex.apply(func=convert_sex)

# Feature group containing the letter-coded sex column
sex_features_cat = ['Sex M/F']

#### Преобразование генетических признаков
##### Инициализация

In [11]:
# Extended feature list: the base groups plus the letter-coded sex column
# and the scaled numeric columns
data_features_ext = combine_features(
                       [target_features, relatives_features, 
                        sex_features, sex_features_cat, 
                        bmi_features, bmi_features_scaled,
                        symptom_features, 
                        cirrhosis_features, 
                        debut_age_features, debut_age_features_scaled,
                        debut_organ_features,
                        genetic_features ],
                       exclude_features)

# Working frame restricted to the extended feature list; later cells append columns to it
df_ext = df[data_features_ext]
df_ext.head(4)

Unnamed: 0,TargetHead,TargetHeadRelativeMax,Sex,Sex M/F,BMI,Height,Mass,BMI_scaled,Height_scaled,Mass_scaled,KKF,Cirrhosis,ChildPugh,Advanced,Activity,DebutAge,DebutAge_scaled,DebutLiver,DebutNeuro,DebutKidney,DebutEndocr,DebutSibs,DebutVasku,DebutGemAnem,DebutSelez,DebutOther,F2,F5,F7,F13,ITGA2,ITGB3,PAI_1,FGB,MTHFR_677,MTHFR_1298
0,0,,1,M,24.22,1.84,82.0,0.305,0.9,0.8,0.0,2,1,1,1,25,0.875,1,0,0,0,0,0,0,0,1,GG,GG,GA,GG,CT,TT,5G5G,GG,TT,AA
1,0,,2,F,19.493,1.71,57.0,-0.877,-0.4,-0.867,1.0,2,3,2,1,19,0.125,0,0,0,0,0,1,0,0,0,GG,GG,GG,GG,CC,TT,5G4G,GA,CC,AC
2,0,,1,M,21.605,1.8,70.0,-0.349,0.5,0.0,1.0,2,2,2,1,14,-0.5,1,0,0,0,0,0,0,0,0,GG,GG,GA,GG,CC,TT,4G4G,GG,CT,AC
3,1,0.0,2,F,18.145,1.66,50.0,-1.214,-0.9,-1.333,1.0,2,2,2,1,23,0.625,1,1,0,0,0,0,0,0,0,GG,GG,GG,GT,CT,TC,5G4G,GA,CT,AA


##### Dummy encoding для генетических признаков.

In [12]:
# One-hot (dummy) encode every genetic feature: one 0/1 column per observed genotype,
# named '<feature>_<genotype>'.
# dtype is pinned explicitly because pandas >= 2.0 defaults to bool, which would turn
# the 0/1 values (as displayed below and written to the CSV) into True/False.
df_dummies = pd.get_dummies(df[genetic_features], columns=genetic_features,
                            prefix_sep='_', dummy_na=False, drop_first=False,
                            dtype=np.uint8)

genetic_features__dummy = list( df_dummies.columns )
print(genetic_features)
print(genetic_features__dummy)

# Append the dummy columns to the extended frame column-wise
df_ext = pd.concat([df_ext, df_dummies], axis = 1)   # https://pandas.pydata.org/pandas-docs/stable/merging.html

df_dummies.head()

['F2', 'F5', 'F7', 'F13', 'ITGA2', 'ITGB3', 'PAI_1', 'FGB', 'MTHFR_677', 'MTHFR_1298']
['F2_GA', 'F2_GG', 'F5_GA', 'F5_GG', 'F7_GA', 'F7_GG', 'F13_GG', 'F13_GT', 'F13_TT', 'ITGA2_CC', 'ITGA2_CT', 'ITGA2_TT', 'ITGB3_TC', 'ITGB3_TT', 'PAI_1_4G4G', 'PAI_1_5G4G', 'PAI_1_5G5G', 'FGB_AA', 'FGB_GA', 'FGB_GG', 'MTHFR_677_CC', 'MTHFR_677_CT', 'MTHFR_677_TT', 'MTHFR_1298_AA', 'MTHFR_1298_AC', 'MTHFR_1298_CC']


Unnamed: 0,F2_GA,F2_GG,F5_GA,F5_GG,F7_GA,F7_GG,F13_GG,F13_GT,F13_TT,ITGA2_CC,ITGA2_CT,ITGA2_TT,ITGB3_TC,ITGB3_TT,PAI_1_4G4G,PAI_1_5G4G,PAI_1_5G5G,FGB_AA,FGB_GA,FGB_GG,MTHFR_677_CC,MTHFR_677_CT,MTHFR_677_TT,MTHFR_1298_AA,MTHFR_1298_AC,MTHFR_1298_CC
0,0,1,0,1,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,1,0,0
1,0,1,0,1,0,1,1,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0
2,0,1,0,1,1,0,1,0,0,1,0,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0
3,0,1,0,1,0,1,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0
4,0,1,0,1,0,1,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1,0


##### Формальное отображение каждого из генетических признаков в массив [0,1,2]

In [13]:
# Per-gene lookup tables: each genotype string is assigned 0, 1 or 2.
# Every list entry is a single-key dict {gene: {genotype: code}}.
gen_map_1 = [
    {"F2":         {"GG": 0, "GA": 1, "AA": 2}},
    {"F5":         {"GG": 0, "GA": 1, "AA": 2}},
    {"F7":         {"GG": 0, "GA": 1, "AA": 2}},
    {"F13":        {"GG": 0, "GT": 1, "TT": 2}},
    {"ITGA2":      {"CC": 0, "CT": 1, "TT": 2}},
    {"ITGB3":      {"TT": 0, "TC": 1, "CC": 2}},
    {"PAI_1":      {"5G5G": 0, "5G4G": 1, "4G4G": 2}},
    {"FGB":        {"GG": 0, "GA": 1, "AA": 2}},
    {"MTHFR_677":  {"CC": 0, "CT": 1, "TT": 2}},
    {"MTHFR_1298": {"AA": 0, "AC": 1, "CC": 2}},
]

# Build one '<gene>__1' column per gene by translating its genotype strings
df_1 = pd.DataFrame()
for gene_map in gen_map_1:
    (gene, codes), = gene_map.items()   # exactly one gene per entry
    df_1[gene + '__1'] = df[gene].map(codes)

genetic_features__1 = list(df_1.columns)
print(genetic_features__1)

# Append the encoded columns to the extended frame column-wise
df_ext = pd.concat([df_ext, df_1], axis=1)   # https://pandas.pydata.org/pandas-docs/stable/merging.html

df_1.head(3)

['F2__1', 'F5__1', 'F7__1', 'F13__1', 'ITGA2__1', 'ITGB3__1', 'PAI_1__1', 'FGB__1', 'MTHFR_677__1', 'MTHFR_1298__1']


Unnamed: 0,F2__1,F5__1,F7__1,F13__1,ITGA2__1,ITGB3__1,PAI_1__1,FGB__1,MTHFR_677__1,MTHFR_1298__1
0,0,0,1,0,1,0,0,0,2,0
1,0,0,0,0,0,0,1,1,0,1
2,0,0,1,0,0,0,2,0,1,1


##### Отображение генетических признаков в целочисленные коды (от -2 до 2) <br> с помощью открытых баз знаний медицинских лабораторий: helix.ru, invitro.ru

In [14]:
# Per-gene lookup tables {gene: {genotype: code}}; per the section heading these codes
# follow the public knowledge bases of helix.ru and invitro.ru.
# Unlike gen_map_1, several genotypes share a code and F7 / F13 use negative codes.
gen_map_2 = [
    {"F2":         {"GG": 0, "GA": 1, "AA": 2}},
    {"F5":         {"GG": 0, "GA": 1, "AA": 2}},
    {"F7":         {"GG": 0, "GA": -1, "AA": -2}},
    {"F13":        {"GG": 0, "GT": 0, "TT": -1}},
    {"ITGA2":      {"CC": 0, "CT": 0, "TT": 1}},
    {"ITGB3":      {"TT": 0, "TC": 1, "CC": 2}},
    {"PAI_1":      {"5G5G": 0, "5G4G": 0, "4G4G": 1}},
    {"FGB":        {"GG": 0, "GA": 1, "AA": 2}},
    {"MTHFR_677":  {"CC": 0, "CT": 0, "TT": 1}},
    {"MTHFR_1298": {"AA": 0, "AC": 1, "CC": 2}},
]

# Build one '<gene>__2' column per gene by translating its genotype strings
df_2 = pd.DataFrame()
for gene_map in gen_map_2:
    (gene, codes), = gene_map.items()   # exactly one gene per entry
    df_2[gene + '__2'] = df[gene].map(codes)

genetic_features__2 = list(df_2.columns)
print(genetic_features__2)

# Append the encoded columns to the extended frame column-wise
df_ext = pd.concat([df_ext, df_2], axis=1)   # https://pandas.pydata.org/pandas-docs/stable/merging.html

df_2.head()

['F2__2', 'F5__2', 'F7__2', 'F13__2', 'ITGA2__2', 'ITGB3__2', 'PAI_1__2', 'FGB__2', 'MTHFR_677__2', 'MTHFR_1298__2']


Unnamed: 0,F2__2,F5__2,F7__2,F13__2,ITGA2__2,ITGB3__2,PAI_1__2,FGB__2,MTHFR_677__2,MTHFR_1298__2
0,0,0,-1,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,1,0,1
2,0,0,-1,0,0,0,1,0,0,1
3,0,0,0,0,0,1,0,1,0,0
4,0,0,0,0,1,1,0,1,0,1


## Гипотезы о комбинировании генетических признаков
Комбинации известных из базы знаний факторов: увеличения или снижения риска

In [15]:
# Hypothesised combinations: each single-key dict names a new summary column and
# lists the encoded genetic columns that are combined into it.
genetic_guess = [
    {'GenRisk__1':    ['F2__1', 'F5__1', 'ITGA2__1', 'ITGB3__1',
                       'PAI_1__1', 'FGB__1', 'MTHFR_677__1', 'MTHFR_1298__1']},
    {'GenRisk__2':    ['F2__2', 'F5__2', 'ITGA2__2', 'ITGB3__2',
                       'PAI_1__2', 'FGB__2', 'MTHFR_677__2', 'MTHFR_1298__2']},
    {'GenProtect__1': ['F7__1', 'F13__1']},
    {'GenProtect__2': ['F7__2', 'F13__2']},
]

# Names of the summary columns, in declaration order
genetic_risk_features = [name for guess in genetic_guess for name in guess]

In [16]:
# Build one summary column per hypothesis by summing its source columns row by row
df_hyp = pd.DataFrame()
for g in genetic_guess:
    #print(g)       #print( g.items() )         #print( k,v )
    # Each entry is a single-key dict: k = new column name, v = list of source columns
    [(k,v)] = g.items()
    # np.array(...) strips the pandas index, so the row sums are stored by position
    df_hyp[k] = np.array(df_ext[v].sum(axis=1))

# Append the summary columns to the extended frame column-wise
df_ext = pd.concat([df_ext, df_hyp], axis=1)   # https://pandas.pydata.org/pandas-docs/stable/merging.html

df_hyp[genetic_risk_features].head(4)

Unnamed: 0,GenRisk__1,GenRisk__2,GenProtect__1,GenProtect__2
0,3,1,1,-1
1,3,2,0,0
2,4,2,1,-1
3,5,2,1,0


### Сохраняем обработанные данные

In [17]:
# Final column list for export: the base groups plus the letter-coded sex column,
# the scaled numeric columns, all genetic encodings and the summary columns
data_features_ext = combine_features(
                       [target_features, relatives_features, 
                        sex_features, sex_features_cat, 
                        bmi_features, bmi_features_scaled,
                        symptom_features, 
                        cirrhosis_features, 
                        debut_age_features, debut_age_features_scaled,
                        debut_organ_features,
                        genetic_features, genetic_features__1, genetic_features__2, genetic_features__dummy,
                        genetic_risk_features ],
                       exclude_features)

In [18]:
# Show the final list of column names that will be written to the file
print(data_features_ext)

['TargetHead', 'TargetHeadRelativeMax', 'Sex', 'Sex M/F', 'BMI', 'Height', 'Mass', 'BMI_scaled', 'Height_scaled', 'Mass_scaled', 'KKF', 'Cirrhosis', 'ChildPugh', 'Advanced', 'Activity', 'DebutAge', 'DebutAge_scaled', 'DebutLiver', 'DebutNeuro', 'DebutKidney', 'DebutEndocr', 'DebutSibs', 'DebutVasku', 'DebutGemAnem', 'DebutSelez', 'DebutOther', 'F2', 'F5', 'F7', 'F13', 'ITGA2', 'ITGB3', 'PAI_1', 'FGB', 'MTHFR_677', 'MTHFR_1298', 'F2__1', 'F5__1', 'F7__1', 'F13__1', 'ITGA2__1', 'ITGB3__1', 'PAI_1__1', 'FGB__1', 'MTHFR_677__1', 'MTHFR_1298__1', 'F2__2', 'F5__2', 'F7__2', 'F13__2', 'ITGA2__2', 'ITGB3__2', 'PAI_1__2', 'FGB__2', 'MTHFR_677__2', 'MTHFR_1298__2', 'F2_GA', 'F2_GG', 'F5_GA', 'F5_GG', 'F7_GA', 'F7_GG', 'F13_GG', 'F13_GT', 'F13_TT', 'ITGA2_CC', 'ITGA2_CT', 'ITGA2_TT', 'ITGB3_TC', 'ITGB3_TT', 'PAI_1_4G4G', 'PAI_1_5G4G', 'PAI_1_5G5G', 'FGB_AA', 'FGB_GA', 'FGB_GG', 'MTHFR_677_CC', 'MTHFR_677_CT', 'MTHFR_677_TT', 'MTHFR_1298_AA', 'MTHFR_1298_AC', 'MTHFR_1298_CC', 'GenRisk__1', 'GenRis

#### Запись данных в файл: `DataFrame.to_csv`
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
<br> https://stackoverflow.com/questions/27370046/closing-file-after-using-to-csv

In [19]:
# Write the selected columns to a ';'-separated CSV file.
# The file is opened explicitly as UTF-8: when to_csv() receives an already-open text
# handle, its own `encoding` argument does not re-encode the output, so without this the
# platform default encoding would be used. newline='' keeps the line endings produced by
# the CSV writer from being translated a second time (extra blank rows on Windows).
# The `with` block closes the file even if writing fails.
with open('../Wilson_2/Wilson_ext.csv', 'w', encoding='utf-8', newline='') as outfile:
    df_ext[data_features_ext].to_csv(outfile, sep=';', index=False, encoding='utf-8', chunksize=1)

**Чтение данных из сохранённого файла**

In [20]:
# Read the saved file back and list its columns to check what was written
df_ext2 = pd.read_csv('../Wilson_2/Wilson_ext.csv', sep=';', encoding='utf-8') 
print(df_ext2.columns)

Index(['TargetHead', 'TargetHeadRelativeMax', 'Sex', 'Sex M/F', 'BMI',
       'Height', 'Mass', 'BMI_scaled', 'Height_scaled', 'Mass_scaled', 'KKF',
       'Cirrhosis', 'ChildPugh', 'Advanced', 'Activity', 'DebutAge',
       'DebutAge_scaled', 'DebutLiver', 'DebutNeuro', 'DebutKidney',
       'DebutEndocr', 'DebutSibs', 'DebutVasku', 'DebutGemAnem', 'DebutSelez',
       'DebutOther', 'F2', 'F5', 'F7', 'F13', 'ITGA2', 'ITGB3', 'PAI_1', 'FGB',
       'MTHFR_677', 'MTHFR_1298', 'F2__1', 'F5__1', 'F7__1', 'F13__1',
       'ITGA2__1', 'ITGB3__1', 'PAI_1__1', 'FGB__1', 'MTHFR_677__1',
       'MTHFR_1298__1', 'F2__2', 'F5__2', 'F7__2', 'F13__2', 'ITGA2__2',
       'ITGB3__2', 'PAI_1__2', 'FGB__2', 'MTHFR_677__2', 'MTHFR_1298__2',
       'F2_GA', 'F2_GG', 'F5_GA', 'F5_GG', 'F7_GA', 'F7_GG', 'F13_GG',
       'F13_GT', 'F13_TT', 'ITGA2_CC', 'ITGA2_CT', 'ITGA2_TT', 'ITGB3_TC',
       'ITGB3_TT', 'PAI_1_4G4G', 'PAI_1_5G4G', 'PAI_1_5G5G', 'FGB_AA',
       'FGB_GA', 'FGB_GG', 'MTHFR_677_CC', 'MTHFR_