## 统计分析

通过指定需要统计分析的字段，得到每个特征的p_value，所有的p_value都是基于T检验（t-test）计算的。支持指定不同的分组`group`，例如按train、val、test等分组进行统计。

对于两大类不同的特征，统计方式如下：

1. 离散特征，统计数量以及占比。
2. 连续特征，统计均值、标准差（以`均值±标准差`的形式输出）。

In [1]:
import pandas as pd
import numpy as np
from onekey_algo import OnekeyDS as okds
from onekey_algo import get_param_in_cwd
from onekey_algo.custom.utils import print_join_info

# Name of the label column; falls back to 'label' when nothing is configured.
task = get_param_in_cwd('task_column') or 'label'
# Point 'clinic_file' at your own clinical data file; `okds.survival` is the
# fallback when it is not configured.
clinic_path = get_param_in_cwd('clinic_file') or okds.survival
test_data = pd.read_csv(clinic_path, dtype={'ID': str})
stats_columns = get_param_in_cwd('stats_columns')
continuous_columns = get_param_in_cwd('continuous_columns')

# Drop the label column first, then move 'ID' to the front.
non_label = [name for name in test_data.columns if name != task]
test_data = test_data[non_label]
id_first = ['ID'] + [name for name in test_data.columns if name != 'ID']
test_data = test_data[id_first]

# Attach the group assignment; only IDs present in both tables are kept.
group_info = pd.read_csv('data/group.csv', dtype={'ID': str})
print_join_info(test_data, group_info)
test_data = test_data.merge(group_info, on='ID', how='inner')
test_data



Unnamed: 0,ID,T,N,M,Degree,DU,BILE,胰周脂肪,脉管癌栓,胰周神经,胰内神经,Sex,Age,OSTime,OS,group
0,1238060,,,,中分化,0.0,0.0,,,,,男,56,4.50,0,train
1,1240023,,,,中分化,0.0,0.0,,,,1.0,男,39,28.47,0,train
2,1310929,,,,中分化,1.0,1.0,,,,1.0,女,65,5.73,1,test
3,1320927,,,,中分化,1.0,0.0,,,,,男,56,7.30,1,test
4,1322885,,,,中分化,0.0,0.0,,,,,男,68,4.87,0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,2017433,2.0,0,,中分化,1.0,1.0,1.0,0.0,1.0,1.0,男,48,22.23,1,test
696,2018527,1.0,0,,低至中分化,0.0,1.0,1.0,0.0,1.0,1.0,男,78,5.07,0,train
697,2018539,4.0,0,,低至中分化,0.0,0.0,1.0,1.0,1.0,1.0,男,71,16.73,1,train
698,2018734,3.0,2,,低至中分化,0.0,1.0,1.0,1.0,1.0,1.0,女,72,10.93,1,test


In [2]:
# Count how many rows carry each value of the 'group' column.
test_data['group'].value_counts()

train    490
test     210
Name: group, dtype: int64

In [3]:
def map_cnames(x):
    """Normalise a raw column header into a plain, identifier-like name.

    Steps, in order: trim surrounding whitespace, keep only the text after
    the last ':', turn '-', ' ' and '/' into '_', remove '>', discard
    everything from the first '(' onwards, and trim leading/trailing '_'.

    Args:
        x: Raw column header (a string).

    Returns:
        The cleaned column name.
    """
    # One pass over the characters instead of a chain of replace() calls.
    substitutions = str.maketrans({'-': '_', ' ': '_', '/': '_', '>': None})
    after_colon = x.strip().rpartition(':')[2]
    before_paren = after_colon.translate(substitutions).partition('(')[0]
    return before_paren.strip('_')

# Run the header clean-up over every column name of the table.
test_data.columns = [map_cnames(header) for header in test_data.columns]
test_data.columns

Index(['ID', 'T', 'N', 'M', 'Degree', 'DU', 'BILE', '胰周脂肪', '脉管癌栓', '胰周神经',
       '胰内神经', 'Sex', 'Age', 'OSTime', 'OS', 'group'],
      dtype='object')

In [4]:
# Among the columns after the first one and before the last three, pick those
# stored with object dtype; these are the ones passed on for category mapping.
mapping_columns = [
    name
    for name, kind in test_data.dtypes.iloc[1:-3].items()
    if kind == object
]
mapping_columns

['N', 'Degree', 'Sex']

# 数据映射

针对所有非数值形式的数据，可以进行类别映射。

In [5]:
from onekey_algo.custom.utils import map2numerical

# map2numerical returns two values: the table (presumably with the listed
# columns converted to numeric codes — confirm against the helper) and the
# per-column value -> code mapping that is displayed below.
data, mapping = map2numerical(test_data, mapping_columns=mapping_columns)
mapping

{'N': {'0': 0, '1': 1, '2': 2, 'A': 3},
 'Degree': {'中分化': 0, '中至高分化': 1, '低分化': 2, '低至中分化': 3, '高分化': 4},
 'Sex': {'女': 0, '男': 1}}

In [6]:
# Every column except the first one and the last three is summarised.
stats_columns = list(test_data.columns)[1:-3]
continuous_columns = []
for c in stats_columns:
    levels = np.unique(test_data[c])
    # A column with more than 6 distinct values is always treated as
    # continuous. Otherwise it must both have a dtype outside the
    # int8..int64 range (per NumPy's dtype ordering) and have distinct
    # values summing to more than 3.
    if len(levels) <= 6:
        if np.int8 <= test_data[c].dtype <= np.int64:
            continue
        if not sum(levels) > 3:
            continue
    continuous_columns.append(c)

continuous_columns

['Age']

# 缺失值填充

In [7]:
import os
from onekey_algo.custom.components.comp1 import fillna
# Make sure the output directory exists before writing into it.
os.makedirs('data', exist_ok=True)
# Fill missing values. NOTE(review): fill_mod='50%' presumably selects the
# median (50th percentile) of each column — confirm against fillna.
data = fillna(data, fill_mod='50%')
# Save the filled table without the row index.
data.to_csv('data/clinical.csv', index=False)
data

Unnamed: 0,ID,T,N,M,Degree,DU,BILE,胰周脂肪,脉管癌栓,胰周神经,胰内神经,Sex,Age,OSTime,OS,group
0,1238060,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1,56,4.50,0,train
1,1240023,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1,39,28.47,0,train
2,1310929,2.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0,65,5.73,1,test
3,1320927,2.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1,56,7.30,1,test
4,1322885,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1,68,4.87,0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,2017433,2.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1,48,22.23,1,test
696,2018527,1.0,0.0,1.0,3.0,0.0,1.0,1.0,0.0,1.0,1.0,1,78,5.07,0,train
697,2018539,4.0,0.0,1.0,3.0,0.0,0.0,1.0,1.0,1.0,1.0,1,71,16.73,1,train
698,2018734,3.0,2.0,1.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,0,72,10.93,1,test


### 输出格式
支持两种输出格式，分别对应`pretty`参数的`True`和`False`：当为`True`时，输出表格（DataFrame）；反之则输出dict数据。

```python
def clinic_stats(data: DataFrame, stats_columns: Union[str, List[str]], label_column='label',
                 group_column: str = None, continuous_columns: Union[str, List[str]] = None,
                 pretty: bool = True) -> Union[dict, DataFrame]:
    """
    Compute summary statistics for the selected columns.

    Args:
        data: The data.
        stats_columns: Names of the columns to summarise.
        label_column: Binary-classification label column, defaults to `label`.
        group_column: Column the statistics are grouped by, e.g. to separate
            the training, test and validation groups.
        continuous_columns: Which columns are continuous variables; mean and
            variance are reported for continuous variables.
            NOTE(review): the sample output in this notebook shows values
            such as `61.77±9.62` for Age, which looks like a standard
            deviation rather than a variance — confirm.
        pretty: bool, whether to pretty-format the result.

    Returns:
        stats DataFrame or json

    NOTE(review): the call in this notebook also passes `verbose`, which
    this signature does not list — confirm against the installed version.

    """
```

In [8]:
from onekey_algo.custom.components.stats import clinic_stats

# Lift the row limit so the whole statistics table is displayed.
pd.set_option('display.max_rows', None)
# 'group' is passed as the label column, so the statistics and p-values
# compare the groups (train vs. test here) instead of an outcome label.
stats = clinic_stats(data, 
                     stats_columns= stats_columns,
                     label_column='group', 
                     group_column=None, 
                     continuous_columns= continuous_columns, 
                     pretty=True, verbose=False)
# 'utf_8_sig' writes a UTF-8 byte-order mark; presumably so spreadsheet tools
# detect the encoding of the non-ASCII feature names — confirm.
stats.to_csv('stats_sep.csv', index=False, encoding='utf_8_sig')
stats

Unnamed: 0,feature_name,-label=ALL,-label=test,-label=train,pvalue
0,Age,61.77±9.62,61.98±9.54,61.68±9.66,0.605
0,T,,,,0.525
1,1.0,70(10.00),18(8.57),52(10.61),
2,2.0,300(42.86),87(41.43),213(43.47),
3,3.0,324(46.29),102(48.57),222(45.31),
4,4.0,6(0.86),3(1.43),3(0.61),
5,N,,,,0.187
6,0.0,275(39.29),73(34.76),202(41.22),
7,1.0,304(43.43),97(46.19),207(42.24),
8,2.0,108(15.43),38(18.10),70(14.29),
