## Key2

对于病理任务，将每个样本（患者）所有 tiles 的预测结果汇总为 histogram 或 TF-IDF 形式的患者级（patient-level）特征。

![](http://www.medai.icu/storage/attachments/2022/06/26/n41q4HeDjvIOZnyfoKH28c5YNioGdB7OZwO35XOf.png)

参考论文: [Development and interpretation of a pathomics-based model for the prediction of microsatellite instability in Colorectal Cancer](http://www.medai.icu/download?url=http://www.medai.icu/apiv3/attachment.download?sign=1667478d908313ae1e01543e229d02de&attachmentsId=1061&threadId=230)

In [None]:
import pandas as pd
from onekey_algo.custom.utils import key2
import numpy as np
import os
from onekey_algo import get_param_in_cwd

def map2n(x):
    """Convert ``x`` to an ``int``, falling back to ``0`` when that fails.

    Args:
        x: Any value to pass to ``int()`` (e.g. a number or numeric string).

    Returns:
        ``int(x)`` on success, otherwise ``0``.
    """
    try:
        return int(x)
    # Only the errors ``int()`` raises for unusable input mean "not a number".
    # A bare ``except`` would also swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return 0
    
# Name of the model sub-directory whose prediction file is loaded below.
model = 'resnet18'
# Epoch numbers referenced only by the commented-out per-epoch log path below.
# NOTE(review): there is no 'resnet18' key, so re-enabling that path with the
# current `model` would raise KeyError.
sm = {'inception_v3': 3, 'resnet50': 5, 'resnet101': 3, 'densenet121': 4, 'vgg19': 5}
model_root = os.path.join(get_param_in_cwd('data_root'), 'models')
train_log_path = rf'{model_root}/{model}/viz/predictions.csv'
# train_log_path = rf'{get_param_in_cwd("model_root")}/{model}/train/Epoch-{sm[model]}.txt'
train_log = pd.read_csv(train_log_path)
# Coerce predicted labels to int; values int() cannot convert become 0.
train_log['pred'] = train_log['pred'].map(map2n)

log = pd.concat([train_log], axis=0)
# Keep `prob` where pred == 1 and use 1 - prob otherwise.
# Vectorized equivalent of the former row-by-row Python lambda.
log['prob'] = np.where(log['pred'] == 1, log['prob'], 1 - log['prob'])
# Two decimals bound the number of distinct `prob` values used downstream.
log['prob'] = log['prob'].round(decimals=2)
# Group key = text before the first '-' in the file's base name.
# A plain list avoids the deprecated DataFrame.applymap and any index alignment.
# NOTE(review): the next cell splits on '_' instead — confirm which separator
# matches these file names.
log['group'] = [os.path.basename(fname).split('-')[0] for fname in log['fname']]
log

In [14]:
import pandas as pd
from onekey_algo.custom.utils import key2
import numpy as np
import os
from onekey_algo import get_param_in_cwd

def map2n(x):
    """Convert ``x`` to an ``int``, falling back to ``0`` when that fails.

    Args:
        x: Any value to pass to ``int()`` (e.g. a number or numeric string).

    Returns:
        ``int(x)`` on success, otherwise ``0``.
    """
    try:
        return int(x)
    # Only the errors ``int()`` raises for unusable input mean "not a number".
    # A bare ``except`` would also swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return 0
    
# Name of the model sub-directory whose result files are loaded below.
model = 'resnet18'
# Epoch numbers referenced only by the commented-out per-epoch log paths below.
# NOTE(review): there is no 'resnet18' key, so re-enabling those paths with the
# current `model` would raise KeyError.
sm = {'inception_v3': 3, 'resnet50': 5, 'resnet101': 3, 'densenet121': 4, 'vgg19': 5}
model_root = os.path.join(get_param_in_cwd('data_root'), 'models')

# The BST_*_RESULTS.txt files are read as header-less, tab-separated tables.
result_columns = ['fname', 'prob', 'pred', 'gt']

train_log_path = rf'{model_root}/{model}/viz/BST_TRAIN_RESULTS.txt'
# train_log_path = rf'{get_param_in_cwd("model_root")}/{model}/train/Epoch-{sm[model]}.txt'
train_log = pd.read_csv(train_log_path, names=result_columns, sep='\t')

val_log_path = rf'{model_root}/{model}/viz/BST_VAL_RESULTS.txt'
# val_log_path = rf'{get_param_in_cwd("model_root")}/{model}/valid/Epoch-{sm[model]}.txt'
val_log = pd.read_csv(val_log_path, names=result_columns, sep='\t')

# predictions.csv is read with its own header row.
aux_log_path = rf'{model_root}/{model}/viz/predictions.csv'
aux_log = pd.read_csv(aux_log_path)

# Coerce predicted labels to int; values int() cannot convert become 0.
for frame in (train_log, val_log, aux_log):
    frame['pred'] = frame['pred'].map(map2n)

log = pd.concat([train_log, val_log, aux_log], axis=0)
# Keep `prob` where pred == 1 and use 1 - prob otherwise.
# Vectorized equivalent of the former row-by-row Python lambda.
log['prob'] = np.where(log['pred'] == 1, log['prob'], 1 - log['prob'])
# Two decimals bound the number of distinct `prob` values used downstream.
log['prob'] = log['prob'].round(decimals=2)
# Group key = text before the first '_' in the file's base name.
# A plain list avoids the deprecated DataFrame.applymap and is unaffected by
# the repeated index labels that concat leaves behind.
log['group'] = [os.path.basename(fname).split('_')[0] for fname in log['fname']]
log

TypeError: cannot concatenate object of type '<class 'str'>'; only Series and DataFrame objs are valid

In [15]:
# Count how many rows of `log` carry each distinct value of the `pred` column.
log['pred'].value_counts()

Unnamed: 0,fname,prob,pred,gt,group
0,F:\20230915-BianYun\patches\1805286-A3\1805286...,0.60,1,1,1805286-A3
1,F:\20230915-BianYun\patches\1907669-C3\1907669...,0.53,1,1,1907669-C3
2,F:\20230915-BianYun\patches\1914886-C1\1914886...,0.80,1,1,1914886-C1
3,F:\20230915-BianYun\patches\1931031-A3\1931031...,0.44,0,0,1931031-A3
4,F:\20230915-BianYun\patches\1738059-A6\1738059...,0.52,1,1,1738059-A6
...,...,...,...,...,...
1025931,F:\\20230915-BianYun\patches\2319240-A7\231924...,0.93,1,1,2319240-A7
1025932,F:\\20230915-BianYun\patches\2319240-A7\231924...,0.86,1,1,2319240-A7
1025933,F:\\20230915-BianYun\patches\2319240-A7\231924...,0.67,1,1,2319240-A7
1025934,F:\\20230915-BianYun\patches\2319240-A7\231924...,0.75,1,1,2319240-A7


### 直方图

```python
def key2histogram(data: pd.DataFrame, group_column: str, histo_columns: Union[str, List[str]],
                  histo_lists: Union[list, List[list]] = None, default_value=0, norm: bool = False):
    """
    Build histogram features from all rows of ``data``. When several ``histo_columns``
    are given, the features of every column are concatenated horizontally.
    Args:
        data: Input data.
        group_column: Name of the column that groups rows into samples, usually the ID.
        histo_columns: Column(s) used to compute the histogram. With multiple columns, a
            histogram is computed per column and the features are then concatenated.
        histo_lists: None, or one list per entry of ``histo_columns`` giving a
            user-specified feature list.
        default_value: Default value used when a feature is absent.
        norm: Whether to normalize.
    Returns:
        Not documented in the original. The cell below calls ``.to_csv`` on the
        result, so presumably a pandas DataFrame — TODO confirm against key2.py.
    """
```

In [16]:
import os

# Make sure the output folder exists before any CSV is written.
os.makedirs('features', exist_ok=True)

# (column fed to the histogram, CSV file that receives the resulting features)
histogram_jobs = (
    ('prob', 'features/path_prob_histogram.csv'),
    ('pred', 'features/path_pred_histogram.csv'),
)
for histo_column, csv_path in histogram_jobs:
    # One normalized histogram feature table per source column, grouped by `group`.
    results = key2.key2histogram(log, group_column='group', histo_columns=histo_column, norm=True)
    results.to_csv(csv_path, header=True, index=False)
    display(results)

[2024-03-24 19:51:24 - key2.py:  59]	INFO	一共有1049个样本。
[2024-03-24 19:51:35 - key2.py:  68]	INFO	处理完成104个，占比9.91%
[2024-03-24 19:51:46 - key2.py:  68]	INFO	处理完成208个，占比19.83%
[2024-03-24 19:51:58 - key2.py:  68]	INFO	处理完成312个，占比29.74%
[2024-03-24 19:52:09 - key2.py:  68]	INFO	处理完成416个，占比39.66%
[2024-03-24 19:52:20 - key2.py:  68]	INFO	处理完成520个，占比49.57%
[2024-03-24 19:52:32 - key2.py:  68]	INFO	处理完成624个，占比59.49%
[2024-03-24 19:52:43 - key2.py:  68]	INFO	处理完成728个，占比69.40%
[2024-03-24 19:52:54 - key2.py:  68]	INFO	处理完成832个，占比79.31%
[2024-03-24 19:53:05 - key2.py:  68]	INFO	处理完成936个，占比89.23%
[2024-03-24 19:53:17 - key2.py:  68]	INFO	处理完成1040个，占比99.14%


Unnamed: 0,ID,prob-0.0,prob-0.01,prob-0.02,prob-0.03,prob-0.04,prob-0.05,prob-0.06,prob-0.07,prob-0.08,...,prob-0.91,prob-0.92,prob-0.93,prob-0.94,prob-0.95,prob-0.96,prob-0.97,prob-0.98,prob-0.99,prob-1.0
0,1238060-D,0.0,0.0,0.0,0.0,0.000e+00,0.000e+00,0.000e+00,0.000e+00,0.000e+00,...,0.017,0.012,0.009,1.012e-02,0.006,0.008,0.004,0.001,5.951e-04,0.000e+00
1,1240023-C2,0.0,0.0,0.0,0.0,3.349e-04,3.349e-04,6.698e-04,3.349e-04,2.009e-03,...,0.000,0.000,0.000,3.349e-04,0.000,0.000,0.000,0.000,0.000e+00,0.000e+00
2,1310929-C2,0.0,0.0,0.0,0.0,0.000e+00,0.000e+00,0.000e+00,0.000e+00,4.067e-04,...,0.007,0.012,0.005,4.473e-03,0.009,0.009,0.004,0.004,4.067e-04,0.000e+00
3,1317999-D1,0.0,0.0,0.0,0.0,0.000e+00,0.000e+00,0.000e+00,0.000e+00,0.000e+00,...,0.015,0.012,0.015,1.794e-02,0.016,0.013,0.007,0.006,3.139e-03,0.000e+00
4,1320927-C3,0.0,0.0,0.0,0.0,0.000e+00,0.000e+00,0.000e+00,0.000e+00,3.171e-04,...,0.022,0.024,0.022,2.790e-02,0.034,0.037,0.039,0.034,1.585e-02,6.024e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044,2318751-C3,0.0,0.0,0.0,0.0,0.000e+00,0.000e+00,0.000e+00,0.000e+00,0.000e+00,...,0.024,0.023,0.023,1.871e-02,0.019,0.019,0.013,0.007,5.513e-03,3.341e-04
1045,2318753-C4,0.0,0.0,0.0,0.0,0.000e+00,0.000e+00,0.000e+00,0.000e+00,0.000e+00,...,0.024,0.019,0.016,1.492e-02,0.015,0.014,0.010,0.005,1.119e-03,0.000e+00
1046,2318951-C1,0.0,0.0,0.0,0.0,0.000e+00,0.000e+00,0.000e+00,0.000e+00,0.000e+00,...,0.029,0.035,0.041,3.307e-02,0.047,0.045,0.056,0.068,8.345e-02,4.495e-02
1047,2318954-A4,0.0,0.0,0.0,0.0,0.000e+00,0.000e+00,0.000e+00,0.000e+00,0.000e+00,...,0.005,0.006,0.006,5.995e-03,0.006,0.005,0.004,0.002,2.067e-04,0.000e+00


[2024-03-24 19:53:20 - key2.py:  59]	INFO	一共有1049个样本。
[2024-03-24 19:53:32 - key2.py:  68]	INFO	处理完成104个，占比9.91%
[2024-03-24 19:53:43 - key2.py:  68]	INFO	处理完成208个，占比19.83%
[2024-03-24 19:53:54 - key2.py:  68]	INFO	处理完成312个，占比29.74%
[2024-03-24 19:54:05 - key2.py:  68]	INFO	处理完成416个，占比39.66%
[2024-03-24 19:54:16 - key2.py:  68]	INFO	处理完成520个，占比49.57%
[2024-03-24 19:54:27 - key2.py:  68]	INFO	处理完成624个，占比59.49%
[2024-03-24 19:54:39 - key2.py:  68]	INFO	处理完成728个，占比69.40%
[2024-03-24 19:54:50 - key2.py:  68]	INFO	处理完成832个，占比79.31%
[2024-03-24 19:55:01 - key2.py:  68]	INFO	处理完成936个，占比89.23%
[2024-03-24 19:55:12 - key2.py:  68]	INFO	处理完成1040个，占比99.14%


Unnamed: 0,ID,pred-0,pred-1
0,1238060-D,0.020,0.980
1,1240023-C2,0.780,0.220
2,1310929-C2,0.080,0.920
3,1317999-D1,0.053,0.947
4,1320927-C3,0.044,0.956
...,...,...,...
1044,2318751-C3,0.127,0.873
1045,2318753-C4,0.055,0.945
1046,2318951-C1,0.008,0.992
1047,2318954-A4,0.378,0.622


### TF-IDF

```python
def key2tfidf(data: pd.DataFrame, group_column: str, corpus_columns: Union[str, List[str]]):
    """
    Build TF-IDF features from all rows of ``data``. When several ``corpus_columns``
    are given, the features of every column are concatenated horizontally.
    NOTE(review): the original docstring said "histogram features" here, presumably
    copied from key2histogram — confirm against key2.py.
    Args:
        data: Input data.
        group_column: Name of the column that groups rows into samples, usually the ID.
        corpus_columns: Name(s) of the column(s) used as the corpus.
    Returns:
        Not documented in the original. The cell below calls ``.to_csv`` on the
        result, so presumably a pandas DataFrame — TODO confirm against key2.py.
    """
```

In [17]:
# (column used as the corpus, CSV file that receives the resulting features)
tfidf_jobs = (
    ('prob', 'features/path_prob_tfidf.csv'),
    ('pred', 'features/path_pred_tfidf.csv'),
)
for corpus_column, csv_path in tfidf_jobs:
    # One TF-IDF feature table per source column, grouped by `group`.
    results = key2.key2tfidf(log, group_column='group', corpus_columns=corpus_column)
    results.to_csv(csv_path, header=True, index=False)
    display(results)

Unnamed: 0,ID,prob00,prob001,prob002,prob003,prob004,prob005,prob006,prob007,prob008,...,prob091,prob092,prob093,prob094,prob095,prob096,prob097,prob098,prob099,prob10
1238060-D,1238060-D,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000,...,0.122,0.085,0.068,0.077,0.047,0.067,0.030,0.011,0.006,0.000
1240023-C2,1240023-C2,0.0,0.0,0.0,0.0,0.005,0.005,0.009,0.004,0.024,...,0.000,0.000,0.000,0.002,0.000,0.000,0.000,0.000,0.000,0.000
1310929-C2,1310929-C2,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.006,...,0.056,0.092,0.042,0.037,0.081,0.079,0.041,0.036,0.005,0.000
1317999-D1,1317999-D1,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000,...,0.114,0.094,0.118,0.148,0.135,0.110,0.062,0.057,0.036,0.000
1320927-C3,1320927-C3,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.004,...,0.152,0.171,0.159,0.208,0.266,0.294,0.322,0.297,0.165,0.084
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2318751-C3,2318751-C3,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000,...,0.188,0.182,0.180,0.155,0.167,0.166,0.124,0.069,0.064,0.005
2318753-C4,2318753-C4,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000,...,0.177,0.139,0.124,0.117,0.122,0.117,0.084,0.043,0.012,0.000
2318951-C1,2318951-C1,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000,...,0.126,0.154,0.181,0.152,0.226,0.219,0.290,0.373,0.536,0.389
2318954-A4,2318954-A4,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000,...,0.029,0.036,0.041,0.041,0.043,0.038,0.030,0.020,0.002,0.000


Unnamed: 0,ID,pred0,pred1
1238060-D,1238060-D,0.020,1.000
1240023-C2,1240023-C2,0.963,0.270
1310929-C2,1310929-C2,0.087,0.996
1317999-D1,1317999-D1,0.056,0.998
1320927-C3,1320927-C3,0.046,0.999
...,...,...,...
2318751-C3,2318751-C3,0.144,0.990
2318753-C4,2318753-C4,0.058,0.998
2318951-C1,2318951-C1,0.008,1.000
2318954-A4,2318954-A4,0.520,0.854
