Data preprocessing and cleaning

In [19]:
from sklearn.impute import SimpleImputer
import os
import pandas as pd
import numpy as np

np.set_printoptions(suppress=True,   precision=20,  threshold=10,  linewidth=40) # np禁止科学计数法显示
pd.set_option('display.float_format',lambda x : '%.2f' % x) # pd禁止科学计数法显示

In [20]:
path = "data/train_data.csv"

df_all = pd.read_csv(path) # 读取数据，根据你自己文件地址编写

df_all.drop_duplicates(inplace=True) # 使用drop_duplicates去重，inplace=True对原数据集进行替换
df_all.reset_index(drop=True, inplace=True) # 删除数据后，恢复索引

df_all

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.00,31397,7.00,Emergency,Extreme,2,51-60,4911.00,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.00,31397,7.00,Trauma,Extreme,2,51-60,5954.00,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.00,31397,7.00,Trauma,Extreme,2,51-60,4745.00,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.00,31397,7.00,Trauma,Extreme,2,51-60,7272.00,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.00,31397,7.00,Trauma,Extreme,2,51-60,5558.00,41-50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318433,318434,6,a,6,X,3,radiotherapy,Q,F,4.00,86499,23.00,Emergency,Moderate,3,41-50,4144.00,11-20
318434,318435,24,a,1,X,2,anesthesia,Q,E,4.00,325,8.00,Urgent,Moderate,4,81-90,6699.00,31-40
318435,318436,7,a,4,X,3,gynecology,R,F,4.00,125235,10.00,Emergency,Minor,3,71-80,4235.00,11-20
318436,318437,11,b,2,Y,3,anesthesia,Q,D,3.00,91081,8.00,Trauma,Minor,5,11-20,3761.00,11-20


In [21]:
# 整体查看数据类型 以及缺失情况
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318438 entries, 0 to 318437
Data columns (total 18 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            318438 non-null  int64  
 1   Hospital_code                      318438 non-null  int64  
 2   Hospital_type_code                 318438 non-null  object 
 3   City_Code_Hospital                 318438 non-null  int64  
 4   Hospital_region_code               318438 non-null  object 
 5   Available Extra Rooms in Hospital  318438 non-null  int64  
 6   Department                         318438 non-null  object 
 7   Ward_Type                          318438 non-null  object 
 8   Ward_Facility_Code                 318438 non-null  object 
 9   Bed Grade                          318325 non-null  float64
 10  patientid                          318438 non-null  int64  
 11  City_Code_Patient                  3139

In [22]:
# 定义查找缺失变量函数，返回 缺失值率> 指定缺失率(narate) 的列表
def filter_col_by_nan(data, narate=0.2):
    '''
    :param data: 查找数据集
    :param narate: 设定变量的缺失值率，默认20%
    :return: 返回 缺失率>narate的变量名称列表
    '''
    n_samples = data.shape[0]
    list_nan_cols = []
    for col in data.columns:
        if data[col].isna().sum() / n_samples >= (narate):
            list_nan_cols.append(col)

    print(f'缺失量在{narate * 100}%以上的变量有:{list_nan_cols}')

    return list_nan_cols

list_nullfactor_todrop = filter_col_by_nan(df_all, narate=0.2)

df_select = df_all.drop(list_nullfactor_todrop, axis=1).copy()
df_select

缺失量在20.0%以上的变量有:[]


Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.00,31397,7.00,Emergency,Extreme,2,51-60,4911.00,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.00,31397,7.00,Trauma,Extreme,2,51-60,5954.00,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.00,31397,7.00,Trauma,Extreme,2,51-60,4745.00,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.00,31397,7.00,Trauma,Extreme,2,51-60,7272.00,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.00,31397,7.00,Trauma,Extreme,2,51-60,5558.00,41-50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318433,318434,6,a,6,X,3,radiotherapy,Q,F,4.00,86499,23.00,Emergency,Moderate,3,41-50,4144.00,11-20
318434,318435,24,a,1,X,2,anesthesia,Q,E,4.00,325,8.00,Urgent,Moderate,4,81-90,6699.00,31-40
318435,318436,7,a,4,X,3,gynecology,R,F,4.00,125235,10.00,Emergency,Minor,3,71-80,4235.00,11-20
318436,318437,11,b,2,Y,3,anesthesia,Q,D,3.00,91081,8.00,Trauma,Minor,5,11-20,3761.00,11-20


In [23]:
list_factor_todrop = ['case_id', 'patientid'] #不需要入模的变量列表

df_select.drop(list_factor_todrop, axis=1, inplace=True)

df_select

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,8,c,3,Z,3,radiotherapy,R,F,2.00,7.00,Emergency,Extreme,2,51-60,4911.00,0-10
1,2,c,5,Z,2,radiotherapy,S,F,2.00,7.00,Trauma,Extreme,2,51-60,5954.00,41-50
2,10,e,1,X,2,anesthesia,S,E,2.00,7.00,Trauma,Extreme,2,51-60,4745.00,31-40
3,26,b,2,Y,2,radiotherapy,R,D,2.00,7.00,Trauma,Extreme,2,51-60,7272.00,41-50
4,26,b,2,Y,2,radiotherapy,S,D,2.00,7.00,Trauma,Extreme,2,51-60,5558.00,41-50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318433,6,a,6,X,3,radiotherapy,Q,F,4.00,23.00,Emergency,Moderate,3,41-50,4144.00,11-20
318434,24,a,1,X,2,anesthesia,Q,E,4.00,8.00,Urgent,Moderate,4,81-90,6699.00,31-40
318435,7,a,4,X,3,gynecology,R,F,4.00,10.00,Emergency,Minor,3,71-80,4235.00,11-20
318436,11,b,2,Y,3,anesthesia,Q,D,3.00,8.00,Trauma,Minor,5,11-20,3761.00,11-20


In [24]:
# 定义函数，返回离散型函数变量名列表, 并查看映射值数量情况
def get_objectfac_list(data ,print_value=False):
    lt_discrete = []
    for col in data.columns:
        if str(data[col].dtype) == 'object':
            lt_discrete.append(col)
            if print_value == True:
                print(data[col].value_counts())
                print(f'-' * 50)
    return lt_discrete


#list_discrete = get_objectfac_list(df_select,False)

# 情况3，删除离散型变量，有空值的行
df_select.dropna(axis=0,inplace=True)
df_select.reset_index(drop=True, inplace=True)

df_select

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,8,c,3,Z,3,radiotherapy,R,F,2.00,7.00,Emergency,Extreme,2,51-60,4911.00,0-10
1,2,c,5,Z,2,radiotherapy,S,F,2.00,7.00,Trauma,Extreme,2,51-60,5954.00,41-50
2,10,e,1,X,2,anesthesia,S,E,2.00,7.00,Trauma,Extreme,2,51-60,4745.00,31-40
3,26,b,2,Y,2,radiotherapy,R,D,2.00,7.00,Trauma,Extreme,2,51-60,7272.00,41-50
4,26,b,2,Y,2,radiotherapy,S,D,2.00,7.00,Trauma,Extreme,2,51-60,5558.00,41-50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313788,6,a,6,X,3,radiotherapy,Q,F,4.00,23.00,Emergency,Moderate,3,41-50,4144.00,11-20
313789,24,a,1,X,2,anesthesia,Q,E,4.00,8.00,Urgent,Moderate,4,81-90,6699.00,31-40
313790,7,a,4,X,3,gynecology,R,F,4.00,10.00,Emergency,Minor,3,71-80,4235.00,11-20
313791,11,b,2,Y,3,anesthesia,Q,D,3.00,8.00,Trauma,Minor,5,11-20,3761.00,11-20


In [25]:
# 需要对数据进行划分

# ① 取数值、连续类型的数据
list_train_num = ['Available Extra Rooms in Hospital','Visitors with Patient','Admission_Deposit']

# ②取文本/离散、无需独热编码 类型的数据（类似 住宅类型、就业类型 等字段）
list_train_str = ['Bed Grade','Type of Admission','Type of Admission','Age','Stay']

# ③取文本/离散、需 独热编码 类型的数据（类似 教育水平分类 等变量）
list_train_str_needtrf = ['Hospital_code','Hospital_type_code','City_Code_Hospital','Hospital_region_code','Department','Ward_Type','Ward_Facility_Code','City_Code_Patient','Type of Admission','Severity of Illness']

# 查看训练集空值情况（此时只剩数值型空值，其他类型的数据都被处理了）
df_select[df_select.isnull().any(axis=1)].head()
df_select

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,8,c,3,Z,3,radiotherapy,R,F,2.00,7.00,Emergency,Extreme,2,51-60,4911.00,0-10
1,2,c,5,Z,2,radiotherapy,S,F,2.00,7.00,Trauma,Extreme,2,51-60,5954.00,41-50
2,10,e,1,X,2,anesthesia,S,E,2.00,7.00,Trauma,Extreme,2,51-60,4745.00,31-40
3,26,b,2,Y,2,radiotherapy,R,D,2.00,7.00,Trauma,Extreme,2,51-60,7272.00,41-50
4,26,b,2,Y,2,radiotherapy,S,D,2.00,7.00,Trauma,Extreme,2,51-60,5558.00,41-50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313788,6,a,6,X,3,radiotherapy,Q,F,4.00,23.00,Emergency,Moderate,3,41-50,4144.00,11-20
313789,24,a,1,X,2,anesthesia,Q,E,4.00,8.00,Urgent,Moderate,4,81-90,6699.00,31-40
313790,7,a,4,X,3,gynecology,R,F,4.00,10.00,Emergency,Minor,3,71-80,4235.00,11-20
313791,11,b,2,Y,3,anesthesia,Q,D,3.00,8.00,Trauma,Minor,5,11-20,3761.00,11-20


In [26]:
# 数据清洗函数定义
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import check_array
from scipy import sparse


class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """将分类特征编码为数字数组。
    此函数输入 分类的整数矩阵 或 字符串矩阵,
    将把分类（离散）特征所具有的值转化为数组

    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.
        Returns
        -------
        self
        """

        # 报错预警
        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape # n_samples 样本数，n_features 特征数

        self._label_encoders_ = [LabelEncoder() for n_f in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        """Transform X using one-hot encoding.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.
        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input.
        """
        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
        n_samples, n_features = X.shape


        X_int = np.zeros_like(X, dtype=np.int) # 构建一个和 X 维度相同的(一毛一样)  整数零矩阵
        X_mask = np.ones_like(X, dtype=np.bool) # 构建一个和 X 维度相同的(一毛一样)   布尔矩阵

        for i in range(n_features): # 对每个变量开始循环
            valid_mask = np.in1d(X[:, i], self.categories_[i]) # 每个变量的每一行，去验证是否是是此变量unqiue中的一个，是返回True

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue `The rows are marked `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask # unique矩阵赋予X_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel() # .ravel()将矩阵向量化

        n_values = [cats.shape[0] for cats in self.categories_]

        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask] # 找到该变量某个离散值中的所有的列索引
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]


        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        # out = out[:,1:] # 这里为one_hot,如果要转换成哑变量需要将状态进行k-1删除，防止虚拟陷阱！

        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

In [27]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [28]:
class ExeLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._stretltype = 1

    def fit(self, X, y=None):
        return self

    def transform(self, X):

        encoder = LabelEncoder()
        n_samples, n_features = X.shape
        arr = np.zeros_like(X, dtype=np.int)

        for whlist in range(X.shape[1]):
            arr[:, whlist] = encoder.fit_transform(X[:, whlist])
        return arr

In [29]:
from sklearn.model_selection import train_test_split

# 如果为监督学习则需要复制标签，如果无监督学习则不需要下方复制标签的代码
df_select_labels = df_select["Stay"].copy() # 复制标签

In [30]:

# 30%数据做测试集
Xtrain, Xtest, Ytrain, Ytest = train_test_split(df_select, df_select_labels, test_size=0.3, random_state=42)

In [31]:
Xtrain

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
291956,3,c,3,Z,6,radiotherapy,Q,A,3.00,6.00,Emergency,Moderate,10,51-60,2903.00,51-60
74686,28,b,11,X,2,gynecology,R,F,1.00,2.00,Urgent,Extreme,3,41-50,4530.00,31-40
255553,23,a,6,X,2,radiotherapy,R,F,3.00,4.00,Trauma,Minor,2,61-70,4469.00,11-20
23023,29,a,4,X,4,gynecology,S,F,2.00,2.00,Trauma,Moderate,4,31-40,6094.00,91-100
290082,22,g,9,Y,6,gynecology,R,B,3.00,14.00,Trauma,Moderate,2,51-60,5981.00,11-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,19,a,7,Y,2,gynecology,S,C,2.00,8.00,Emergency,Moderate,10,41-50,3396.00,More than 100 Days
259178,26,b,2,Y,3,gynecology,R,D,3.00,7.00,Trauma,Minor,5,21-30,5619.00,71-80
131932,12,a,9,Y,4,radiotherapy,R,B,3.00,23.00,Emergency,Moderate,6,41-50,8046.00,61-70
146867,32,f,9,Y,3,gynecology,S,B,3.00,9.00,Emergency,Minor,2,11-20,5441.00,11-20


In [32]:
Xtest

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
106277,6,a,6,X,4,gynecology,Q,F,2.00,15.00,Trauma,Extreme,2,41-50,7446.00,21-30
48909,10,e,1,X,3,gynecology,S,E,1.00,25.00,Trauma,Moderate,2,21-30,5453.00,11-20
109603,11,b,2,Y,4,gynecology,Q,D,2.00,8.00,Trauma,Extreme,4,51-60,4189.00,31-40
9061,30,c,3,Z,3,TB & Chest disease,R,A,3.00,11.00,Trauma,Minor,3,71-80,5464.00,41-50
172242,30,c,3,Z,4,gynecology,Q,A,2.00,8.00,Trauma,Moderate,2,21-30,4595.00,41-50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58777,14,a,1,X,2,gynecology,S,E,3.00,2.00,Emergency,Minor,2,31-40,3585.00,0-10
23646,28,b,11,X,2,gynecology,R,F,2.00,12.00,Trauma,Extreme,2,41-50,6708.00,11-20
232739,23,a,6,X,5,gynecology,Q,F,3.00,2.00,Trauma,Minor,4,21-30,4489.00,0-10
204119,14,a,1,X,4,gynecology,Q,E,3.00,4.00,Emergency,Minor,4,51-60,6554.00,11-20


In [33]:
Ytrain

291956                 51-60
74686                  31-40
255553                 11-20
23023                 91-100
290082                 11-20
                 ...        
119879    More than 100 Days
259178                 71-80
131932                 61-70
146867                 11-20
121958                 31-40
Name: Stay, Length: 219655, dtype: object

In [34]:
Ytest

106277    21-30
48909     11-20
109603    31-40
9061      41-50
172242    41-50
          ...  
58777      0-10
23646     11-20
232739     0-10
204119    11-20
10039     11-20
Name: Stay, Length: 94138, dtype: object

In [35]:
# 利用Pipeline定义不同变量清洗过程
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler # 数据标准化

# 连续型：处理顺序为1.DataFrameSelector 选择数据集，2.SimpleImputer填充缺失值，3.StandardScaler进行标准化数据
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(list_train_num)),
    ('simple_imputer', SimpleImputer(strategy="mean")),# median为中位数，most_frequent为众数。constant表示将空值填充为自定义的值，但这个自定义的值要通过fill_value来定义
    ('std_scaler', StandardScaler()), # 进行标准化
    ])

# 离散，将数据转化为 数字
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(list_train_str)),
    ('label_encoder', ExeLabelEncoder()), # 进行 ExeLabelEncoder
    ])

# 离散，将数据转化为 数字，并进行onehot编码
cat_onehot_pipeline = Pipeline([ # 类别需要onehot编码
    ('selector', DataFrameSelector(list_train_str_needtrf)),
    ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
    ])

Xtrain


Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
291956,3,c,3,Z,6,radiotherapy,Q,A,3.00,6.00,Emergency,Moderate,10,51-60,2903.00,51-60
74686,28,b,11,X,2,gynecology,R,F,1.00,2.00,Urgent,Extreme,3,41-50,4530.00,31-40
255553,23,a,6,X,2,radiotherapy,R,F,3.00,4.00,Trauma,Minor,2,61-70,4469.00,11-20
23023,29,a,4,X,4,gynecology,S,F,2.00,2.00,Trauma,Moderate,4,31-40,6094.00,91-100
290082,22,g,9,Y,6,gynecology,R,B,3.00,14.00,Trauma,Moderate,2,51-60,5981.00,11-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,19,a,7,Y,2,gynecology,S,C,2.00,8.00,Emergency,Moderate,10,41-50,3396.00,More than 100 Days
259178,26,b,2,Y,3,gynecology,R,D,3.00,7.00,Trauma,Minor,5,21-30,5619.00,71-80
131932,12,a,9,Y,4,radiotherapy,R,B,3.00,23.00,Emergency,Moderate,6,41-50,8046.00,61-70
146867,32,f,9,Y,3,gynecology,S,B,3.00,9.00,Emergency,Minor,2,11-20,5441.00,11-20


In [36]:
# 定义FeatureUnion 进行流水化清洗数据
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
        ("cat_onehot_pipeline",cat_onehot_pipeline)
    ])

df_select_prepared = full_pipeline.fit_transform(Xtrain) # 使用流水线清洗数据，对df_train进行etl

df_select_prepared.shape # 查看清洗后的数据维度（行列）

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  arr = np.zeros_like(X, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_int = np.zeros_like(X, dtype=np.int) # 构建一个和 X 维度相同的(一毛一样)  整数零矩阵
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_mask = np.ones_like(X, dtype=np.bool) # 构建一个和 X 维度相同的(一毛一样)   布尔矩阵


(219655, 121)

Feature Selection