# 传统机器学习方法
路线：前置准备 预处理 可视化 特征提取 特征选择 分类器

In [1]:
import os
import numpy as np
import pandas as pd
from scipy.signal import butter, filtfilt, iirnotch, lfilter
from copy import deepcopy
from entropy import *
import entropy
from load_lyh_data import  *

## 0. 前置准备
加载lyh的数据，这里仅仅是加载数据对数据进行分割，并未对数据进行任何处理

In [2]:
raw_data_dict = read_folder_npy_data()

Loaded nothing1_orginal.npy: Shape=(100,), Dtype=object
Loaded nothing2_orginal.npy: Shape=(100,), Dtype=object
Loaded nothing3_orginal.npy: Shape=(100,), Dtype=object
Loaded left1_orginal.npy: Shape=(100,), Dtype=object
Loaded left2_orginal.npy: Shape=(100,), Dtype=object
Loaded left3_orginal.npy: Shape=(100,), Dtype=object
Loaded right1_orginal.npy: Shape=(100,), Dtype=object
Loaded right2_orginal.npy: Shape=(100,), Dtype=object
Loaded right3_orginal.npy: Shape=(100,), Dtype=object
Loaded leg1_orginal.npy: Shape=(100,), Dtype=object
Loaded leg2_orginal.npy: Shape=(100,), Dtype=object
Loaded leg3_orginal.npy: Shape=(100,), Dtype=object


In [3]:
np.array(raw_data_dict["leg3_orginal.npy"][0]).shape

(1027, 19)

In [4]:
# 参数设计

# 下采样设置：ERP脑电有效频率<30Hz，根据脑奎斯特定理，
# 采样率到60Hz以上即可。但为了多一些保存细节，这里降采样设置为125Hz
fs_raw = 250
fs_down = 125
# 一个trial的时间
t_of_trial = 4
# 采样点个数
nTime = fs_down * t_of_trial

start_point_index = 2
end_point_index = start_point_index + fs_raw * t_of_trial


def down_sample(data, start, end, fs_raw, fs_down):
    '''
    下采样
    param---
    data: numpy数组
    return---
    data: 下采样之后的数组
    '''
    
    step = fs_raw // fs_down
    data = data[start: end: step]
    
    return data

In [5]:
test = down_sample(np.array(raw_data_dict["nothing1_orginal.npy"][0]), start_point_index, end_point_index, fs_raw, fs_down)
test.shape

(500, 19)

In [6]:
# 选择channel
channel_index_list = np.arange(2, 2+14)

def selected_channel(data, channel_index_list):

    return data[:, channel_index_list]

In [7]:
test = selected_channel(test, channel_index_list)
test.shape

(500, 14)

In [8]:
file_label_dict = {
    "nothing1_orginal.npy": 0, "nothing2_orginal.npy": 0, "nothing3_orginal.npy": 0,
    "left1_orginal.npy": 1, "left2_orginal.npy": 1, "left3_orginal.npy": 1,
    "right1_orginal.npy": 2, "right2_orginal.npy": 2, "right3_orginal.npy": 2,
    "leg1_orginal.npy": 3, "leg2_orginal.npy": 3, "leg3_orginal.npy": 3
}
save_path = os.getcwd() + "/lyh_data/Standard_input"

def pre_preparation(data_dict, file_label_dict, save_path, nClasses=4):
    data = [[] for _ in range(nClasses)]

    for key, value in data_dict.items():
        d = data[file_label_dict[key]]
        for trial in value:
            trial = down_sample(np.array(trial), start_point_index, end_point_index, fs_raw, fs_down)
            trail = selected_channel(trial, channel_index_list)
            d.append(trail.T)
    
    # save
    for i, d in enumerate(data):
        data[i] = np.array(d)
        path = save_path + "/" + str(i) + ".npy"
        np.save(path, data[i])

    return data

In [9]:
standard_input_data_list = pre_preparation(raw_data_dict, file_label_dict, save_path)

In [10]:
standard_input_data_list[0].shape

(300, 14, 500)

## 1. 预处理

In [11]:
# 基线校正
def baseline_correction(data, baseline_start, baseline_end):
    """
    进行基线校正

    Parameters:
    - data: EEG 数据的二维数组，shape=(通道数, 采样点数)
    - baseline_start: 基线起始点的索引
    - baseline_end: 基线结束点的索引

    Returns:
    - baseline_corrected_data: 基线校正后的 EEG 数据
    """
    # 计算每个通道上基线期的平均值
    baseline_values = np.mean(data[:, baseline_start:baseline_end], axis=1, keepdims=True)

    # 进行基线校正
    baseline_corrected_data = data - baseline_values

    return baseline_corrected_data

In [12]:
# 作为基线的比例
baseline_ratio = 0.1
baseline_start = 0
baseline_end = int(0.1 * nTime)

x = deepcopy(standard_input_data_list[0][0])
print("基线校正前 : ", x)
x = baseline_correction(x, baseline_start, baseline_end)
print("基线校正后 : ", x)

基线校正前 :  [[4298.333 4301.538 4307.564 ... 4320.    4332.436 4314.872]
 [4268.462 4269.103 4268.462 ... 4282.821 4301.41  4287.051]
 [4328.462 4324.103 4328.333 ... 4351.795 4355.897 4346.667]
 ...
 [4463.077 4457.821 4469.615 ... 4502.308 4510.128 4505.897]
 [4335.641 4342.179 4357.308 ... 4426.795 4437.692 4419.231]
 [4418.846 4415.769 4421.154 ... 4469.615 4474.231 4469.744]]
基线校正后 :  [[ 11.42274  14.62774  20.65374 ...  33.08974  45.52574  27.96174]
 [ 23.91832  24.55932  23.91832 ...  38.27732  56.86632  42.50732]
 [ 14.37734  10.01834  14.24834 ...  37.71034  41.81234  32.58234]
 ...
 [  0.56158  -4.69442   7.09958 ...  39.79258  47.61258  43.38158]
 [-26.3051  -19.7671   -4.6381  ...  64.8489   75.7459   57.2849 ]
 [-10.63602 -13.71302  -8.32802 ...  40.13298  44.74898  40.26198]]


In [13]:
# 带通滤波器 0.05Hz-40Hz
lowcut = 0.05
highcut = 40
fs = fs_down
order = 2

def butter_bandpass_filter(data, lowcut, highcut, fs, order):
    fa = 0.5 * fs
    low = lowcut / fa
    high = highcut / fa
    b, a = butter(order, [low, high], btype='band')
    ret = []
    for line in data:
        ret.append(filtfilt(b, a, line))
    return np.array(ret)

def iirnotch_filter(data, fs = 125, Q = 30, f_cut = 50.0):
    ret = []
    b, a = iirnotch(f_cut, Q, fs)
    for line in data:
        ret.append(lfilter(b,a, line))
    return np.array(ret)


In [14]:
print("滤波前 : ", x)
x = butter_bandpass_filter(x, lowcut, highcut, fs, order)
x = iirnotch_filter(x)
print("滤波后 : ", x)

滤波前 :  [[ 11.42274  14.62774  20.65374 ...  33.08974  45.52574  27.96174]
 [ 23.91832  24.55932  23.91832 ...  38.27732  56.86632  42.50732]
 [ 14.37734  10.01834  14.24834 ...  37.71034  41.81234  32.58234]
 ...
 [  0.56158  -4.69442   7.09958 ...  39.79258  47.61258  43.38158]
 [-26.3051  -19.7671   -4.6381  ...  64.8489   75.7459   57.2849 ]
 [-10.63602 -13.71302  -8.32802 ...  40.13298  44.74898  40.26198]]
滤波后 :  [[  1.2233674    5.31136275   9.38478621 ...  23.26321155  28.70143919
   16.27871029]
 [  4.62252065   4.89188273   5.60975696 ...   7.82624006  23.85918253
   12.7586109 ]
 [ -1.11384209  -4.53790032  -1.92974007 ...  11.24382281  13.61377979
    5.92651266]
 ...
 [  0.54598765  -3.07385892   6.49437274 ...  -7.09438025  -0.34693384
   -3.45731556]
 [-11.19020058  -4.24634911   8.3426814  ...  37.09434085  42.91027207
   28.24304665]
 [ -4.26287477  -6.92866165  -2.46119829 ...   3.37859591   5.17940678
    2.14527731]]


In [15]:
save_path = os.getcwd() + "/lyh_data/Preprocessed_data"

def pre_processing(data, save_path, butter_order = 2):

    for data_per_class in data:
        for i, d in enumerate(data_per_class):
            assert d.shape == (14, 500)
            # data_per_class[i] = baseline_correction(data_per_class[i], baseline_start, baseline_end)
            data_per_class[i] = butter_bandpass_filter(data_per_class[i], lowcut, highcut, fs, order)
            data_per_class[i] = iirnotch_filter(data_per_class[i])
    
    # save
    for i, d in enumerate(data):
        data[i] = np.array(d)
        path = save_path + "/" + str(i) + ".npy"
        np.save(path, data[i])

In [16]:
pre_processing(standard_input_data_list, save_path)
print("滤波后: ", standard_input_data_list[0][0])

滤波后:  [[  1.2233674    5.31136274   9.3847862  ...  23.26321155  28.70143919
   16.27871029]
 [  4.62252064   4.89188272   5.60975696 ...   7.82624006  23.85918253
   12.7586109 ]
 [ -1.1138421   -4.53790033  -1.92974008 ...  11.24382281  13.61377978
    5.92651266]
 ...
 [  0.54598764  -3.07385893   6.49437274 ...  -7.09438025  -0.34693384
   -3.45731556]
 [-11.19020058  -4.24634911   8.3426814  ...  37.09434085  42.91027207
   28.24304665]
 [ -4.26287477  -6.92866165  -2.46119829 ...   3.37859591   5.17940678
    2.14527731]]


## 2. 特征提取


### 2.1 时域特征提取

In [17]:
len(standard_input_data_list), len(standard_input_data_list[0])

(4, 300)

In [18]:
dataset_np = np.array(standard_input_data_list)
dataset_np = dataset_np.reshape(-1, 14, 500)
dataset_np.shape

(1200, 14, 500)

In [19]:
dataset_list = list(dataset_np)
label_list = [0] * 300 + [1] * 300 + [2] * 300 + [3] * 300
data_df = pd.DataFrame({'raw data': dataset_list, 'label': label_list})
x = deepcopy(data_df.iloc[0])

In [20]:
# 计算过零率

def zero_crossing_rate(trials):
    '''
    计算一个trials(二维列表)各个通道的过零率, 返回一个list
    '''

    def compute(signal):
        '''
        计算一维信号的过零率
        '''
        crossings = np.where(np.diff(np.sign(signal)))[0]
        zero_crossing_rate = len(crossings) / len(signal)
        return zero_crossing_rate

    ret = [compute(trials[i]) for i in range(len(trials))]
    return np.array(ret)

In [21]:
# 同一类别的过零率差别大
np.array(zero_crossing_rate(x['raw data'])),\
     np.array(zero_crossing_rate(data_df.iloc[1]['raw data'])),\
       np.array(zero_crossing_rate(data_df.iloc[2]['raw data'])  )

(array([0.168, 0.04 , 0.13 , 0.102, 0.198, 0.15 , 0.14 , 0.186, 0.126,
        0.074, 0.106, 0.114, 0.062, 0.046]),
 array([0.15 , 0.088, 0.156, 0.084, 0.108, 0.112, 0.174, 0.184, 0.18 ,
        0.136, 0.108, 0.184, 0.08 , 0.104]),
 array([0.138, 0.1  , 0.078, 0.07 , 0.05 , 0.154, 0.184, 0.09 , 0.172,
        0.094, 0.12 , 0.124, 0.062, 0.112]))

In [22]:
# 同一类别的过零率差别大
np.array(zero_crossing_rate(data_df.iloc[398]['raw data'])),\
     np.array(zero_crossing_rate(data_df.iloc[399]['raw data'])),\
       np.array(zero_crossing_rate(data_df.iloc[400]['raw data'])  )

(array([0.056, 0.084, 0.112, 0.106, 0.05 , 0.066, 0.062, 0.17 , 0.136,
        0.076, 0.128, 0.138, 0.1  , 0.084]),
 array([0.106, 0.056, 0.098, 0.054, 0.028, 0.04 , 0.06 , 0.104, 0.076,
        0.064, 0.032, 0.068, 0.046, 0.076]),
 array([0.012, 0.008, 0.016, 0.054, 0.026, 0.014, 0.018, 0.058, 0.05 ,
        0.042, 0.054, 0.032, 0.034, 0.048]))

In [23]:
# 计算均值

def calculate_channel_means(eeg_data):
    """
    计算每个通道的均值。

    参数：
    - eeg_data: 二维 NumPy 数组，表示 EEG 数据，形状为 (n_channels, n_points)。

    返回：
    - channel_means: 一维 NumPy 数组，包含每个通道的均值。
    """
    # 计算每个通道的均值，axis=2 表示在样本点上求均值
    channel_means = np.mean(eeg_data, axis=1)

    return channel_means


In [24]:
# 同一类别的trials均值差别怎么这么大？
calculate_channel_means(data_df.iloc[0]['raw data']), calculate_channel_means(data_df.iloc[1]['raw data'])

(array([  5.79579612,  -6.37368547,  -1.11140977, -10.55618282,
          2.01619029,  -1.79062357,  -1.40401937,   0.80076936,
          3.03901338,   4.12667989,  11.40424161,  -0.07762688,
         26.94610349,  -1.1365759 ]),
 array([ -7.05026265, -18.51313891,  -5.52623322,  -6.18110454,
          5.75653787,  10.04083053,  -0.65157368,   0.92565608,
         -1.22593159,  -4.27781215,   6.5364136 ,   3.26128473,
          6.60940731,   5.42792899]))

In [25]:
# 计算标准差

def calculate_channel_stds(eeg_data):
    """
    计算每个通道的均值。

    参数：
    - eeg_data: 二维 NumPy 数组，表示 EEG 数据，形状为 (n_channels, n_points)。

    返回：
    - channel_means: 一维 NumPy 数组，包含每个通道的均值。
    """
    # 计算每个通道的均值，axis=2 表示在样本点上求均值
    channel_stds = np.std(eeg_data, axis=1)

    return channel_stds

In [26]:
def calculate_first_order_diff(eeg_data):

    first_order_diff = np.sum(np.abs(np.diff(eeg_data, axis=1)), axis=1) / (eeg_data.shape[1] - 1)

    return first_order_diff

def calculate_second_order_diff(eeg_data):
    
    second_order_diff = np.sum(np.abs(np.diff(eeg_data, n=2, axis=1)), axis=1) / (eeg_data.shape[1] - 2)

    return second_order_diff


In [27]:
def time_domain_feature(eeg_dataset):
    '''
    提取时域特征
    params:
    - eeg_dataset: Dataframe类型
    '''
    zero_crossing_list = []
    means_list = []
    stds_list = []
    first_order_diff_list = []
    second_order_diff_list = []

    for _, row in eeg_dataset.iterrows():
        zero_crossing = zero_crossing_rate(row['raw data'])
        zero_crossing_list.append(zero_crossing)
        means = calculate_channel_means(row['raw data'])
        means_list.append(means)
        stds = calculate_channel_stds(row['raw data'])
        stds_list.append(stds)
        first_diff = calculate_first_order_diff(row['raw data'])
        first_order_diff_list.append(first_diff)
        second_diff = calculate_second_order_diff(row['raw data'])
        second_order_diff_list.append(second_diff)
    
    nChannels = zero_crossing_list[0].shape[0]
    for i in range(nChannels):
        eeg_dataset[f'zero_crossing_rate_c{i+1}'] = np.array(zero_crossing_list)[:,i]
        eeg_dataset[f'mean_c{i+1}'] = np.array(means_list)[:,i]
        eeg_dataset[f'std_c{i+1}'] = np.array(stds_list)[:,i]
        eeg_dataset[f'first_order_diff_c{i+1}'] = np.array(first_order_diff_list)[:,i]
        eeg_dataset[f'second_order_diff_c{i+1}'] = np.array(second_order_diff_list)[:,i]
        

In [28]:
time_domain_feature(data_df)

In [29]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 72 columns):
raw data                  1200 non-null object
label                     1200 non-null int64
zero_crossing_rate_c1     1200 non-null float64
mean_c1                   1200 non-null float64
std_c1                    1200 non-null float64
first_order_diff_c1       1200 non-null float64
second_order_diff_c1      1200 non-null float64
zero_crossing_rate_c2     1200 non-null float64
mean_c2                   1200 non-null float64
std_c2                    1200 non-null float64
first_order_diff_c2       1200 non-null float64
second_order_diff_c2      1200 non-null float64
zero_crossing_rate_c3     1200 non-null float64
mean_c3                   1200 non-null float64
std_c3                    1200 non-null float64
first_order_diff_c3       1200 non-null float64
second_order_diff_c3      1200 non-null float64
zero_crossing_rate_c4     1200 non-null float64
mean_c4                   1200 n

### 2.2 频域特征提取

In [30]:
from scipy.signal import welch

def five_band_energy(eeg_data, fs=125):
    
    energy = []
    
    for eeg_signal in eeg_data:
        # 计算功率谱密度（PSD）
        frequencies, psd = welch(eeg_signal, fs, nperseg=1024)

        # 定义频带边界
        delta_band = (0.5, 4)
        theta_band = (4, 8)
        alpha_band = (8, 14)
        beta_band = (14, 30)
        gamma_band = (30, 60)

        # 计算每个频带内的能量
        energy.append(np.trapz(psd[(frequencies >= delta_band[0]) & (frequencies <= delta_band[1])], \
            frequencies[(frequencies >= delta_band[0]) & (frequencies <= delta_band[1])]))
        energy.append(np.trapz(psd[(frequencies >= theta_band[0]) & (frequencies <= theta_band[1])], \
            frequencies[(frequencies >= theta_band[0]) & (frequencies <= theta_band[1])]))
        energy.append(np.trapz(psd[(frequencies >= alpha_band[0]) & (frequencies <= alpha_band[1])], \
            frequencies[(frequencies >= alpha_band[0]) & (frequencies <= alpha_band[1])]))
        energy.append(np.trapz(psd[(frequencies >= beta_band[0]) & (frequencies <= beta_band[1])], \
            frequencies[(frequencies >= beta_band[0]) & (frequencies <= beta_band[1])]))
        energy.append(np.trapz(psd[(frequencies >= gamma_band[0]) & (frequencies <= gamma_band[1])], \
            frequencies[(frequencies >= gamma_band[0]) & (frequencies <= gamma_band[1])]))

    return np.array(energy)


In [33]:
def freq_domain_feature(eeg_dataset):
    '''
    提取时域特征

    参数:
    - eeg_dataset: Dataframe类型
    '''
    
    energy = []

    for _, row in eeg_dataset.iterrows():
        
        energy.append(five_band_energy(row['raw data']))

    freq_names = ['delta', 'theta', 'alpha', 'beta', 'gamma']
    nFreqs = len(freq_names)
    nChannels = energy[0].shape[0] // nFreqs
    
    for i in range(nChannels):
        for j in range(nFreqs):
            eeg_dataset[f'{freq_names[j]}_c{i+1}'] = np.array(energy)[:,i*nFreqs+j]

In [34]:
freq_domain_feature(data_df)

In [35]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Columns: 142 entries, raw data to gamma_c14
dtypes: float64(140), int64(1), object(1)
memory usage: 1.3+ MB


In [63]:
save_path = os.getcwd() + "/lyh_data/Feature_extracted/lyh_data.csv"
data_df.to_csv(save_path, index=False)

## 3. 分类模型

In [59]:
# 构造数据集
from sklearn.model_selection import train_test_split

# 打乱数据集
shuffled_df = data_df.sample(frac=1, random_state=42)

# 划分训练集和测试集
train_df, test_df = train_test_split(shuffled_df, test_size=0.2, random_state=42)
y_train = train_df['label']
X_train = train_df.drop(columns=['label', 'raw data'])
y_test = test_df['label']
X_test = test_df.drop(columns=['label', 'raw data'])

In [60]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((960, 140), (960,), (240, 140), (240,))

In [38]:
from lightgbm import LGBMClassifier

lgm = LGBMClassifier()
lgm.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [39]:
# 手工特征提取+未调参LightGBM
y_train_pred = lgm.predict(X_train)
acc_train = (y_train == y_train_pred).sum() / len(y_train)

y_test_pred = lgm.predict(X_test)
acc_test = (y_test == y_test_pred).sum() / len(y_test)
acc_train, acc_test

(1.0, 0.9125)

In [58]:
# 原始数据进行训练

shuffled_df = data_df.sample(frac=1, random_state=42)
train_df, test_df = train_test_split(shuffled_df, test_size=0.2, random_state=42)

X_train = np.array(train_df['raw data'])
X_test = np.array(test_df['raw data'])
X_train = np.vstack(X_train).reshape(-1, 14, 500)
X_test = np.vstack(X_test).reshape(-1, 14, 500)
X_train = X_train.reshape(-1, 14 * 500)
X_test = X_test.reshape(-1, 14 * 500)
y_train = np.array(train_df['label'])
y_test = np.array(test_df['label'])

lgm = LGBMClassifier()
lgm.fit(X_train, y_train)

# 原始特征+未调参LightGBM
y_train_pred = lgm.predict(X_train)
acc_train = (y_train == y_train_pred).sum() / len(y_train)

y_test_pred = lgm.predict(X_test)
acc_test = (y_test == y_test_pred).sum() / len(y_test)
acc_train, acc_test

(1.0, 0.4875)

In [61]:
from xgboost import XGBClassifier
# 原始数据进行训练

shuffled_df = data_df.sample(frac=1, random_state=42)
train_df, test_df = train_test_split(shuffled_df, test_size=0.2, random_state=42)

X_train = np.array(train_df['raw data'])
X_test = np.array(test_df['raw data'])
X_train = np.vstack(X_train).reshape(-1, 14, 500)
X_test = np.vstack(X_test).reshape(-1, 14, 500)
X_train = X_train.reshape(-1, 14 * 500)
X_test = X_test.reshape(-1, 14 * 500)
y_train = np.array(train_df['label'])
y_test = np.array(test_df['label'])

xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# 原始特征+未调参xgboost
y_train_pred = xgb.predict(X_train)
acc_train = (y_train == y_train_pred).sum() / len(y_train)

y_test_pred = xgb.predict(X_test)
acc_test = (y_test == y_test_pred).sum() / len(y_test)
acc_train, acc_test

(1.0, 0.4625)

In [67]:
# 构造数据集
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 打乱数据集
shuffled_df = data_df.sample(frac=1, random_state=42)

# 划分训练集和测试集
train_df, test_df = train_test_split(shuffled_df, test_size=0.2, random_state=42)
y_train = train_df['label']
X_train = train_df.drop(columns=['label', 'raw data'])
y_test = test_df['label']
X_test = test_df.drop(columns=['label', 'raw data'])

xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# 手工特征提取+未调参XGBoost
y_train_pred = xgb.predict(X_train)
# acc_train = (y_train == y_train_pred).sum() / len(y_train)
acc_train = accuracy_score(y_train, y_train_pred)

y_test_pred = xgb.predict(X_test)
# acc_test = (y_test == y_test_pred).sum() / len(y_test)
acc_test = accuracy_score(y_test, y_test_pred)

acc_train, acc_test

(1.0, 0.925)

## 4. 调参

In [76]:
default_params = xgb.get_params()
# 打印参数
print("Default Parameters:")
for param, value in default_params.items():
    print(f"{param}: {value}")

Default Parameters:
objective: multi:softprob
use_label_encoder: False
base_score: 0.5
booster: gbtree
callbacks: None
colsample_bylevel: 1
colsample_bynode: 1
colsample_bytree: 1
early_stopping_rounds: None
enable_categorical: False
eval_metric: None
gamma: 0
gpu_id: -1
grow_policy: depthwise
importance_type: None
interaction_constraints: 
learning_rate: 0.300000012
max_bin: 256
max_cat_to_onehot: 4
max_delta_step: 0
max_depth: 6
max_leaves: 0
min_child_weight: 1
missing: nan
monotone_constraints: ()
n_estimators: 100
n_jobs: 0
num_parallel_tree: 1
predictor: auto
random_state: 0
reg_alpha: 0
reg_lambda: 1
sampling_method: uniform
scale_pos_weight: None
subsample: 1
tree_method: exact
validate_parameters: 1
verbosity: None


In [98]:
from sklearn.model_selection import GridSearchCV

# 1. 确定learning_rate和n_estimators
param_test1 = {
 'learning_rate': [0.2, 0.25 ,0.300000012, 0.35],
 'n_estimators': [90, 95, 100, 105]
}

gsearch1 = GridSearchCV(estimator = XGBClassifier(), \
    param_grid = param_test1, scoring='accuracy', cv=3)
gsearch1.fit(pd.concat([X_train, X_test],axis=0), pd.concat([y_train,y_test],axis=0))
print(f"best params: {gsearch1.best_params_}\nbest score:  {gsearch1.best_score_}") 

best params: {'learning_rate': 0.35, 'n_estimators': 90}
best score:  0.8991666666666667


In [103]:
params = {
    'learning_rate': 0.35,
    'n_estimators': 90
}

model1 = XGBClassifier(**params)
model1.fit(X_train, y_train)

y_train_pred = model1.predict(X_train)
acc_train = (y_train == y_train_pred).sum() / len(y_train)

y_test_pred = model1.predict(X_test)
acc_test = (y_test == y_test_pred).sum() / len(y_test)

acc_train, acc_test

(1.0, 0.9083333333333333)

In [104]:
from sklearn.model_selection import GridSearchCV

# 2. 确定max_depth和min_child_weight
params = {
    'learning_rate': 0.35,
    'n_estimators': 90
}

param_test2 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}

gsearch2 = GridSearchCV(estimator = XGBClassifier(**param), \
    param_grid = param_test2, scoring='accuracy', cv=3)
gsearch2.fit(pd.concat([X_train, X_test],axis=0), pd.concat([y_train,y_test],axis=0))
print(f"best params: {gsearch2.best_params_}\nbest score:  {gsearch2.best_score_}") 

best params: {'max_depth': 9, 'min_child_weight': 5}
best score:  0.9033333333333333


In [105]:
params = {
    'learning_rate': 0.35,
    'n_estimators': 90,
    'max_depth': 9, 
    'min_child_weight': 5
}

model2 = XGBClassifier(**params)
model2.fit(X_train, y_train)

y_train_pred = model2.predict(X_train)
acc_train = (y_train == y_train_pred).sum() / len(y_train)

y_test_pred = model2.predict(X_test)
acc_test = (y_test == y_test_pred).sum() / len(y_test)

acc_train, acc_test

(1.0, 0.925)

In [110]:
from sklearn.model_selection import GridSearchCV

# 3. 确定gamma
params = {
    'learning_rate': 0.35,
    'n_estimators': 90,
    'max_depth': 9, 
    'min_child_weight': 5
}

param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}

gsearch3 = GridSearchCV(estimator = XGBClassifier(**param), \
    param_grid = param_test3, scoring='accuracy', cv=3)
gsearch3.fit(pd.concat([X_train, X_test],axis=0), pd.concat([y_train,y_test],axis=0))
print(f"best params: {gsearch3.best_params_}\nbest score:  {gsearch3.best_score_}") 

best params: {'gamma': 0.0}
best score:  0.8916666666666667


In [113]:
from sklearn.model_selection import GridSearchCV

# 4. 确定reg_alpha
param = {
    'learning_rate': 0.3,
    'n_estimators': 100,
    'max_depth': 6,
    'min_child_weight': 1,
    'gamma': 0.0
}

param_test4 = {
 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
}

gsearch4 = GridSearchCV(estimator = XGBClassifier(**param), \
    param_grid = param_test4, scoring='accuracy', cv=3)
gsearch4.fit(pd.concat([X_train, X_test],axis=0), pd.concat([y_train,y_test],axis=0))
print(f"best params: {gsearch4.best_params_}\nbest score:  {gsearch4.best_score_}") 

best params: {'reg_alpha': 0.01}
best score:  0.8966666666666666


In [117]:
params = {
    'learning_rate': 0.3,
    'n_estimators': 100,
    'max_depth': 9, 
    'min_child_weight': 5,
    'reg_alpha': 0.01
}

model4 = XGBClassifier(**params)
model4.fit(X_train, y_train)

y_train_pred = model4.predict(X_train)
acc_train = (y_train == y_train_pred).sum() / len(y_train)

y_test_pred = model4.predict(X_test)
acc_test = (y_test == y_test_pred).sum() / len(y_test)

acc_train, acc_test

(1.0, 0.925)