In [38]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import config
import os


## Load Files

In [39]:
train_filename = os.path.join(config.base_path, "data","train.xlsx")
test_filename = os.path.join(config.base_path, "data","测试A.xlsx")
train_raw = pd.read_excel(train_filename) #训练数据
test_raw = pd.read_excel(test_filename) #测试数据
train_data = train_raw.iloc[:,1:] 
train_data.index = train_raw.iloc[:,0].values
test_data = test_raw.iloc[:,1:]
test_data.index = test_raw.iloc[:,0].values

print("---number of train sample---")
print(train_data.shape[0])
print("---number of test sample---")
print(test_data.shape[0])
print("---number of features---")
print(test_data.shape[1])

---number of train sample---
486
---number of test sample---
100
---number of features---
8027


## fillna with mean

In [40]:
mean_col = train_data.mean()
train_data_fillna = train_data.fillna(mean_col)
mean_col = test_data.mean()
test_data_fillna = test_data.fillna(mean_col)

## onehot编码，删除一些重复的特征和时间戳

In [41]:
selected_features = [] #保留的特征
dropped_features = [] #删除的特征
for col in test_data_fillna.columns:
    # 编码
    x = np.hstack((train_data_fillna[col].values, test_data_fillna[col].values))
    x_set_list = list(set(x))
    # if col == '210X24':
    #     print(x_set_list[0] / 10000000000)
    #     exit()
    if isinstance(x_set_list[0], str):
        encode_dict = {}
        for i in range(len(x_set_list)):
            encode_dict[x_set_list[i]] = i
        c_train = train_data_fillna[col]
        for i in range(len(c_train)):
            c_train.iat[i] = encode_dict[c_train.iat[i]]
        c_test = test_data_fillna[col]
        for i in range(len(c_test)):
            c_test.iat[i] = encode_dict[c_test.iat[i]]
    
    #去重和去时间戳
    x = train_data_fillna[col].values
    x_set_list = list(set(x))

    if len(x_set_list) >= 2 and not all([True if str(n) == "nan" else False for n in x]):
        # if col == "210X24":
        #     print(x_set_list[:20])
        #     print(str(x_set_list[0]))
        #     print([str(e).startswith("2017") or str(e).startswith("2016") for e in x_set_list[:20]])
        if col == "520X171":
            dropped_features.append(col)
        elif not all([str(e).startswith("2017") or str(e).startswith("2016") for e in x_set_list[:20]]):
            selected_features.append(col)
        else:
            dropped_features.append(col)
    else:
        dropped_features.append(col)
df_train = train_data_fillna.loc[:, selected_features + ['Y']]
df_test = test_data_fillna.loc[:, selected_features]

print("---number of features---")
print(len(selected_features))

---number of features---
6845


## 按照相关系数筛选

In [42]:
corr_values = []
k = 3000
for col in df_test.columns:
    corr_values.append(abs(pearsonr(df_train[col].values,df_train['Y'])[0]))
corr_df = pd.DataFrame({'col':df_test.columns, 'corr_value':corr_values})
corr_df = corr_df.sort_values(by='corr_value',ascending=False)
selected = corr_df['col'].values[:k]

df_train_corr = df_train.loc[:, list(selected) + ['Y']]
df_test_corr = df_test.loc[:, list(selected)]

## 按照树模型筛选

In [43]:
k = 100
X_train = df_train.values[:, 0:-1]
Y_train = df_train.values[:,-1]
X_test = df_test.values[:,:]
# x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, random_state = 1024, test_size=0.1)

# reg = GradientBoostingRegressor(random_state = 0,
#                                 learning_rate = 0.1,
#                                 n_estimators= 29,
#                                 min_samples_split=2,
#                                 min_samples_leaf=5,
#                                 max_features=0.81,
#                                 subsample= 0.8,
#                                 max_depth= 6,
# )
reg = XGBRegressor(random_state = 0,
                    learning_rate = 0.1,
                    n_estimators= 49,
                    subsample= 0.78,
                    colsample_bytree= 0.62,
                    max_depth= 3,
)
reg.fit(X_train, Y_train)
sorted_imp = sorted(reg.feature_importances_, reverse=True)
k = sorted_imp.index(0) - 1
print("---Xgboost feature importance: %dth---" %k)
print(sorted_imp[k])
importance_df = pd.DataFrame({'col':df_test.columns, 'importance':reg.feature_importances_})
importance_df = importance_df.sort_values(by="importance", ascending=False)
selected = importance_df['col'].values[:k + 1]
df_train_tree = df_train.loc[:, list(selected) + ['Y']]
df_test_tree = df_test.loc[:, list(selected)]

---Xgboost feature importance: 232th---
0.00369004


In [44]:
after_file = os.path.join(config.base_path, "data", "tree_feature_selected_xgboost_train_A.xlsx")
with pd.ExcelWriter(after_file) as writer:
    df_train_tree.to_excel(writer,sheet_name = "train_data")
    df_test_tree.to_excel(writer, sheet_name = "test_data")