In [1]:
import numpy as np
import pandas as pd

In [2]:
path = '/Users/xuweikang/Desktop/data/kaggle/Santander Value Prediction Challenge/'

In [3]:
train = pd.read_csv(path + 'train.csv')
test  = pd.read_csv(path + 'test.csv')

### 分解数据集

In [4]:
y_train = train['target']
train.drop(['ID','target'], axis=1, inplace=True)

### 找到并删除变量唯一的特征

In [7]:
cols_with_onlyone_val = train.columns[train.nunique() == 1]

In [8]:
cols_with_onlyone_val

Index(['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee',
       '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca',
       ...
       'd196ca1fd', 'a8e562e8e', 'eb6bb7ce1', '5beff147e', '52b347cdc',
       '4600aadcf', '6fa0b9dab', '43d70cc4d', '408021ef8', 'e29d22b59'],
      dtype='object', length=256)

In [9]:
train.drop(cols_with_onlyone_val, axis=1, inplace=True)
test.drop(cols_with_onlyone_val, axis=1, inplace=True)

### 改变精度

In [10]:
NUM_OF_DECIMALS = 32
train = train.round(NUM_OF_DECIMALS)
test = test.round(NUM_OF_DECIMALS)

### 遍历所有的特征，如果两个特征取值完全一样，就删除

In [11]:
colsToRmove = []
for i in range(len(train.columns) -1 ):
    v = train[train.columns[i]].values
    for j in range(i+1, len(train.columns)):
        if np.array_equal(v, train[train.columns[j]].values):
            colsToRmove.append(train.columns[i])

In [12]:
train.shape

(4459, 4735)

In [13]:
len(colsToRmove)

6

In [14]:
colsToRmove

['34ceb0081', '8d57e2749', '8d57e2749', '168b3e5bc', 'a765da8bc', 'acc5b709d']

In [15]:
colsToRemove = []
columns = train.columns
for i in range(len(columns)-1):
    v = train[columns[i]].values
    dupCols = []
    for j in range(i + 1,len(columns)):
        if np.array_equal(v, train[columns[j]].values):
            colsToRemove.append(columns[j])

In [16]:
colsToRemove

['d60ddde1b', 'acc5b709d', 'f333a5f60', 'f8d75792f', '912836770', 'f333a5f60']

In [17]:
colsToRemove = []
columns = train.columns
for i in range(len(columns)-1):
    v = train[columns[i]].values
    dupCols = []
    for j in range(i + 1,len(columns)):
        if np.array_equal(v, train[columns[j]].values):
            colsToRemove.append(columns[j])

In [18]:
colsToRemove

['d60ddde1b', 'acc5b709d', 'f333a5f60', 'f8d75792f', '912836770', 'f333a5f60']

In [19]:
train.drop(colsToRemove, axis=1, inplace=True) 
test.drop(colsToRemove, axis=1, inplace=True)

In [20]:
train.shape

(4459, 4730)

### 用随机森林选特征

In [21]:
from sklearn import model_selection
from sklearn import ensemble
NUM_OF_FEATURES = 1000
def rmsle(y, pred):
    return np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(pred), 2)))

x1, x2, y1, y2 = model_selection.train_test_split(
    train, y_train.values, test_size=0.20, random_state=5)
model = ensemble.RandomForestRegressor(n_jobs=-1, random_state=7)
model.fit(x1, y1)
print(rmsle(y2, model.predict(x2)))

col = pd.DataFrame({'importance': model.feature_importances_, 'feature': train.columns}).sort_values(
    by=['importance'], ascending=[False])[:NUM_OF_FEATURES]['feature'].values
train = train[col]
test = test[col]
train.shape

1.7698027896992665


(4459, 1000)

### 用KS检验删除列

In [22]:
from scipy.stats import ks_2samp
THRESHOLD_P_VALUE = 0.01 #need tuned
THRESHOLD_STATISTIC = 0.2 #need tuned
diff_cols = []
for col in train.columns:
    statistic, pvalue = ks_2samp(train[col].values, test[col].values)
    if pvalue <= THRESHOLD_P_VALUE and np.abs(statistic) > THRESHOLD_STATISTIC:
        diff_cols.append(col)
for col in diff_cols:
    if col in train.columns:
        train.drop(col, axis=1, inplace=True)
        test.drop(col, axis=1, inplace=True)
train.shape

(4459, 982)

In [26]:
weight = ((train != 0).sum()/len(train))

In [27]:
weight

f190486d6    0.346266
eeb9cd3aa    0.339089
58e2e02e6    0.339314
58232a6fb    0.333034
6d2ece683    0.079166
429687d5a    0.080511
134ac90df    0.073783
15ace8c9f    0.333034
4edc3388d    0.029379
6eef030c1    0.326306
e13b0c0aa    0.130523
324921c7b    0.337295
2e103d632    0.165508
9c42bff81    0.076475
cd24eae8a    0.158107
4ecc3f505    0.075577
f32763afc    0.021978
bf6e38e39    0.025566
13ee58af1    0.169993
4a39584e5    0.045077
1702b5bf0    0.336398
899dbe405    0.070195
cd7f0affd    0.128504
09184c121    0.052702
f14b57b8f    0.079166
57dd44c29    0.156762
d79736965    0.080287
b43a7cfd5    0.322494
9fd594eec    0.340884
ca25aad9f    0.018838
               ...   
1ea08665c    0.007625
d3022e2f1    0.019960
eb7981dd4    0.026015
ea4046b8d    0.175824
ce8ce671e    0.128280
940151347    0.006279
215ffb087    0.021529
6e29e9500    0.071765
c6850e7db    0.084548
26c68cede    0.084997
22c7b00ef    0.120655
17a6e2978    0.005831
53102b93f    0.122225
2ef57c650    0.063691
8f76eb6e5 

In [28]:
temp_train = train[train!=0]

In [29]:
temp_train.mean(axis=1)

0       7.376393e+06
1       6.588050e+06
2       4.981818e+06
3       1.426205e+06
4       6.899727e+06
5       9.322040e+06
6       1.957776e+06
7       2.182011e+07
8       1.610358e+06
9       2.366693e+06
10      3.503614e+07
11      2.417660e+06
12      7.101269e+05
13      9.922492e+06
14      1.283000e+06
15      4.815225e+06
16      7.638360e+05
17      5.145680e+06
18      5.064715e+06
19      2.558179e+06
20      3.465757e+06
21      5.751370e+06
22      6.888135e+06
23      1.421114e+07
24      2.076867e+06
25      9.864217e+05
26      9.390385e+06
27      6.308819e+06
28      4.574632e+06
29      9.080706e+06
            ...     
4429    3.094561e+06
4430    2.292353e+07
4431    4.656670e+06
4432    1.828571e+07
4433    5.807678e+06
4434    7.257533e+05
4435    8.832632e+06
4436    4.822671e+06
4437    4.782857e+06
4438    5.563244e+06
4439    4.690648e+06
4440    5.438905e+06
4441    3.286364e+06
4442    1.030738e+07
4443    4.355818e+06
4444    1.413111e+06
4445    2.716

In [30]:
temp_train.shape

(4459, 982)