In [37]:
import pandas as pd
import numpy as np
from scipy.stats import skew

import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn import metrics
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
import lightgbm as lgb

from sklearn.model_selection import KFold

from sklearn.metrics import r2_score

from sklearn.model_selection import LeaveOneOut

In [27]:
train_df = pd.read_csv('../data/original/train.csv', index_col=0)
test_df = pd.read_csv('../data/original/test.csv', index_col=0)

In [28]:

y_train_df = pd.DataFrame(train_df['y'])

In [3]:
# smooth = 5
# y_train_df = pd.DataFrame(train_df['y'])
# y = train_df['y']
# y = np.log(y + smooth)
# y_mean = np.mean(y)
# y = y-y_mean
# y_train_df["y"] = y

In [4]:
train_df.drop(['y'], axis=1, inplace=True)

In [5]:
def munge(df):
    all_df = pd.DataFrame(df.values, index=df.index, columns=df.columns, copy=True)
    all_df.drop(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], axis=1, inplace=True)
    
    
    #删除取值相同的特征
    all_df.drop(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290' ,'X293' ,'X297', 'X330' ,'X347'], axis=1, inplace=True)
    
    #构造新特征
    all_df['parts'] = all_df.sum(axis=1)
    return all_df

In [6]:
munged_train_df = munge(train_df)
munged_test_df = munge(test_df)

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit(munged_train_df['parts'])

scaled = scaler.transform(munged_train_df['parts'])
munged_train_df['parts'] = scaled

scaled = scaler.transform(munged_test_df['parts'])
munged_test_df['parts'] = scaled



In [8]:
# Convert categorical features using one-hot encoding.
def onehot(onehot_df, df, column_name, fill_na):
    onehot_df[column_name] = df[column_name]
    if fill_na is not None:
        onehot_df[column_name].fillna(fill_na, inplace=True)

    dummies = pd.get_dummies(onehot_df[column_name], prefix = column_name)
    
    onehot_df = onehot_df.join(dummies)
    onehot_df = onehot_df.drop([column_name], axis=1)
    return onehot_df

def munge_onehot(df):
    onehot_df = pd.DataFrame(index = df.index)

    onehot_df = onehot(onehot_df, df, "X0", None)
    onehot_df = onehot(onehot_df, df, "X1", None)
    onehot_df = onehot(onehot_df, df, "X2", None)
    onehot_df = onehot(onehot_df, df, "X3", None)
    onehot_df = onehot(onehot_df, df, "X4", None)
    onehot_df = onehot(onehot_df, df, "X5", None)
    onehot_df = onehot(onehot_df, df, "X6", None)
    onehot_df = onehot(onehot_df, df, "X8", None)
    
    return onehot_df

In [9]:
onehot_df = munge_onehot(train_df)
munged_train_df = munged_train_df.join(onehot_df)

In [10]:
onehot_df = munge_onehot(test_df)
munged_test_df = munged_test_df.join(onehot_df)

In [11]:
set(munged_test_df) - set(munged_train_df)

{'X0_ae',
 'X0_ag',
 'X0_an',
 'X0_av',
 'X0_bb',
 'X0_p',
 'X2_ab',
 'X2_ad',
 'X2_aj',
 'X2_ax',
 'X2_u',
 'X2_w',
 'X5_a',
 'X5_b',
 'X5_t',
 'X5_z'}

In [12]:
#删除test中有的  而train中没有的
munged_test_df.drop(['X0_ae', 'X0_ag', 'X0_an', 'X0_av', 'X0_bb', 'X0_p',
                     'X2_ab', 'X2_ad', 'X2_aj', 'X2_ax', 'X2_u', 'X2_w', 'X5_a', 'X5_b', 'X5_t', 'X5_z'], axis=1, inplace=True)

In [13]:
set(munged_train_df) - set(munged_test_df)

{'X0_aa',
 'X0_ab',
 'X0_ac',
 'X0_q',
 'X2_aa',
 'X2_ar',
 'X2_c',
 'X2_l',
 'X2_o',
 'X5_u'}

In [14]:
#删除train中有的  而test中没有的
munged_train_df.drop(['X0_aa', 'X0_ab', 'X0_ac', 'X0_q', 'X2_aa', 'X2_ar', 'X2_c', 'X2_l', 'X2_o', 'X5_u'], axis=1, inplace=True)

In [15]:
#删除一些占比非常不平衡的特征

In [16]:
munged_train_df['X10'].value_counts()[0]

4153

In [17]:
s = munged_train_df.shape[0]
drop_names = []
for c in munged_train_df.drop(['parts'], axis=1).columns:
    a = munged_train_df[c].value_counts()[0] / s
    b = munged_train_df[c].value_counts()[1] / s
    if (a < 0.05 or b < 0.05):
        drop_names.append(c)

In [18]:
len(drop_names)

373

In [19]:
munged_train_df.drop(drop_names, axis=1, inplace=True)
munged_test_df.drop(drop_names, axis=1, inplace=True)

In [20]:
from sklearn.decomposition import PCA, FastICA

In [29]:
def KFoldCV(model, num_fold, X_train, y_train):
    cv_results = []
    kf = KFold(n_splits=num_fold, random_state=27)
    for train_index, test_index in kf.split(X_train):
        X_sub_train = X_train[train_index]
        X_sub_test = X_train[test_index]
        y_sub_train = y_train[train_index]
        y_sub_test = y_train[test_index]
        model.fit(X_sub_train, y_sub_train)
        y_pred = model.predict(X_sub_test)
#         score = r2_score(np.exp(y_sub_test + y_mean) - smooth, np.exp(y_pred + y_mean) - smooth)
        score = r2_score(y_sub_test, y_pred)
        cv_results.append(score)
    print('mean:%f std:%f'%(np.mean(cv_results), np.std(cv_results)))
    return np.mean(cv_results), np.std(cv_results)

In [38]:
def LOOCV(model, X_train, y_train):
    cv_results = []
    loo = LeaveOneOut()
    for train_index, test_index in  loo.split(X_train):
        X_sub_train = X_train[train_index]
        X_sub_test = X_train[test_index]
        y_sub_train = y_train[train_index]
        y_sub_test = y_train[test_index]
        model.fit(X_sub_train, y_sub_train)
        y_pred = model.predict(X_sub_test)
#         score = r2_score(np.exp(y_sub_test + y_mean) - smooth, np.exp(y_pred + y_mean) - smooth)
        score = r2_score(y_sub_test, y_pred)
        cv_results.append(score)
    print('mean:%f std:%f'%(np.mean(cv_results), np.std(cv_results)))
    return np.mean(cv_results), np.std(cv_results)

In [48]:
y_train_df[0:3]['y'].values

array([ 130.81,   88.53,   76.26])

In [72]:
X_train, X_test, y_train, y_test = train_test_split(train_df[0:4].values, y_train_df[0:4]['y'].values, test_size=0.5, random_state=1729)
X_all_train = munged_train_df.values
y_all_train = y_train_df['y'].values

estimator = lgb.LGBMRegressor(max_depth=4, num_leaves=9, learning_rate=0.005, n_estimators=700, subsample=0.9, seed=1729)

loo = LeaveOneOut()
cv_results = []
y_tests = []
y_preds = []
i = 0
for train_index, test_index in  loo.split(X_all_train):
    X_sub_train = X_all_train[train_index]
    X_sub_test = X_all_train[test_index]
    y_sub_train = y_all_train[train_index]
    y_sub_test = y_all_train[test_index]

    estimator.fit(X_sub_train, y_sub_train)
    y_pred = estimator.predict(X_sub_test)
    
    y_tests.extend(y_sub_test)
    y_preds.extend(y_pred)
    
    i = i + 1;
    if i % 100 == 0:
        print(i)

score = r2_score(y_tests, y_preds)
print(score)
print('\n')
print('mean:%f std:%f'%(np.mean(cv_results), np.std(cv_results)))

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
[130.81, 88.530000000000001, 76.260000000000005, 80.620000000000005, 78.019999999999996, 92.930000000000007, 128.75999999999999, 91.909999999999997, 108.67, 126.98999999999999, 102.09, 98.120000000000005, 82.620000000000005, 94.120000000000005, 99.150000000000006, 93.640000000000001, 106.09999999999999, 114.13, 89.810000000000002, 90.810000000000002, 90.560000000000002, 94.569999999999993, 108.14, 120.77, 84.840000000000003, 93.590000000000003, 104.06999999999999, 89.370000000000005, 90.079999999999998, 128.19, 76.010000000000005, 107.86, 106.87, 104.84999999999999, 114.78, 91.560000000000002, 98.159999999999997, 117.31, 79.0, 110.76000000000001, 101.44, 91.980000000000004, 98.079999999999998, 139.19999999999999, 85.439999999999998, 110.52, 91.439999999999998, 91.519999999999996, 91.59000

  ret = ret.dtype.type(ret / rcount)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


In [None]:
 r2_score(y_sub_test, y_pred)

In [30]:
def try_comp(n_pca_comp, n_ica_comp, n_grp_comp, n_srp_comp):
    pca = PCA(n_components=n_pca_comp, random_state=42)
    pca2_results_train = pca.fit_transform(munged_train_df)
    pca2_results_test = pca.transform(munged_test_df)

    #ICA
    ica = FastICA(n_components=n_ica_comp, random_state=42)
    ica2_results_train = ica.fit_transform(munged_train_df)
    ica2_results_test = ica.transform(munged_test_df)

    # GRP
    grp = GaussianRandomProjection(n_components=n_grp_comp, eps=0.1, random_state=420)
    grp_results_train = grp.fit_transform(munged_train_df)
    grp_results_test = grp.transform(munged_test_df)

    # SRP
    srp = SparseRandomProjection(n_components=n_srp_comp, dense_output=True, random_state=420)
    srp_results_train = srp.fit_transform(munged_train_df)
    srp_results_test = srp.transform(munged_test_df)
    
    src_train_df = munged_train_df.copy()
    
    # Append decomposition components to datasets
    for i in range(1, n_pca_comp+1):
        src_train_df['pca_' + str(i)] = pca2_results_train[:,i-1]
        
    for i in range(1, n_ica_comp+1):
        src_train_df['ica_' + str(i)] = ica2_results_train[:,i-1]
        
    for i in range(1, n_grp_comp+1):
        src_train_df['grp_' + str(i)] = grp_results_train[:,i-1]
        
    for i in range(1, n_srp_comp+1):
        src_train_df['srp_' + str(i)] = srp_results_train[:,i-1]
        
    print('pca:%d  ica:%d  grp:%d srp:%d'%(n_pca_comp, n_ica_comp, n_grp_comp, n_srp_comp))
    return train_lightgbm(src_train_df)

In [31]:
def try_comp2(n_pca_comp, n_ica_comp):
    pca = PCA(n_components=n_pca_comp, random_state=42)
    pca2_results_train = pca.fit_transform(munged_train_df)
    pca2_results_test = pca.transform(munged_test_df)

    #ICA
    ica = FastICA(n_components=n_ica_comp, random_state=42)
    ica2_results_train = ica.fit_transform(munged_train_df)
    ica2_results_test = ica.transform(munged_test_df)

    
    src_train_df = munged_train_df.copy()
    
    # Append decomposition components to datasets
    for i in range(1, n_pca_comp+1):
        src_train_df['pca_' + str(i)] = pca2_results_train[:,i-1]
        
    for i in range(1, n_ica_comp+1):
        src_train_df['ica_' + str(i)] = ica2_results_train[:,i-1]
        
        
    print('pca:%d  ica:%d'%(n_pca_comp, n_ica_comp))
    return train_lightgbm(src_train_df)

In [39]:
def train_lightgbm(train_df):
    
    X_train, X_test, y_train, y_test = train_test_split(train_df.values, y_train_df['y'].values, test_size=0.5, random_state=1729)
    X_all_train = train_df.values
    y_all_train = y_train_df['y'].values
    
    estimator = lgb.LGBMRegressor(max_depth=4, num_leaves=9, learning_rate=0.005, n_estimators=700, subsample=0.9, seed=1729)

#     param_grid = {
#         'num_leaves':[9, 10, 11],
#         'learning_rate': [0.005, 0.01],
#         'n_estimators': [500, 700],
#         'subsample':[0.90, 0.95]

#     }

#     gbm = GridSearchCV(estimator, param_grid)

#     gbm.fit(X_train, y_train)
#     y_pred = gbm.predict(X_test)

#     print('Best parameters found by grid search are:', gbm.best_params_)
#     print('Best score:%f'%(r2_score(y_test, y_pred)))
    
#     estimator.set_params(**gbm.best_params_)

    return LOOCV(estimator, X_all_train, y_all_train)
    #return KFoldCV(estimator, 5, X_all_train, y_all_train)

In [25]:
max_mean = 0
min_std = 10

for pca_comp in range(7,12):
    for ica_comp in range(7, 12):
        for grp_comp in range(1, 7):
            for srp_comp in range(1, 7):
                mean, std = try_comp(pca_comp, ica_comp, grp_comp, srp_comp)
                print('==============================================================')
                if max_mean < mean:
                    max_mean = mean
                if min_std > std:
                    min_std = std
                    
print('max_mean=%f min_std=%f'%(max_mean, min_std))
                    

pca:7  ica:7  grp:1 srp:1
mean:0.559444 std:0.078171
pca:7  ica:7  grp:1 srp:2
mean:0.559383 std:0.077940
pca:7  ica:7  grp:1 srp:3
mean:0.559223 std:0.077856
pca:7  ica:7  grp:1 srp:4
mean:0.559739 std:0.077625
pca:7  ica:7  grp:1 srp:5
mean:0.559527 std:0.077878
pca:7  ica:7  grp:1 srp:6
mean:0.559019 std:0.077659
pca:7  ica:7  grp:2 srp:1
mean:0.559407 std:0.078249
pca:7  ica:7  grp:2 srp:2
mean:0.559307 std:0.077859
pca:7  ica:7  grp:2 srp:3
mean:0.559132 std:0.077897
pca:7  ica:7  grp:2 srp:4
mean:0.559609 std:0.077506
pca:7  ica:7  grp:2 srp:5
mean:0.559350 std:0.077816
pca:7  ica:7  grp:2 srp:6
mean:0.558910 std:0.077641
pca:7  ica:7  grp:3 srp:1
mean:0.559603 std:0.078035
pca:7  ica:7  grp:3 srp:2
mean:0.559438 std:0.077852
pca:7  ica:7  grp:3 srp:3
mean:0.559283 std:0.077743
pca:7  ica:7  grp:3 srp:4
mean:0.559614 std:0.077582
pca:7  ica:7  grp:3 srp:5
mean:0.559413 std:0.077726
pca:7  ica:7  grp:3 srp:6
mean:0.559054 std:0.077589
pca:7  ica:7  grp:4 srp:1
mean:0.559280 std:0.

In [40]:
mean, std = try_comp(7, 10, 6, 5)

pca:7  ica:10  grp:6 srp:5
mean:0.000000 std:0.000000


In [96]:
pca:7  ica:10  grp:6 srp:5
mean:0.559100 std:0.071155

4.653474409882869

In [34]:
def comp_result(n_pca_comp, n_ica_comp, n_grp_comp, n_srp_comp):
    pca = PCA(n_components=n_pca_comp, random_state=42)
    pca2_results_train = pca.fit_transform(munged_train_df)
    pca2_results_test = pca.transform(munged_test_df)

    #ICA
    ica = FastICA(n_components=n_ica_comp, random_state=42)
    ica2_results_train = ica.fit_transform(munged_train_df)
    ica2_results_test = ica.transform(munged_test_df)

    # GRP
    grp = GaussianRandomProjection(n_components=n_grp_comp, eps=0.1, random_state=420)
    grp_results_train = grp.fit_transform(munged_train_df)
    grp_results_test = grp.transform(munged_test_df)

    # SRP
    srp = SparseRandomProjection(n_components=n_srp_comp, dense_output=True, random_state=420)
    srp_results_train = srp.fit_transform(munged_train_df)
    srp_results_test = srp.transform(munged_test_df)
    
    src_train_df = munged_train_df.copy()
    src_test_df = munged_test_df.copy()
    # Append decomposition components to datasets
    for i in range(1, n_pca_comp+1):
        src_train_df['pca_' + str(i)] = pca2_results_train[:,i-1]
        src_test_df['pca_' + str(i)] = pca2_results_test[:, i-1]
        
    for i in range(1, n_ica_comp+1):
        src_train_df['ica_' + str(i)] = ica2_results_train[:,i-1]
        src_test_df['ica_' + str(i)] = ica2_results_test[:, i-1]
        
    for i in range(1, n_grp_comp+1):
        src_train_df['grp_' + str(i)] = grp_results_train[:,i-1]
        src_test_df['grp_' + str(i)] = grp_results_test[:, i-1]
        
    for i in range(1, n_srp_comp+1):
        src_train_df['srp_' + str(i)] = srp_results_train[:,i-1]
        src_test_df['srp_' + str(i)] = srp_results_test[:, i-1]
        
    src_train_df.to_csv('../data/offline/train.csv')
    src_test_df.to_csv('../data/offline/test.csv')
    y_train_df.to_csv('../data/offline/y_train.csv')

In [35]:
comp_result(10, 10, 6, 5)