In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew

import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn import metrics
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
import lightgbm as lgb

from sklearn.model_selection import KFold

from sklearn.metrics import r2_score



In [62]:
# Read the raw competition tables; the first CSV column becomes the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/original/train.csv', '../data/original/test.csv')
)

In [63]:
# Target transform: y -> log(y + smooth), then centre on the mean of the logged
# values. `smooth` and `y_mean` stay as globals because the cross-validation
# scorer needs them to map predictions back to the original scale.
smooth = 5
y = np.log(train_df['y'] + smooth)
y_mean = np.mean(y)
y = y - y_mean
# Single-column frame holding the transformed target, indexed like train_df.
y_train_df = pd.DataFrame({'y': y})

In [64]:
# The target now lives in y_train_df; remove it (in place) from the features.
del train_df['y']

In [65]:
def munge(df):
    """Return the numeric feature frame derived from a raw train/test frame.

    The categorical columns (X0-X6, X8) are removed here because they are
    one-hot encoded separately, and a fixed list of columns that the original
    author identified as carrying a single value is removed as well. A new
    feature ``parts`` is then added: the row-wise sum of all remaining columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing every column named in the two drop lists.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) with the remaining columns
        followed by ``parts``.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from ``df``.
    """
    categorical_cols = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']
    # Features that take the same value in every row (per the original note).
    constant_cols = ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289',
                     'X290', 'X293', 'X297', 'X330', 'X347']

    # DataFrame.drop returns a new frame, so the caller's data is untouched.
    # Rebuilding the frame from df.values instead would coerce every column to
    # object dtype (strings and integers are mixed), leaving the numeric
    # features and the sum below as slow Python-object columns.
    all_df = df.drop(categorical_cols + constant_cols, axis=1)

    # Engineered feature: row-wise sum of the remaining columns.
    all_df['parts'] = all_df.sum(axis=1)
    return all_df

In [66]:
# Build the numeric feature frames for both splits with the same routine.
munged_train_df, munged_test_df = munge(train_df), munge(test_df)

In [67]:
from sklearn.preprocessing import StandardScaler

# Standardise the engineered 'parts' feature. The scaler is fitted on the
# training frame only and the same fitted transform is applied to the test frame.
#
# scikit-learn expects 2-D input of shape (n_samples, n_features); handing it a
# 1-D Series is rejected by current releases. Selecting with a list of columns
# keeps the data 2-D, and ravel() flattens the single-column result so it can
# be written back as an ordinary column.
scaler = StandardScaler()
scaler.fit(munged_train_df[['parts']])

scaled = scaler.transform(munged_train_df[['parts']]).ravel()
munged_train_df['parts'] = scaled

scaled = scaler.transform(munged_test_df[['parts']]).ravel()
munged_test_df['parts'] = scaled



In [68]:
# Convert categorical features using one-hot encoding.
def onehot(onehot_df, df, column_name, fill_na):
    """Append the one-hot encoding of ``df[column_name]`` to ``onehot_df``.

    Parameters
    ----------
    onehot_df : pandas.DataFrame
        Frame collecting the dummy columns built so far. It is not modified.
    df : pandas.DataFrame
        Source frame holding the categorical column.
    column_name : str
        Name of the column to encode; also used as the dummy-column prefix.
    fill_na : object or None
        When not None, missing values in the column are replaced by this value
        before encoding (so they get their own dummy column).

    Returns
    -------
    pandas.DataFrame
        A new frame: the columns of ``onehot_df`` followed by one
        ``<column_name>_<value>`` column per distinct value.
    """
    # assign() returns a copy, so the caller's frame is left untouched while
    # the source column is still aligned on onehot_df's index.
    staged = onehot_df.assign(**{column_name: df[column_name]})

    if fill_na is not None:
        # Assign the filled column back explicitly: an in-place fillna on a
        # column selection is a chained assignment and is not guaranteed to
        # reach the parent frame.
        staged[column_name] = staged[column_name].fillna(fill_na)

    dummies = pd.get_dummies(staged[column_name], prefix=column_name)

    # Replace the raw categorical column by its dummy columns.
    return staged.drop([column_name], axis=1).join(dummies)

def munge_onehot(df):
    """One-hot encode the categorical columns X0-X6 and X8 of ``df``.

    Starts from an empty frame sharing ``df``'s index and appends the dummy
    columns of each categorical column in turn (no missing-value filling).
    Returns the resulting frame of dummy columns.
    """
    encoded = pd.DataFrame(index=df.index)
    for name in ("X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"):
        encoded = onehot(encoded, df, name, None)
    return encoded

In [69]:
# Encode the training categoricals and attach the dummy columns by index.
onehot_df = munge_onehot(train_df)
munged_train_df = munged_train_df.join(onehot_df, how='left')

In [70]:
# Encode the test categoricals and attach the dummy columns by index.
onehot_df = munge_onehot(test_df)
munged_test_df = munged_test_df.join(onehot_df, how='left')

In [71]:
# Columns present in the test frame but missing from the training frame.
set(munged_test_df.columns).difference(munged_train_df.columns)

{'X0_ae',
 'X0_ag',
 'X0_an',
 'X0_av',
 'X0_bb',
 'X0_p',
 'X2_ab',
 'X2_ad',
 'X2_aj',
 'X2_ax',
 'X2_u',
 'X2_w',
 'X5_a',
 'X5_b',
 'X5_t',
 'X5_z'}

In [72]:
# Drop the columns that exist in the test frame but not in the training frame.
# The list is computed from the frames instead of being pasted from the output
# of the previous cell, so it cannot go stale when the data changes, and
# re-running the cell is a harmless no-op.
train_columns = set(munged_train_df.columns)
test_only_columns = [col for col in munged_test_df.columns if col not in train_columns]
munged_test_df.drop(test_only_columns, axis=1, inplace=True)

In [73]:
# Columns present in the training frame but missing from the test frame.
set(munged_train_df.columns).difference(munged_test_df.columns)

{'X0_aa',
 'X0_ab',
 'X0_ac',
 'X0_q',
 'X2_aa',
 'X2_ar',
 'X2_c',
 'X2_l',
 'X2_o',
 'X5_u'}

In [74]:
# Drop the columns that exist in the training frame but not in the test frame.
# The list is computed from the frames instead of being pasted from the output
# of the previous cell, so it cannot go stale when the data changes, and
# re-running the cell is a harmless no-op.
test_columns = set(munged_test_df.columns)
train_only_columns = [col for col in munged_train_df.columns if col not in test_columns]
munged_train_df.drop(train_only_columns, axis=1, inplace=True)

In [75]:
# Drop features whose value distribution is highly imbalanced

In [76]:
# Number of training rows in which X10 equals 0.
munged_train_df['X10'].value_counts().loc[0]

4153

In [77]:
# Collect every feature (other than the continuous 'parts') in which the value
# 0 or the value 1 accounts for less than 5% of the training rows.
#
# The shares are computed with vectorised comparisons rather than
# value_counts()[0] / value_counts()[1]: label lookup raises KeyError when a
# column never takes one of the two values, and it is unreliable for
# boolean-typed dummy columns. Here a missing value simply yields a share of
# 0, so such a column is flagged for removal.
s = munged_train_df.shape[0]
binary_features = munged_train_df.drop(['parts'], axis=1)
zero_share = (binary_features == 0).sum() / float(s)
one_share = (binary_features == 1).sum() / float(s)
is_rare = (zero_share < 0.05) | (one_share < 0.05)
drop_names = is_rare.index[is_rare.values].tolist()

In [78]:
# Number of columns flagged for removal.
len(drop_names)

373

In [79]:
# Remove the flagged columns, in place, from the training frame and then the
# test frame so both keep the same feature set.
for frame in (munged_train_df, munged_test_df):
    frame.drop(drop_names, axis=1, inplace=True)

In [80]:
from sklearn.decomposition import PCA, FastICA

In [91]:
def KFoldCV(model, num_fold, X_train, y_train):
    """Cross-validate ``model`` and report R^2 on the original target scale.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted on
        every fold.
    num_fold : int
        Number of folds.
    X_train, y_train : array-like
        Features and transformed target. Both are indexed with integer index
        arrays, so they are expected to be NumPy arrays (the caller passes
        ``.values``).

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-fold R^2 scores; the same two numbers are
        also printed.

    Notes
    -----
    Reads the module-level globals ``y_mean`` and ``smooth`` to map targets
    and predictions back through ``exp(v + y_mean) - smooth`` before scoring.
    """
    # No random_state is passed: the splitter does not shuffle, so a seed has
    # no effect on the folds, and recent scikit-learn releases raise an error
    # when random_state is given without shuffle=True. The folds are therefore
    # the same contiguous slices as before.
    kf = KFold(n_splits=num_fold)

    cv_results = []
    for train_index, test_index in kf.split(X_train):
        model.fit(X_train[train_index], y_train[train_index])
        y_pred = model.predict(X_train[test_index])
        # Score on the back-transformed values rather than on the centred logs.
        score = r2_score(np.exp(y_train[test_index] + y_mean) - smooth,
                         np.exp(y_pred + y_mean) - smooth)
        cv_results.append(score)

    mean_score, std_score = np.mean(cv_results), np.std(cv_results)
    print('mean:%f std:%f'%(mean_score, std_score))
    return mean_score, std_score

In [92]:
def try_comp(n_pca_comp, n_ica_comp, n_grp_comp, n_srp_comp):
    """Score the training features augmented with four sets of projections.

    PCA, FastICA, Gaussian random projection and sparse random projection are
    each fitted on ``munged_train_df`` (module-level global). Their components
    are appended to a copy of that frame as ``pca_i``, ``ica_i``, ``grp_i``
    and ``srp_i`` columns (i starting at 1), and the result is passed to
    ``train_lightgbm``.

    Parameters
    ----------
    n_pca_comp, n_ica_comp, n_grp_comp, n_srp_comp : int
        Number of components to keep for each method.

    Returns
    -------
    Whatever ``train_lightgbm`` returns for the augmented frame.
    """
    # Only the training projections are needed for scoring, so the test frame
    # is not transformed here (those results were never used).
    reducers = [
        ('pca_', n_pca_comp, PCA(n_components=n_pca_comp, random_state=42)),
        ('ica_', n_ica_comp, FastICA(n_components=n_ica_comp, random_state=42)),
        ('grp_', n_grp_comp, GaussianRandomProjection(n_components=n_grp_comp, eps=0.1, random_state=420)),
        ('srp_', n_srp_comp, SparseRandomProjection(n_components=n_srp_comp, dense_output=True, random_state=420)),
    ]

    src_train_df = munged_train_df.copy()

    # Every reducer is fitted on the un-augmented frame; its components are
    # appended afterwards in pca, ica, grp, srp order.
    for prefix, n_comp, reducer in reducers:
        components = reducer.fit_transform(munged_train_df)
        for i in range(1, n_comp + 1):
            src_train_df[prefix + str(i)] = components[:, i - 1]

    print('pca:%d  ica:%d  grp:%d srp:%d'%(n_pca_comp, n_ica_comp, n_grp_comp, n_srp_comp))
    return train_lightgbm(src_train_df)

In [93]:
def try_comp2(n_pca_comp, n_ica_comp):
    """Score the training features augmented with PCA and ICA components only.

    PCA and FastICA are each fitted on ``munged_train_df`` (module-level
    global). Their components are appended to a copy of that frame as
    ``pca_i`` and ``ica_i`` columns (i starting at 1), and the result is
    passed to ``train_lightgbm``.

    Parameters
    ----------
    n_pca_comp, n_ica_comp : int
        Number of components to keep for each method.

    Returns
    -------
    Whatever ``train_lightgbm`` returns for the augmented frame.
    """
    # Only the training projections are needed for scoring, so the test frame
    # is not transformed here (those results were never used).
    reducers = [
        ('pca_', n_pca_comp, PCA(n_components=n_pca_comp, random_state=42)),
        ('ica_', n_ica_comp, FastICA(n_components=n_ica_comp, random_state=42)),
    ]

    src_train_df = munged_train_df.copy()

    # Both reducers are fitted on the un-augmented frame; components are
    # appended afterwards, PCA first.
    for prefix, n_comp, reducer in reducers:
        components = reducer.fit_transform(munged_train_df)
        for i in range(1, n_comp + 1):
            src_train_df[prefix + str(i)] = components[:, i - 1]

    print('pca:%d  ica:%d'%(n_pca_comp, n_ica_comp))
    return train_lightgbm(src_train_df)

In [94]:
def train_lightgbm(train_df):
    """Cross-validate a fixed LightGBM regressor on the given feature frame.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Feature frame; its ``.values`` are used as the design matrix. The
        target is read from the module-level ``y_train_df['y']``.

    Returns
    -------
    Whatever ``KFoldCV`` returns for a 5-fold run (mean and std of the scores).
    """
    X_all_train = train_df.values
    y_all_train = y_train_df['y'].values

    # An earlier iteration grid-searched num_leaves (9-11), learning_rate
    # (0.005, 0.01), n_estimators (500, 700) and subsample (0.90, 0.95) on a
    # 50% hold-out split; that code was disabled and the split was unused, so
    # both were removed. The values below are fixed.
    #
    # NOTE(review): subsample_freq=1 is spelled out because LightGBM applies
    # `subsample` only when subsample_freq > 0, and the scikit-learn wrapper's
    # default for it appears to have changed from 1 to 0 between releases.
    # Confirm against the installed version.
    estimator = lgb.LGBMRegressor(max_depth=4, num_leaves=9, learning_rate=0.005,
                                  n_estimators=700, subsample=0.9, subsample_freq=1,
                                  seed=1729)

    return KFoldCV(estimator, 5, X_all_train, y_all_train)

In [38]:
# Exhaustive search over the number of PCA / ICA / GRP / SRP components.
# Besides the best mean score and the smallest std, the configuration that
# produced the best mean is recorded, so the winner can actually be reused.
# The trackers start at -inf / +inf so the first result always replaces them
# (R^2 can be negative, so 0 is not a safe lower bound).
max_mean = float('-inf')
min_std = float('inf')
best_comp = None

for pca_comp in range(10,12):
    for ica_comp in range(7, 11):
        for grp_comp in range(1, 5):
            for srp_comp in range(1, 5):
                mean, std = try_comp(pca_comp, ica_comp, grp_comp, srp_comp)
                print('==============================================================')
                if max_mean < mean:
                    max_mean = mean
                    best_comp = (pca_comp, ica_comp, grp_comp, srp_comp)
                if min_std > std:
                    min_std = std

print('max_mean=%f min_std=%f'%(max_mean, min_std))
if best_comp is not None:
    print('best pca:%d  ica:%d  grp:%d srp:%d' % best_comp)
                    

pca:10  ica:7  grp:1 srp:1
mean:0.556222 std:0.074725
pca:10  ica:7  grp:1 srp:2
mean:0.556400 std:0.074302
pca:10  ica:7  grp:1 srp:3
mean:0.556066 std:0.074604
pca:10  ica:7  grp:1 srp:4
mean:0.556390 std:0.073721
pca:10  ica:7  grp:2 srp:1
mean:0.556158 std:0.074664
pca:10  ica:7  grp:2 srp:2
mean:0.556339 std:0.074315
pca:10  ica:7  grp:2 srp:3
mean:0.555927 std:0.074541
pca:10  ica:7  grp:2 srp:4
mean:0.556447 std:0.073783
pca:10  ica:7  grp:3 srp:1
mean:0.555796 std:0.074722
pca:10  ica:7  grp:3 srp:2
mean:0.555985 std:0.074553
pca:10  ica:7  grp:3 srp:3
mean:0.555523 std:0.074503
pca:10  ica:7  grp:3 srp:4
mean:0.555985 std:0.074079
pca:10  ica:7  grp:4 srp:1
mean:0.555669 std:0.074959
pca:10  ica:7  grp:4 srp:2
mean:0.555768 std:0.074610
pca:10  ica:7  grp:4 srp:3
mean:0.555192 std:0.074762
pca:10  ica:7  grp:4 srp:4
mean:0.555784 std:0.074447
pca:10  ica:8  grp:1 srp:1
mean:0.555713 std:0.073159
pca:10  ica:8  grp:1 srp:2
mean:0.555416 std:0.072596
pca:10  ica:8  grp:1 srp:3
m

In [95]:
# Score one specific component configuration.
mean, std = try_comp(n_pca_comp=10, n_ica_comp=10, n_grp_comp=6, n_srp_comp=5)

pca:10  ica:10  grp:6 srp:5
mean:0.562695 std:0.075168


In [96]:
# NOTE(review): hard-coded re-assignment of the `y_mean` global that KFoldCV
# reads when mapping predictions back to the original scale. Presumably this is
# the value computed from the training target in the transform cell; confirm,
# and prefer relying on that computed value so the two cannot drift apart.
y_mean=4.653474409882869

4.653474409882869

In [89]:
def comp_result(n_pca_comp, n_ica_comp, n_grp_comp, n_srp_comp):
    """Build the final augmented train/test frames and write them to disk.

    PCA, FastICA, Gaussian random projection and sparse random projection are
    each fitted on ``munged_train_df`` and applied to ``munged_test_df`` (both
    module-level globals). Their components are appended to copies of the two
    frames as ``pca_i``, ``ica_i``, ``grp_i`` and ``srp_i`` columns (i starting
    at 1). The augmented frames and ``y_train_df`` are then saved as CSV files
    under ``../data/offline/``. Nothing is returned.

    Parameters
    ----------
    n_pca_comp, n_ica_comp, n_grp_comp, n_srp_comp : int
        Number of components to keep for each method.
    """
    stages = (
        ('pca_', n_pca_comp, PCA(n_components=n_pca_comp, random_state=42)),
        ('ica_', n_ica_comp, FastICA(n_components=n_ica_comp, random_state=42)),
        ('grp_', n_grp_comp, GaussianRandomProjection(n_components=n_grp_comp, eps=0.1, random_state=420)),
        ('srp_', n_srp_comp, SparseRandomProjection(n_components=n_srp_comp, dense_output=True, random_state=420)),
    )

    src_train_df = munged_train_df.copy()
    src_test_df = munged_test_df.copy()

    # Each reducer sees only the un-augmented frames; its outputs are appended
    # to the copies in pca, ica, grp, srp order.
    for prefix, n_comp, reducer in stages:
        train_part = reducer.fit_transform(munged_train_df)
        test_part = reducer.transform(munged_test_df)
        for k in range(n_comp):
            label = prefix + str(k + 1)
            src_train_df[label] = train_part[:, k]
            src_test_df[label] = test_part[:, k]

    src_train_df.to_csv('../data/offline/train.csv')
    src_test_df.to_csv('../data/offline/test.csv')
    y_train_df.to_csv('../data/offline/y_train.csv')

In [90]:
# Export the augmented feature sets for the chosen component counts.
comp_result(n_pca_comp=10, n_ica_comp=10, n_grp_comp=6, n_srp_comp=5)