In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew

import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn import metrics
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection



In [2]:
train_df = pd.read_csv('../data/original/train.csv', index_col=0)
test_df = pd.read_csv('../data/original/test.csv', index_col=0)

In [8]:
test_df.shape

(4209, 376)

In [9]:
train_df.shape

(4200, 376)

In [5]:
label_df = pd.DataFrame(train_df['y'])

In [6]:
#训练集中剔除异常值
train_df.drop([681, 2396, 2903, 6273,# y>160
              2581, 2584, 2585, 2586,#X4_Trans<2.5
              1770], inplace=True)#y>250


In [40]:
label_df.drop([681, 2396, 2903, 6273,# y>160
              2581, 2584, 2585, 2586,#X4_Trans<2.5
              1770], inplace=True)#y>250

In [7]:
train_df.drop(['y'], axis=1, inplace=True)

In [10]:
def munge(df):
    all_df = pd.DataFrame(df.values, index=df.index, columns=df.columns, copy=True)
    all_df.drop(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], axis=1, inplace=True)
    
    
    #删除取值相同的特征
    all_df.drop(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290' ,'X293' ,'X297', 'X330' ,'X347'], axis=1, inplace=True)
    
    #构造新特征
    all_df['parts'] = all_df.sum(axis=1)
    return all_df

In [11]:
munged_train_df = munge(train_df)
munged_test_df = munge(test_df)

In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit(munged_train_df['parts'])

scaled = scaler.transform(munged_train_df['parts'])
munged_train_df['parts'] = scaled

scaled = scaler.transform(munged_test_df['parts'])
munged_test_df['parts'] = scaled



In [15]:
# Convert categorical features using one-hot encoding.
def onehot(onehot_df, df, column_name, fill_na):
    onehot_df[column_name] = df[column_name]
    if fill_na is not None:
        onehot_df[column_name].fillna(fill_na, inplace=True)

    dummies = pd.get_dummies(onehot_df[column_name], prefix = column_name)
    
    onehot_df = onehot_df.join(dummies)
    onehot_df = onehot_df.drop([column_name], axis=1)
    return onehot_df

def munge_onehot(df):
    onehot_df = pd.DataFrame(index = df.index)

    onehot_df = onehot(onehot_df, df, "X0", None)
    onehot_df = onehot(onehot_df, df, "X1", None)
    onehot_df = onehot(onehot_df, df, "X2", None)
    onehot_df = onehot(onehot_df, df, "X3", None)
    onehot_df = onehot(onehot_df, df, "X4", None)
    onehot_df = onehot(onehot_df, df, "X5", None)
    onehot_df = onehot(onehot_df, df, "X6", None)
    onehot_df = onehot(onehot_df, df, "X8", None)
    
    return onehot_df

In [16]:
onehot_df = munge_onehot(train_df)
munged_train_df = munged_train_df.join(onehot_df)

In [17]:
onehot_df = munge_onehot(test_df)
munged_test_df = munged_test_df.join(onehot_df)

In [18]:
munged_test_df.shape

(4209, 558)

In [19]:
munged_train_df.shape

(4200, 549)

In [21]:
only_test_cols = set(munged_test_df) - set(munged_train_df)

['X0_av',
 'X2_u',
 'X4_a',
 'X0_an',
 'X2_ax',
 'X2_ab',
 'X5_a',
 'X0_ae',
 'X0_p',
 'X0_bb',
 'X5_b',
 'X0_ag',
 'X2_aj',
 'X2_ad',
 'X5_z',
 'X5_t',
 'X2_w',
 'X4_c',
 'X4_b']

In [24]:
#删除test中有的  而train中没有的
munged_test_df.drop(list(only_test_cols), axis=1, inplace=True)

In [25]:
only_train_cols = set(munged_train_df) - set(munged_test_df)

In [26]:
#删除train中有的  而test中没有的
munged_train_df.drop(list(only_train_cols), axis=1, inplace=True)

In [17]:
#删除一些占比非常不平衡的特征

In [27]:
munged_train_df['X10'].value_counts()[0]

4144

In [28]:
s = munged_train_df.shape[0]
drop_names = []
for c in munged_train_df.drop(['parts'], axis=1).columns:
    a = munged_train_df[c].value_counts()[0] / s
    b = munged_train_df[c].value_counts()[1] / s
    if (a < 0.011 or b < 0.011):
        print('%s p1 = %f p2 = %f'%(c, a, b))
        drop_names.append(c)

X15 p1 = 0.999524 p2 = 0.000476
X16 p1 = 0.997619 p2 = 0.002381
X17 p1 = 0.992381 p2 = 0.007619
X18 p1 = 0.992143 p2 = 0.007857
X21 p1 = 0.997381 p2 = 0.002619
X24 p1 = 0.998095 p2 = 0.001905
X26 p1 = 0.995000 p2 = 0.005000
X30 p1 = 0.995476 p2 = 0.004524
X33 p1 = 0.999762 p2 = 0.000238
X34 p1 = 0.994524 p2 = 0.005476
X36 p1 = 0.995476 p2 = 0.004524
X39 p1 = 0.999762 p2 = 0.000238
X40 p1 = 0.999286 p2 = 0.000714
X42 p1 = 0.999762 p2 = 0.000238
X53 p1 = 0.993095 p2 = 0.006905
X55 p1 = 0.994762 p2 = 0.005238
X59 p1 = 0.999286 p2 = 0.000714
X60 p1 = 0.998571 p2 = 0.001429
X62 p1 = 0.994048 p2 = 0.005952
X65 p1 = 0.997857 p2 = 0.002143
X67 p1 = 0.998095 p2 = 0.001905
X74 p1 = 0.000714 p2 = 0.999286
X78 p1 = 0.994286 p2 = 0.005714
X83 p1 = 0.998810 p2 = 0.001190
X86 p1 = 0.998571 p2 = 0.001429
X87 p1 = 0.999048 p2 = 0.000952
X88 p1 = 0.992857 p2 = 0.007143
X89 p1 = 0.999286 p2 = 0.000714
X90 p1 = 0.992619 p2 = 0.007381
X91 p1 = 0.998333 p2 = 0.001667
X92 p1 = 0.999048 p2 = 0.000952
X94 p1 =

KeyError: 1

In [29]:
len(drop_names)

120

In [30]:
munged_train_df.drop(drop_names, axis=1, inplace=True)
munged_test_df.drop(drop_names, axis=1, inplace=True)

In [31]:
munged_train_df.shape


(4200, 419)

In [32]:
munged_test_df.shape

(4209, 419)

In [33]:
from sklearn.decomposition import PCA, FastICA

In [34]:
#PCA

pca_n_comp = 7
pca = PCA(n_components=pca_n_comp, random_state=420)
pca2_results_train = pca.fit_transform(munged_train_df)
pca2_results_test = pca.transform(munged_test_df)

In [35]:
#ICA

ica_n_comp = 10
ica = FastICA(n_components=ica_n_comp, random_state=420)
ica2_results_train = ica.fit_transform(munged_train_df)
ica2_results_test = ica.transform(munged_test_df)

In [36]:
# GRP

grp_n_comp = 6
grp = GaussianRandomProjection(n_components=grp_n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(munged_train_df)
grp_results_test = grp.transform(munged_test_df)

# SRP

srp_n_comp = 5
srp = SparseRandomProjection(n_components=srp_n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(munged_train_df)
srp_results_test = srp.transform(munged_test_df)

In [37]:
# tSVD

tsvd_n_comp = 11
tsvd = TruncatedSVD(n_components=tsvd_n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(munged_train_df)
tsvd_results_test = tsvd.transform(munged_test_df)

In [38]:
# Append decomposition components to datasets
for i in range(1, pca_n_comp+1):
    munged_train_df['pca_' + str(i)] = pca2_results_train[:,i-1]
    munged_test_df['pca_' + str(i)] = pca2_results_test[:, i-1]
for i in range(1, ica_n_comp+1):
    munged_train_df['ica_' + str(i)] = ica2_results_train[:,i-1]
    munged_test_df['ica_' + str(i)] = ica2_results_test[:, i-1]
for i in range(1, grp_n_comp+1):
    munged_train_df['grp_' + str(i)] = grp_results_train[:,i-1]
    munged_test_df['grp_' + str(i)] = grp_results_test[:, i-1]
for i in range(1, srp_n_comp+1):
    munged_train_df['srp_' + str(i)] = srp_results_train[:,i-1]
    munged_test_df['srp_' + str(i)] = srp_results_test[:, i-1]
for i in range(1, tsvd_n_comp+1):
    munged_train_df['tsvd_' + str(i)] = tsvd_results_train[:,i-1]
    munged_test_df['tsvd_' + str(i)] = tsvd_results_test[:, i-1]


In [41]:
munged_train_df.to_csv('../data/offline/train.csv')
munged_test_df.to_csv('../data/offline/test.csv')
label_df.to_csv('../data/offline/y_train.csv')