In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew

import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn import metrics
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection



In [2]:
# Read the original train and test CSVs; the first CSV column becomes the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/original/train.csv', '../data/original/test.csv')
)

In [4]:
# Keep the target column 'y' as its own one-column frame, then remove it
# from the training features in place.
label_df = train_df['y'].to_frame()
train_df.drop(columns=['y'], inplace=True)

In [5]:
def munge(df):
    """Build the numeric feature table from a raw frame.

    Removes the categorical columns X0-X6 and X8 and the columns listed as
    constant-valued, then adds a ``parts`` column holding the row-wise sum
    of the columns that remain.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; it must contain every column named in the two drop lists
        (``drop`` raises ``KeyError`` otherwise).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``. ``df`` itself is not
        modified.
    """
    categorical_columns = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

    # Features that take the same value in every row.
    constant_columns = ['X11', 'X93', 'X107', 'X233', 'X235', 'X268',
                        'X289', 'X290', 'X293', 'X297', 'X330', 'X347']

    # drop() returns a new frame and keeps each remaining column's dtype.
    # Rebuilding the frame from df.values would upcast every column to
    # object, because the raw frame mixes string and numeric columns.
    all_df = df.drop(columns=categorical_columns + constant_columns)

    # Construct the new feature: row-wise total of the remaining columns.
    return all_df.assign(parts=all_df.sum(axis=1))

In [6]:
# Run the same numeric feature preparation on the train and test frames.
munged_train_df, munged_test_df = munge(train_df), munge(test_df)

In [7]:
from sklearn.preprocessing import StandardScaler

# Standardise 'parts' using statistics fitted on the training frame only,
# then apply that same fitted transform to the test frame.
scaler = StandardScaler()

# StandardScaler works on 2-D input of shape (n_samples, n_features); a 1-D
# Series is rejected with a ValueError. Selecting with a list ([['parts']])
# yields a one-column frame, and ravel() flattens the (n, 1) result back
# into a column.
scaler.fit(munged_train_df[['parts']])

scaled = scaler.transform(munged_train_df[['parts']]).ravel()
munged_train_df['parts'] = scaled

scaled = scaler.transform(munged_test_df[['parts']]).ravel()
munged_test_df['parts'] = scaled



In [8]:
# Convert categorical features using one-hot encoding.
def onehot(onehot_df, df, column_name, fill_na):
    """Append the one-hot encoding of ``df[column_name]`` to ``onehot_df``.

    Parameters
    ----------
    onehot_df : pandas.DataFrame
        Frame collecting the dummy columns produced so far. It is not
        modified; a new frame is returned.
    df : pandas.DataFrame
        Source frame that holds ``column_name``.
    column_name : str
        Column to encode; also used as the prefix of the dummy columns
        (``<column_name>_<value>``).
    fill_na : object or None
        Replacement for missing values before encoding. ``None`` leaves
        missing values as they are, so they get no dummy column.

    Returns
    -------
    pandas.DataFrame
        ``onehot_df`` joined (on the index) with the new dummy columns.
    """
    values = df[column_name]
    if fill_na is not None:
        # Reassign rather than calling fillna(..., inplace=True) on a column
        # selected from the frame: that chained in-place update does not
        # write back to the frame under pandas Copy-on-Write.
        values = values.fillna(fill_na)

    dummies = pd.get_dummies(values, prefix=column_name)

    # Joining the dummies directly avoids the temporary raw column that
    # would otherwise have to be added to the frame and dropped again.
    return onehot_df.join(dummies)

def munge_onehot(df):
    """Return a frame holding only the one-hot encoding of the categorical columns.

    Starts from an empty frame that shares ``df``'s index and appends the
    dummy columns for X0-X6 and X8, in that order, through ``onehot`` with
    no missing-value replacement.
    """
    encoded = pd.DataFrame(index=df.index)
    for categorical in ("X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"):
        encoded = onehot(encoded, df, categorical, None)
    return encoded

In [9]:
# One-hot encode the training categoricals and append them column-wise
# (index-aligned left join, which is DataFrame.join's default).
onehot_df = munge_onehot(train_df)
munged_train_df = munged_train_df.join(onehot_df, how='left')

In [10]:
# One-hot encode the test categoricals and append them column-wise
# (index-aligned left join, which is DataFrame.join's default).
onehot_df = munge_onehot(test_df)
munged_test_df = munged_test_df.join(onehot_df, how='left')

In [11]:
# Column names that appear in the test frame but not in the training frame.
set(munged_test_df).difference(munged_train_df)

{'X0_ae',
 'X0_ag',
 'X0_an',
 'X0_av',
 'X0_bb',
 'X0_p',
 'X2_ab',
 'X2_ad',
 'X2_aj',
 'X2_ax',
 'X2_u',
 'X2_w',
 'X5_a',
 'X5_b',
 'X5_t',
 'X5_z'}

In [12]:
# Drop the columns that exist in test but not in train.
# The list is computed from the two frames instead of being pasted from a
# cell output, so it stays correct if the data changes, and re-running the
# cell is a no-op rather than a KeyError.
test_only_columns = sorted(set(munged_test_df) - set(munged_train_df))
munged_test_df.drop(test_only_columns, axis=1, inplace=True)

In [13]:
# Column names that appear in the training frame but not in the test frame.
set(munged_train_df).difference(munged_test_df)

{'X0_aa',
 'X0_ab',
 'X0_ac',
 'X0_q',
 'X2_aa',
 'X2_ar',
 'X2_c',
 'X2_l',
 'X2_o',
 'X5_u'}

In [14]:
# Drop the columns that exist in train but not in test.
# The list is computed from the two frames instead of being pasted from a
# cell output, so it stays correct if the data changes, and re-running the
# cell is a no-op rather than a KeyError.
train_only_columns = sorted(set(munged_train_df) - set(munged_test_df))
munged_train_df.drop(train_only_columns, axis=1, inplace=True)

In [15]:
# Drop features whose value distribution is extremely imbalanced.

In [16]:
# Spot check on one column: the count stored under label 0 for X10.
x10_counts = munged_train_df['X10'].value_counts()
x10_counts[0]

4153

In [17]:
# Collect every feature (all columns except 'parts') in which the value 0 or
# the value 1 covers less than MIN_VALUE_SHARE of the training rows.
MIN_VALUE_SHARE = 0.05

s = munged_train_df.shape[0]
drop_names = []
for c in munged_train_df.drop(['parts'], axis=1).columns:
    column = munged_train_df[c]
    # Count matches directly instead of indexing value_counts() with [0]/[1]:
    # that lookup raises KeyError when one of the two values never occurs,
    # and it recomputed the counts twice per column. A value that never
    # occurs now yields a share of 0.0, so the column is flagged.
    p_zero = (column == 0).sum() / s
    p_one = (column == 1).sum() / s
    if p_zero < MIN_VALUE_SHARE or p_one < MIN_VALUE_SHARE:
        print('%s p1 = %f p2 = %f'%(c, p_zero, p_one))
        drop_names.append(c)

X10 p1 = 0.986695 p2 = 0.013305
X15 p1 = 0.999525 p2 = 0.000475
X16 p1 = 0.997387 p2 = 0.002613
X17 p1 = 0.992397 p2 = 0.007603
X18 p1 = 0.992160 p2 = 0.007840
X21 p1 = 0.997387 p2 = 0.002613
X23 p1 = 0.979330 p2 = 0.020670
X24 p1 = 0.998099 p2 = 0.001901
X26 p1 = 0.995011 p2 = 0.004989
X28 p1 = 0.967451 p2 = 0.032549
X29 p1 = 0.956997 p2 = 0.043003
X30 p1 = 0.995486 p2 = 0.004514
X32 p1 = 0.988833 p2 = 0.011167
X33 p1 = 0.999762 p2 = 0.000238
X34 p1 = 0.994536 p2 = 0.005464
X36 p1 = 0.995486 p2 = 0.004514
X38 p1 = 0.966738 p2 = 0.033262
X39 p1 = 0.999762 p2 = 0.000238
X40 p1 = 0.999287 p2 = 0.000713
X41 p1 = 0.988596 p2 = 0.011404
X42 p1 = 0.999762 p2 = 0.000238
X44 p1 = 0.988596 p2 = 0.011404
X47 p1 = 0.987170 p2 = 0.012830
X48 p1 = 0.977667 p2 = 0.022333
X52 p1 = 0.957710 p2 = 0.042290
X53 p1 = 0.993110 p2 = 0.006890
X54 p1 = 0.956522 p2 = 0.043478
X55 p1 = 0.994773 p2 = 0.005227
X56 p1 = 0.978855 p2 = 0.021145
X57 p1 = 0.986695 p2 = 0.013305
X59 p1 = 0.999287 p2 = 0.000713
X60 p1 =

In [18]:
# Number of column names collected in drop_names.
len(drop_names)

373

In [19]:
# Remove every column named in drop_names from both frames, in place.
for frame in (munged_train_df, munged_test_df):
    frame.drop(columns=drop_names, inplace=True)

In [20]:
from sklearn.decomposition import PCA, FastICA

In [21]:
# Write the processed train/test features and the training labels to ../data/offline/.
for frame, csv_path in (
    (munged_train_df, '../data/offline/train.csv'),
    (munged_test_df, '../data/offline/test.csv'),
    (label_df, '../data/offline/y_train.csv'),
):
    frame.to_csv(csv_path)