In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew

import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn import metrics
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection



In [2]:
# Read the raw train/test tables; the first CSV column becomes the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/original/train.csv', '../data/original/test.csv')
)

In [3]:
# Dimensions (rows, columns) of the raw test table.
test_df.shape

(4209, 376)

In [4]:
# Dimensions of the raw train table. It has one more column than test,
# presumably the target 'y' that is split off below (verify against the data).
train_df.shape

(4209, 377)

In [5]:
label_df = pd.DataFrame(train_df['y'])

In [6]:
#训练集中剔除异常值
train_df.drop([681, 2396, 2903, 6273,# y>160
              2581, 2584, 2585, 2586,#X4_Trans<2.5
              1770], inplace=True)#y>250


In [7]:
label_df.drop([681, 2396, 2903, 6273,# y>160
              2581, 2584, 2585, 2586,#X4_Trans<2.5
              1770], inplace=True)#y>250

In [8]:
train_df.drop(['y'], axis=1, inplace=True)

In [9]:
def munge(df):
    """Build the numeric feature table from a raw frame.

    The categorical columns (X0-X6 and X8, which are one-hot encoded
    separately in this notebook) and a fixed list of columns that the
    original author flagged as taking a single value are removed. A new
    feature ``parts`` is then added: the row-wise sum of every remaining
    column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; must contain every column named in the two drop lists.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) with the remaining
        columns plus ``parts``.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from ``df``.
    """
    categorical_cols = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

    # Features with identical values in every row (per the original author's note).
    constant_cols = ['X11', 'X93', 'X107', 'X233', 'X235', 'X268',
                     'X289', 'X290', 'X293', 'X297', 'X330', 'X347']

    # drop() returns a new frame and keeps each column's dtype. Rebuilding the
    # frame from df.values would upcast a mixed string/int table to object.
    all_df = df.drop(categorical_cols + constant_cols, axis=1)

    # Engineered feature: row-wise sum of the remaining columns.
    all_df['parts'] = all_df.sum(axis=1)
    return all_df

In [10]:
# Build the numeric feature tables for both splits with the same recipe.
munged_train_df, munged_test_df = munge(train_df), munge(test_df)

In [11]:
from sklearn.preprocessing import StandardScaler

# Standardize 'parts' with statistics learned from the training rows only,
# then apply that same fitted transform to the test rows.
scaler = StandardScaler()

# scikit-learn estimators require 2-D input (n_samples, n_features), so the
# column is selected as a one-column frame rather than a 1-D Series.
scaler.fit(munged_train_df[['parts']])

# transform() returns an (n_samples, 1) array; flatten it before writing it
# back into the single column.
scaled = scaler.transform(munged_train_df[['parts']]).ravel()
munged_train_df['parts'] = scaled

scaled = scaler.transform(munged_test_df[['parts']]).ravel()
munged_test_df['parts'] = scaled



In [12]:
# Convert categorical features using one-hot encoding.
def onehot(onehot_df, df, column_name, fill_na):
    """Append one-hot indicator columns for ``df[column_name]`` to ``onehot_df``.

    Parameters
    ----------
    onehot_df : pandas.DataFrame
        Frame collecting the indicator columns built so far. The raw column
        is written into this frame as an intermediate step, so the object
        passed in is modified; use the returned frame.
    df : pandas.DataFrame
        Source frame holding the column to encode.
    column_name : str
        Column to encode; it is also used as the prefix of the new columns.
    fill_na : scalar or None
        Replacement for missing values before encoding. ``None`` leaves
        missing values as they are.

    Returns
    -------
    pandas.DataFrame
        ``onehot_df`` joined with the ``<column_name>_<value>`` indicator
        columns, without the raw column itself.
    """
    # Column assignment aligns the source values to onehot_df's index.
    onehot_df[column_name] = df[column_name]
    if fill_na is not None:
        # Assign the filled column back. An in-place fillna on the selected
        # column is chained assignment and does not reach the frame when
        # pandas Copy-on-Write is active.
        onehot_df[column_name] = onehot_df[column_name].fillna(fill_na)

    dummies = pd.get_dummies(onehot_df[column_name], prefix=column_name)

    # Attach the indicators, then discard the raw column.
    return onehot_df.join(dummies).drop([column_name], axis=1)

def munge_onehot(df):
    """Return one-hot indicator columns for the categorical features of ``df``.

    Columns X0-X6 and X8 are encoded one after another with ``onehot``
    (no missing-value replacement). The result shares ``df``'s index.
    """
    categorical_columns = ("X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8")

    # Start from an empty frame that only carries the row index.
    encoded = pd.DataFrame(index=df.index)
    for name in categorical_columns:
        encoded = onehot(encoded, df, name, None)

    return encoded

In [13]:
# One-hot encode the training categoricals and append them to the numeric
# training features (DataFrame.join aligns on the row index).
onehot_df = munge_onehot(train_df)
munged_train_df = munged_train_df.join(onehot_df)

In [14]:
# Same for the test split. Note that `onehot_df` is rebound here, so after
# this cell it holds the test indicators, not the training ones.
onehot_df = munge_onehot(test_df)
munged_test_df = munged_test_df.join(onehot_df)

In [15]:
# Dimensions of the test features after one-hot encoding.
munged_test_df.shape

(4209, 558)

In [16]:
# Dimensions of the train features after one-hot encoding. The recorded
# outputs show different column counts (549 here vs 558 for test), so the
# two splits do not yet share the same columns.
munged_train_df.shape

(4200, 549)

In [17]:
only_test_cols = set(munged_test_df) - set(munged_train_df)

In [18]:
#删除test中有的  而train中没有的
munged_test_df.drop(list(only_test_cols), axis=1, inplace=True)

In [19]:
only_train_cols = set(munged_train_df) - set(munged_test_df)

In [20]:
#删除train中有的  而test中没有的
munged_train_df.drop(list(only_train_cols), axis=1, inplace=True)

In [21]:
# Persist the aligned feature tables and the training labels as CSV files.
for frame, csv_path in (
    (munged_train_df, '../data/offline/train.csv'),
    (munged_test_df, '../data/offline/test.csv'),
    (label_df, '../data/offline/y_train.csv'),
):
    frame.to_csv(csv_path)