# In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
# Load the training set and the test set (in that order) from their CSV files.
train, test = map(
    pd.read_csv,
    (
        '/Users/Administrator/Desktop/train.csv',
        '/Users/Administrator/Desktop/test.csv',
    ),
)


def set_missing_ages(df):
    """Fill the missing ``Age`` values of *df* with random-forest predictions.

    A ``RandomForestRegressor`` is fitted on the rows whose ``Age`` is known,
    using ``Fare``, ``Parch``, ``SibSp`` and ``Pclass`` as features. Its
    predictions then overwrite the missing ``Age`` entries. *df* is modified
    in place.

    Args:
        df: DataFrame containing the columns ``Age``, ``Fare``, ``Parch``,
            ``SibSp`` and ``Pclass``.

    Returns:
        A ``(df, rfr)`` tuple: the same DataFrame with ``Age`` filled in, and
        the fitted regressor so it can be reused on other data.
    """
    # Keep only the numeric columns; Age comes first so it can be split off
    # as the regression target below.
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    missing_age = age_df['Age'].isnull()

    # `.values` is used instead of `DataFrame.as_matrix()`, which was removed
    # in pandas 1.0.
    known_age = age_df[~missing_age].values
    unknown_age = age_df[missing_age].values

    # y is the target (Age); X holds the remaining feature columns.
    y = known_age[:, 0]
    X = known_age[:, 1:]

    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # Only predict when something is actually missing: predict() rejects an
    # array with zero rows.
    if len(unknown_age):
        df.loc[missing_age, 'Age'] = rfr.predict(unknown_age[:, 1:])
    return df, rfr
# Impute the training-set ages and keep the fitted regressor for the test set.
train, rfr = set_missing_ages(train)

# Fill the missing test-set ages with the regressor fitted on the training data.
tmp_df = test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
# `.values` is used instead of `DataFrame.as_matrix()`, which was removed in
# pandas 1.0.
null_age = tmp_df[test.Age.isnull()].values
# Column 0 is Age (the value to predict); the remaining columns are features.
X = null_age[:, 1:]
if len(X):
    predictedAges = rfr.predict(X)
    test.loc[test.Age.isnull(), 'Age'] = predictedAges
else:
    # Nothing to fill; predict() would reject an array with zero rows.
    predictedAges = np.empty(0)
# Select the model features. The slices are copied so the fills below modify
# independent frames instead of (possibly) views of `train` / `test`.
feature_columns = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare']
X_train = train[feature_columns].copy()
X_test = test[feature_columns].copy()

y_train = train['Survived']

# Fill missing Embarked values in the training set with 'S'.
# fillna() returns a new Series, so the result must be assigned back;
# calling it without assignment leaves the column unchanged.
X_train['Embarked'] = X_train['Embarked'].fillna('S')

# Fill missing values in the test set: Embarked with 'S', Fare with the
# test-set mean fare.
X_test['Embarked'] = X_test['Embarked'].fillna('S')
X_test['Fare'] = X_test['Fare'].fillna(X_test['Fare'].mean())

# Vectorize the features with DictVectorizer (fitted on the training set only,
# then applied to the test set so both share the same columns).
dict_vec = DictVectorizer(sparse=False)
# 'records' is the full orient name; the abbreviation 'record' is rejected by
# pandas 2.x.
X_train = dict_vec.fit_transform(X_train.to_dict(orient='records'))
X_test = dict_vec.transform(X_test.to_dict(orient='records'))
 
 
 
# Model choice: XGBoost.
# NOTE(review): this classifier instance is not used by the statements
# visible in this section; confirm whether later code relies on it.
xgb_model = xgb.XGBClassifier()

# Booster parameters.
# NOTE(review): `silent` is a legacy option; newer xgboost releases use
# `verbosity` instead -- confirm against the installed version.
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'num_class': 2,
    'learning_rate': 0.1,
    'max_depth': 2,
    'silent': 0,
}
# The parameters as a list of (name, value) pairs.
plst = list(params.items())
# Maximum number of boosting rounds.
num_rounds = 2000
 
# Hold out 20% of the training data as a validation set (fixed random seed).
train_x, val_X, train_y, val_y = train_test_split(
    X_train,
    y_train,
    random_state=1,
    test_size=0.2,
)

# Wrap each data set in an xgboost DMatrix.
xgb_train = xgb.DMatrix(data=train_x, label=train_y)
xgb_val = xgb.DMatrix(data=val_X, label=val_y)
xgb_test = xgb.DMatrix(data=X_test)

# Watchlist: the data sets whose metrics are reported while training runs.
watchlist = [
    (xgb_train, 'train'),
    (xgb_val, 'val'),
]
 
# Train the model.
# With a large round count, early_stopping_rounds stops training once the
# validation metric has not improved for that many consecutive rounds.
model = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=100)

# Predict the test set using only the trees up to the best iteration.
best_rounds = model.best_iteration + 1
try:
    # `ntree_limit` / `best_ntree_limit` were removed in xgboost 2.0;
    # `iteration_range` is the replacement.
    preds = model.predict(xgb_test, iteration_range=(0, best_rounds))
except TypeError:
    # Older xgboost releases do not accept `iteration_range`.
    preds = model.predict(xgb_test, ntree_limit=best_rounds)

# Write the results: one "id,prediction" row per test sample.
# Use the real passenger ids when the test set provides them; otherwise fall
# back to a 1-based running index.
# NOTE(review): assumes the id column is named 'PassengerId' -- confirm
# against the actual test.csv.
if 'PassengerId' in test.columns:
    passenger_ids = test['PassengerId'].values
else:
    passenger_ids = np.arange(1, len(X_test) + 1)
# The header names both columns (the previous single 'Label' header did not
# match the two-column rows).
np.savetxt('/Users/Administrator/Desktop/gender_submission.csv',
           np.c_[passenger_ids, preds],
           delimiter=',', header='PassengerId,Survived', comments='', fmt='%d')