In [1]:
import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read both CSV splits from the working directory and report their dimensions.
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))
print("train's shape:", train.shape)
print("test's shape:", test.shape)

train's shape: (891, 12)
test's shape: (418, 11)


In [2]:
# Preview the first two training rows to see the available columns.
train.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


### 1 查看缺失值个数

In [3]:
# Per-column count of missing entries in each split.
# isnull() flags missing cells; summing the boolean frame column-wise counts them
# (equivalent to train.apply(lambda x: sum(x.isnull()))).
train_null, test_null = (frame.isnull().sum() for frame in (train, test))

#### 查看缺失值比例

In [4]:
# Side-by-side missing-value counts, ordered by how many values the training split lacks.
df_null = pd.DataFrame({"train_null": train_null, "test_null": test_null})
df_null = df_null.sort_values(by="train_null", ascending=False)

# Convert the raw counts into a fraction of each split's row count.
df_null["train_null_rate"] = df_null["train_null"] / len(train)
df_null["test_null_rate"] = df_null["test_null"] / len(test)
df_null

Unnamed: 0,test_null,train_null,train_null_rate,test_null_rate
Cabin,327.0,687,0.771044,0.782297
Age,86.0,177,0.198653,0.205742
Embarked,0.0,2,0.002245,0.0
Fare,1.0,0,0.0,0.002392
Name,0.0,0,0.0,0.0
Parch,0.0,0,0.0,0.0
PassengerId,0.0,0,0.0,0.0
Pclass,0.0,0,0.0,0.0
Sex,0.0,0,0.0,0.0
SibSp,0.0,0,0.0,0.0


### 2 缺失值较少时，对于数值型变量可以用均值或中位数代替；对于类别型变量可以考虑用众数代替

#### 2.1 用训练集Fare的均值代替test中Fare特征的缺失值

In [5]:
# Impute missing test fares with the mean fare computed on the training split only.
test["Fare"] = test["Fare"].fillna(value=train["Fare"].mean())
print("test中Fare列缺失值个数 :", test["Fare"].isnull().sum())

test中Fare列缺失值个数 : 0


#### 2.2 用训练集中Embarked列的众数代替其缺失值

In [6]:
# Frequency of each embarkation port; the top entry is the mode.
# Signature: Series.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=True)
train["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [7]:
# Impute missing embarkation ports with the column's mode, computed from the data
# rather than hard-coded (mode() ignores NaN and returns the most frequent value first).
embarked_mode = train['Embarked'].mode().iloc[0]
train['Embarked'] = train['Embarked'].fillna(embarked_mode)
print("train中Embarked列缺失值个数 :",train['Embarked'].isnull().sum())

train中Embarked列缺失值个数 : 0


#### 2.3 在train、test中Age列的缺失值占比均为20%左右，考虑用建模来预测缺失的年龄值

### 3 构建哑变量

In [8]:
# Stack the training features (label removed) on top of the test rows and
# renumber the rows 0..n-1 so both splits can be encoded together.
train_data = train.drop("Survived", axis=1)
full_data = pd.concat([train_data, test], ignore_index=True)
full_data.shape

(1309, 11)

In [9]:
# Remove the free-text columns, then relabel the numeric passenger class as strings.
# NOTE(review): the string labels presumably exist so get_dummies one-hot encodes Pclass — confirm.
full_data = full_data.drop(labels=["Name", "Ticket", "Cabin"], axis="columns")
full_data["Pclass"] = full_data["Pclass"].map({1: "1st", 2: "2nd", 3: "3rd"})
print(full_data.shape)

(1309, 8)


##### pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False)

In [10]:
# One-hot encode the three categorical columns.
dummy_list = ["Pclass", "Sex", "Embarked"]
dummy_data = full_data[dummy_list]
dummy_df = pd.get_dummies(data=dummy_data)
print(dummy_df.shape)
dummy_df.head()

(1309, 8)


Unnamed: 0,Pclass_1st,Pclass_2nd,Pclass_3rd,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1,0,1,0,0,1
1,1,0,0,1,0,1,0,0
2,0,0,1,1,0,0,0,1
3,1,0,0,1,0,0,0,1
4,0,0,1,0,1,0,0,1


In [11]:
# Swap the raw categorical columns for their one-hot encoded counterparts,
# aligning the two frames on the row index.
temp = full_data.drop(labels=dummy_list, axis="columns")
full_data = temp.join(other=dummy_df, how="outer")
print(full_data.shape)
full_data.head()

(1309, 13)


Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Pclass_1st,Pclass_2nd,Pclass_3rd,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,22.0,1,0,7.25,0,0,1,0,1,0,0,1
1,2,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,3,26.0,0,0,7.925,0,0,1,1,0,0,0,1
3,4,35.0,1,0,53.1,1,0,0,1,0,0,0,1
4,5,35.0,0,0,8.05,0,0,1,0,1,0,0,1


In [12]:
def age_class(x):
    """Map a numeric age to a coarse age-bucket label.

    Buckets are contiguous half-open intervals:
        x < 6          -> 'child'
        6 <= x < 20    -> 'teenager'
        20 <= x < 30   -> 'gold_age'
        x >= 30        -> 'old'

    A missing age (NaN) fails every comparison and is labelled 'unkown'.
    The misspelling is kept on purpose: the label becomes a dummy-column
    name downstream, so renaming it would change the feature set.

    Previously the last bucket was ``x > 35``, which left ages 30-35
    (inclusive) unmatched and silently mixed them with missing ages.
    """
    if x < 6:
        return 'child'
    elif x < 20:
        return 'teenager'
    elif x < 30:
        return 'gold_age'
    elif x >= 30:
        return 'old'
    else:
        # Only reached when every comparison above is False, i.e. x is NaN.
        return 'unkown'

In [13]:
# Bucket each passenger's age and append one indicator column per bucket.
# The bucket labels are encoded directly, without a temporary helper column.
age_dummies = pd.get_dummies(full_data["Age"].apply(age_class))
full_data = full_data.join(age_dummies)
full_data.head(3)

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Pclass_1st,Pclass_2nd,Pclass_3rd,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,child,gold_age,old,teenager,unkown
0,1,22.0,1,0,7.25,0,0,1,0,1,0,0,1,0,1,0,0,0
1,2,38.0,1,0,71.2833,1,0,0,1,0,1,0,0,0,0,1,0,0
2,3,26.0,0,0,7.925,0,0,1,1,0,0,0,1,0,1,0,0,0


In [14]:
# full_data was built by stacking the training rows first, so the leading
# train.shape[0] rows are the training passengers (no hard-coded row count).
xgb_train = full_data.iloc[:train.shape[0], :]
label = train["Survived"]

# NOTE(review): xgb_train still contains PassengerId, an identifier rather than a
# predictive feature. Consider dropping it here and for the test rows used at prediction time.
# Hold out 20% of the labelled rows for validation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    xgb_train, label, test_size=0.2, random_state=2018
)

model = xgb.XGBClassifier(random_state=2018)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=2018, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)