In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

**Preprocessing**

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
features = ['Pclass','Sex','Age','Embarked','SibSp','Parch','Fare']
X_train = train[features]
X_test = test[features]
y_train = train['Survived']

X_train.info(), X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
Embarked    889 non-null object
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         332 non-null float64
Embarked    418 non-null object
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        417 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


(None, None)

In [5]:
X_train['Age'].fillna(X_train['Age'].mean(), inplace=True)
print(X_train['Embarked'].value_counts())
X_train['Embarked'].fillna('S', inplace=True)
X_train.info()

S    644
C    168
Q     77
Name: Embarked, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
Embarked    891 non-null object
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [6]:
X_test['Fare'].fillna(X_test['Fare'].mean(), inplace=True)
X_test['Age'].fillna(X_test['Age'].mean(), inplace=True)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         418 non-null float64
Embarked    418 non-null object
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [7]:
from sklearn.feature_extraction import DictVectorizer

In [8]:
dictVec = DictVectorizer(sparse=False)
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


In [9]:
X_train = dictVec.fit_transform(X_train.to_dict(orient='record'))
X_test = dictVec.transform(X_test.to_dict(orient='record'))

**Training**

In [10]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [11]:
rfc = RandomForestClassifier()
xgb = XGBClassifier()

In [13]:
print('rfc: ', cross_val_score(rfc, X_train, y_train, cv=5).mean())
print('xgb: ', cross_val_score(xgb, X_train, y_train, cv=5).mean())



rfc:  0.8070470925640837
xgb:  0.81824559798311


In [14]:
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [16]:
xgb_pred = xgb.predict(X_test)

In [18]:
xgb_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': xgb_pred})

In [19]:
xgb_submission.to_csv('xgb_submission.csv', index=False)

In [25]:
#Parallel search for better model
from sklearn.model_selection import GridSearchCV

In [26]:
params = {'max_depth':range(2,7), 'n_estimators':range(100, 1100, 200), 'learning_rate':[0.05, 0.1, 0.25, 0.5, 1.0]}

In [27]:
xgb_improve = XGBClassifier()

In [28]:
gs = GridSearchCV(xgb_improve, params, n_jobs=-1, cv=5, verbose=1)

In [29]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   53.3s
[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed:  1.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=-1,
             param_grid={'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0],
                         'max_depth': range(2, 7),
                         'n_estimators': range(

In [34]:
print (gs.best_score_)

0.835016835016835


In [35]:
xgb_improve_pred = gs.predict(X_test)
xgb_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': xgb_improve_pred})

In [36]:
xgb_submission.to_csv('xgb_improve.csv', index=False)