In [2]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer

In [3]:
# Read the three competition CSV files from the working directory.
train_set, test_set, gender_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

In [4]:
# Keep only the training rows that have an 'Embarked' value, then show the remaining shape.
has_embarked = train_set['Embarked'].notna()
train_set = train_set.loc[has_embarked]
train_set.shape

(889, 12)

In [5]:
# Columns removed from both feature matrices before modelling.
unused_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# Target vector and feature matrices; the training features additionally lose the target column.
Y_train_all = train_set['Survived']
X_train_all = train_set.drop(columns=unused_cols + ['Survived'])
X_test = test_set.drop(columns=unused_cols)

In [6]:
# Preview the first ten rows of the training feature matrix.
X_train_all.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S
5,3,male,,0,0,8.4583,Q
6,1,male,54.0,0,0,51.8625,S
7,3,male,2.0,3,1,21.075,S
8,3,female,27.0,0,2,11.1333,S
9,2,female,14.0,1,0,30.0708,C


In [7]:
# Names of the feature columns stored with the 'object' dtype (treated as categorical below).
object_cols = [
    col_name
    for col_name, col_dtype in X_train_all.dtypes.items()
    if col_dtype == 'object'
]
object_cols

['Sex', 'Embarked']

In [8]:
# Names of every feature column whose dtype is not 'object' (treated as numerical below).
numerical_cols = [
    col_name
    for col_name, col_dtype in X_train_all.dtypes.items()
    if col_dtype != 'object'
]
numerical_cols

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [16]:
# Numerical columns: fill missing entries with a constant. No fill_value is passed,
# so SimpleImputer's own default for the 'constant' strategy applies.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing entries with the most frequent category, then
# one-hot encode; categories unseen at fit time are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, object_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [41]:
from xgboost import XGBRegressor, XGBClassifier

# Settings shared by all three candidate estimators.
shared_params = dict(n_estimators=1000, n_jobs=4)

# One regressor and two classifiers that differ only in learning rate.
model = XGBRegressor(**shared_params)
model1 = XGBClassifier(learning_rate=0.05, **shared_params)
model2 = XGBClassifier(learning_rate=0.1, **shared_params)

In [13]:
# Print the built-in documentation for XGBClassifier (interactive reference lookup).
help(XGBClassifier)

Help on class XGBClassifier in module xgboost.sklearn:

class XGBClassifier(XGBModel, sklearn.base.ClassifierMixin)
 |  XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, verbosity=1, silent=None, objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, **kwargs)
 |  
 |  Implementation of the scikit-learn API for XGBoost classification.
 |  
 |  Parameters
 |  ----------
 |  max_depth : int
 |      Maximum tree depth for base learners.
 |  learning_rate : float
 |      Boosting learning rate (xgb's "eta")
 |  n_estimators : int
 |      Number of trees to fit.
 |  verbosity : int
 |      The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
 |  silent : boolean
 |      Whether to print messages while running boosting. Deprecat

In [42]:
# Chain preprocessing and the learning-rate-0.1 classifier into a single estimator.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model2),
]
my_pipeline = Pipeline(steps=pipeline_steps)

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_all,
    Y_train_all,
    test_size=0.2,
    random_state=0,
)

In [43]:
# Fit the preprocessing + model pipeline on the training split only.
my_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare']),
                                                 ('cat',
              

In [44]:
# Predict on the held-out validation split.
y_pred=my_pipeline.predict(X_val)

In [25]:
from sklearn.metrics import classification_report, confusion_matrix

In [45]:
# Confusion matrix for the validation split (rows: true class, columns: predicted class).
confusion_matrix(y_val,y_pred)

array([[85, 20],
       [23, 50]], dtype=int64)

In [46]:
# classification_report returns one newline-delimited string. Print it so the table
# renders as aligned rows rather than as a quoted repr full of literal "\n" escapes.
print(classification_report(y_val, y_pred))

'              precision    recall  f1-score   support\n\n           0       0.79      0.81      0.80       105\n           1       0.71      0.68      0.70        73\n\n    accuracy                           0.76       178\n   macro avg       0.75      0.75      0.75       178\nweighted avg       0.76      0.76      0.76       178\n'

In [21]:
# Refit the same pipeline on every labelled row (training and validation splits together).
my_pipeline.fit(X_train_all, Y_train_all)



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare']),
                                                 ('cat',
              

In [22]:
# Run the fitted pipeline on the test features.
preds = my_pipeline.predict(X_test)

In [23]:
# Binarise the predictions: anything above 0.5 becomes 1, everything else 0.
preds = (preds > 0.5).astype(int)

In [24]:
# Display the array of binarised test-set predictions.
preds

array([0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [25]:
# Build the submission from the PassengerId column of the frame the predictions were
# computed on, so ids and predictions stay aligned row for row. Taking the ids from a
# separate CSV would pair them with the predictions purely by position.
# pandas raises if the number of predictions does not match the number of ids.
ss1 = pd.DataFrame({
    'PassengerId': test_set['PassengerId'],
    'Survived': preds,
})
ss1.to_csv('submission10.csv', index=False)