In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

from xgboost import XGBClassifier



In [5]:
#import data
X_train_full = pd.read_csv("../Data/train.csv")
Test_data = pd.read_csv("../Data/test.csv")

In [6]:
#inspect the data

#X_train_full.head()
X_train_full.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
# Remove rows with missing target - in this case there are none
# set target and drop target from predictors

y = X_train_full["Survived"]
X_train_full.drop(["Survived"], axis = 1, inplace = True)

In [8]:
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y, train_size=0.8, test_size=0.2)

In [9]:
#DEAL WITH MISSING DATA
#get names of columns with missing values

col_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
print(col_with_missing)

['Age', 'Cabin', 'Embarked']


In [10]:
#how many values are missing in each column?
missing_values_count_by_column = X_train_full.isnull().sum()
print(missing_values_count_by_column)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [11]:
#get numerical and categorical columns
all_columns = X_train_full.columns
print(all_columns)

numerical_columns = [col for col in X_train_full.columns if X_train_full[col].dtype in ["int64", "float64"]]
print("Numerical columns: ", numerical_columns)
categorical_columns = [col for col in X_train_full.columns if X_train_full[col].dtype in ["object"]]
print("Categorical columns: ", categorical_columns)

#check if added columns are all columns
len(all_columns) == len(categorical_columns + numerical_columns)

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Numerical columns:  ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Categorical columns:  ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


True

In [12]:
#check cardinality of categorical columns

low_cardinality_cols = [col for col in X_train_full[categorical_columns] if X_train_full[col].nunique() < 10 and 
                        X_train_full[col].dtype == "object"]
print(low_cardinality_cols)

high_cardinality_cols = [col for col in X_train_full[categorical_columns] if X_train_full[col].nunique() >= 10 and 
                        X_train_full[col].dtype == "object"]

len(categorical_columns) == len(low_cardinality_cols + high_cardinality_cols)

['Sex', 'Embarked']


True

In [13]:
X_train_full[categorical_columns].nunique()

Name        891
Sex           2
Ticket      681
Cabin       147
Embarked      3
dtype: int64

In [14]:
#define numerical and categorical transformers (preprocessing)
#remember to change strategies later to see which one performs best

numerical_transformer = SimpleImputer(strategy = "constant")

low_categorical_transformer = Pipeline(steps = [("imputer", SimpleImputer(strategy = "most_frequent")), 
                                               ("OH_encoder", OneHotEncoder(handle_unknown = "ignore"))])

high_categorical_transformer = Pipeline(steps = [("imputer", SimpleImputer(strategy = "most_frequent")),
                                                 ("labeler", LabelEncoder())])

# #bundle them together in one preprocessor via ColumnTransformer: (syntax like pipeline but don't forget to add columns)

preprocessor = ColumnTransformer(transformers = [("num", numerical_transformer, numerical_columns),
                                                 ("low_cat", low_categorical_transformer, low_cardinality_cols)])

preprocessor.fit_transform(X_train)
preprocessor.transform(X_train)

array([[ 51.,   3.,   7., ...,   0.,   0.,   1.],
       [563.,   2.,  28., ...,   0.,   0.,   1.],
       [681.,   3.,   0., ...,   0.,   1.,   0.],
       ...,
       [158.,   3.,  30., ...,   0.,   0.,   1.],
       [177.,   3.,   0., ...,   0.,   0.,   1.],
       [694.,   3.,  25., ...,   1.,   0.,   0.]])

In [15]:
#Drop some data
X_train.drop(["Name", "Cabin", "Ticket"], axis = 1, inplace = True)
Test_data.drop(["Name", "Cabin", "Ticket"], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [16]:
#Fit preprocessor
preprocessor.fit_transform(X_train)
preprocessor.transform(Test_data)

array([[8.920e+02, 3.000e+00, 3.450e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [8.930e+02, 3.000e+00, 4.700e+01, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [8.940e+02, 2.000e+00, 6.200e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [1.307e+03, 3.000e+00, 3.850e+01, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [1.308e+03, 3.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [1.309e+03, 3.000e+00, 0.000e+00, ..., 1.000e+00, 0.000e+00,
        0.000e+00]])

In [17]:
#define the model

my_model = XGBClassifier(n_estimators=300, learning_rate=0.01, n_jobs=6, early_stopping_rounds = 5)


In [18]:
#pipeline preprocessor and model:

my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', my_model)
                             ])

In [19]:
my_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['PassengerId', 'Pclass',
                                                   'Age', 'SibSp', 'Parch',
                                                   'Fare']),
     

In [20]:
my_model = XGBClassifier(n_estimators=350, learning_rate=0.005, n_jobs=6, early_stopping_rounds = 5)
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', my_model)
                             ])
my_pipeline.fit(X_train, y_train)
preds = my_pipeline.predict(X_valid)
score = mean_absolute_error(preds, y_valid)
print("MAE: ", score)

MAE:  0.19553072625698323


In [21]:
predictions = my_pipeline.predict(Test_data)


In [22]:
output = pd.DataFrame({'PassengerId': Test_data.PassengerId, 'Survived': predictions})
output.to_csv('XGBoost_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
