In [14]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/breast-cancer/Breast_Cancer.csv


Load the training, validation and test datasets:

In [13]:
from sklearn.model_selection import train_test_split

# Read the data
X_full = pd.read_csv('../input/Titanic - Machine Learning from Disaster/train.csv', index_col='PassengerId')
X_test_full = pd.read_csv('../input/Titanic - Machine Learning from Disaster/test.csv', index_col='PassengerId')

# Remove rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset=['Survived'], inplace=True)
y = X_full.Survived
X_full.drop(['Survived'], axis=1, inplace=True)

# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, train_size=0.8, test_size=0.2, random_state=0)

# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

FileNotFoundError: [Errno 2] No such file or directory: '../input/Titanic - Machine Learning from Disaster/train.csv'

Data analysys:

In [None]:
X_train

In [None]:
X_train.describe()

In [None]:
X_train.isna().sum()

Data preprocessing (with imputation = BEST):

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define a model
model = RandomForestClassifier(n_estimators=100, random_state=0)

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Preprocessing of training data, fit model 
clf.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
y_pred = clf.predict(X_valid)

print('Validation accuracy:', accuracy_score(y_valid, y_pred))

Gradient Boosting:


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Define a model (with the best hyperparameters defined by GridSearchCV)
model = XGBClassifier()

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Param grid
param_grid = {
    'model__max_depth': [3, 4, 5],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__subsample': [0.7, 0.8, 0.9],
    'model__colsample_bytree': [0.7, 0.8, 0.9]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Preprocessing of training data, fit model 
grid_search.fit(X_train, y_train, 
        model__early_stopping_rounds=5,
        model__eval_set=[(preprocessor.transform(X_valid), y_valid)],
        model__verbose=True)

# Evaluate the best model on the validation set
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_valid)


# Fit the data with hyperparameter-optimized model
clf.fit(X_train, y_train, 
        model__early_stopping_rounds=5,
        model__eval_set=[(preprocessor.transform(X_valid), y_valid)],
        model__verbose=True)

# Make predictions
y_pred = clf.predict(X_valid)
print('Validation accuracy:', accuracy_score(y_valid, y_pred))

(XGBClassifier best params n_estimators=1000,
                      max_depth=5,
                      learning_rate=0.2,
                      subsample=0.8,
                      colsample_bytree=0.8,
                      random_state=0)

Generate Test results:

In [None]:
# Get test results
preds_test = clf.predict(X_test)

# Save test predictions to file
output = pd.DataFrame({'PassengerId': X_test.index,
                       'Survived': preds_test})
output.to_csv('titanic.csv', index=False)