In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [3]:
train = pd.read_csv('/kaggle/input/titanic/train.csv')

In [4]:
train['Age'] = train['Age'].fillna(train['Age'].mean())
train['Fare'] = train['Fare'].fillna(train['Fare'].mean())
train = train.drop(columns='Cabin')
train = train.dropna(subset=['Embarked'])

In [5]:
train = pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True)
train = train.drop(['Name', 'Ticket'], axis=1)

In [6]:
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']
target = ['Survived']

In [7]:
X = train[features]
y = train[target]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
gradient_boost = GradientBoostingClassifier(random_state=42)

In [10]:
gradient_boost.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [11]:
y_pred_gradient_boost = gradient_boost.predict(X_test)

In [12]:
accuracy_default = accuracy_score(y_test, y_pred_gradient_boost)
print("Accuracy of default Gradient Boosting model:", accuracy_default)

Accuracy of default Gradient Boosting model: 0.8146067415730337


In [13]:
param_grid_gradient_boost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [None]:
grid_search_gradient_boost = GridSearchCV(GradientBoostingClassifier(random_state=42),
                                          param_grid=param_grid_gradient_boost,
                                          scoring='accuracy',
                                          cv=5,
                                          n_jobs=-1)
grid_search_gradient_boost.fit(X_train, y_train)

In [15]:
best_params_gradient_boost = grid_search_gradient_boost.best_params_
print('Best parameters for Gradient Boosting:', best_params_gradient_boost)

Best parameters for Gradient Boosting: {'learning_rate': 0.05, 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}


In [16]:
best_gradient_boost = grid_search_gradient_boost.best_estimator_
print('Best estimator for Gradient Boosting:', best_gradient_boost)

Best estimator for Gradient Boosting: GradientBoostingClassifier(learning_rate=0.05, max_depth=5, min_samples_leaf=4,
                           min_samples_split=10, n_estimators=50,
                           random_state=42)


In [17]:
y_pred_gradient_boost_best = best_gradient_boost.predict(X_test)

In [19]:
test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [20]:
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [21]:
test = pd.get_dummies(test, columns=['Sex', 'Embarked'], drop_first=True)

In [22]:
X_test_final = test[features]
y_pred_test = best_gradient_boost.predict(X_test_final)

In [23]:
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': y_pred_test})

In [24]:
accuracy_best = accuracy_score(y_test, y_pred_gradient_boost_best)
print("Accuracy of best Gradient Boosting model:", accuracy_best)

Accuracy of best Gradient Boosting model: 0.797752808988764


In [25]:
submission.to_csv('submission.csv', index=False)

In [26]:
print(submission.head())

   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1
