In [None]:
!pip install catboost

In [None]:
# Colab-only: mount Google Drive into the runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# Raw training table exactly as read from disk; all original columns are kept here.
training_file = pd.read_csv('filepath/train.csv')

# Working frame: the raw table minus the columns that are not used as features.
unused_columns = ['Name', 'Embarked', 'Ticket', 'Cabin']
df = training_file.drop(columns=unused_columns)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,male,22.0,1,0,7.2500
1,2,1,1,female,38.0,1,0,71.2833
2,3,1,3,female,26.0,0,0,7.9250
3,4,1,1,female,35.0,1,0,53.1000
4,5,0,3,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000
887,888,1,1,female,19.0,0,0,30.0000
888,889,0,3,female,,1,2,23.4500
889,890,1,1,male,26.0,0,0,30.0000


In [None]:
# Fit the label encoder on the training 'Sex' column; the fitted `encoder`
# stays available to later cells.
encoder = LabelEncoder()
sex_codes = encoder.fit_transform(df['Sex'])

# Swap the textual 'Sex' column for its integer encoding (appended as the last column).
df = df.assign(sex_encoded=sex_codes).drop(columns=['Sex'])

# Label column, and the feature matrix without the identifier and the label.
target = df['Survived']
data = df.drop(columns=['PassengerId', 'Survived'])

In [None]:
# Hold out 30% of the labelled rows for validation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Hyper-parameter grid searched for CatBoost.
catboost_grid = {
    'iterations': [250, 500, 1000, 1500],
    'learning_rate': [0.001, 0.05, 0.01, 0.1, 0.2],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [5, 10],
    # Single-valued entry: applied to every candidate (presumably to silence
    # the per-iteration training log).
    'verbose': [False],
}

# Hyper-parameter grid searched for XGBoost.
xgboost_grid = {
    'n_estimators': [50, 100, 250, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 9],
    # Single-valued entry: applied to every candidate.
    'verbosity': [0],
}

# Model name -> unfitted estimator ('model') and the grid to search ('params').
# The name is also used to build the file name of the saved best estimator.
model_params = {
    'CatBoostClassifier': {'model': CatBoostClassifier(), 'params': catboost_grid},
    'XGBClassifier': {'model': XGBClassifier(), 'params': xgboost_grid},
}

In [None]:
def train_and_save_models(x_train, y_train, cv=3, scoring=None):
    """Grid-search every model in ``model_params`` and save each best estimator.

    For each configured model this runs ``GridSearchCV`` over its parameter
    grid, writes the refit best estimator to ``<model_name>_best_model.joblib``
    in the current working directory, and prints the best cross-validation
    score, the best parameters and the ROC AUC on the training data.

    Args:
        x_train: Training feature matrix.
        y_train: Binary training labels aligned with ``x_train``.
        cv: Cross-validation setting forwarded to ``GridSearchCV``. Defaults
            to 3, the value that used to be hard-coded.
        scoring: Metric used to rank parameter combinations, forwarded to
            ``GridSearchCV``. ``None`` (the default) keeps the previous
            behaviour of using the estimator's own ``score`` method; pass
            ``'roc_auc'`` to select on the same metric that is reported here
            and in ``validate_models``.

    Returns:
        None. Results are reported via ``print`` and the saved files.
    """
    for model_name, mp in model_params.items():
        clf = GridSearchCV(
            mp['model'],
            mp['params'],
            cv=cv,
            scoring=scoring,
            return_train_score=False,
        )
        clf.fit(x_train, y_train)

        # Persist exactly the estimator that is evaluated below.
        best_model = clf.best_estimator_
        joblib.dump(best_model, f'{model_name}_best_model.joblib')

        # AUC on the data the model was fit on: an optimistic figure, useful
        # mainly to compare against the held-out AUC for signs of overfitting.
        y_scores = best_model.predict_proba(x_train)[:, 1]
        auc_score = roc_auc_score(y_train, y_scores)
        print(f"Model: {model_name}, Best Score: {clf.best_score_:.2f}, Best Params: {clf.best_params_}, AUC (Train): {auc_score:.2f}")

def validate_models(x_test, y_test):
    """Report the ROC AUC of every saved best estimator on held-out data.

    Loads ``<model_name>_best_model.joblib`` for each key of ``model_params``
    and prints one line per model.

    Args:
        x_test: Held-out feature matrix.
        y_test: Binary labels aligned with ``x_test``.
    """
    for model_name in model_params:
        saved_path = f'{model_name}_best_model.joblib'
        fitted = joblib.load(saved_path)

        # Column 1 of predict_proba is used as the positive-class score.
        class_probabilities = fitted.predict_proba(x_test)
        auc_score = roc_auc_score(y_test, class_probabilities[:, 1])
        print(f"Model: {model_name}, AUC (Test): {auc_score:.2f}")

In [None]:
    # Grid-search and save the best estimator of every configured model;
    # one summary line is printed per model.
    print('Training set:')
    train_and_save_models(X_train, y_train)

    # Reload the saved estimators and score them on the held-out split.
    print()
    print('Validation set:')
    validate_models(X_test, y_test)

Training set:
Model: CatBoostClassifier, Best Score: 0.83, Best Params: {'depth': 4, 'iterations': 1500, 'l2_leaf_reg': 10, 'learning_rate': 0.01, 'verbose': False}, AUC (Train): 0.94
Model: XGBClassifier, Best Score: 0.83, Best Params: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 500, 'verbosity': 0}, AUC (Train): 0.96

Validation set:
Model: CatBoostClassifier, AUC (Test): 0.89
Model: XGBClassifier, AUC (Test): 0.87


In [None]:
# Raw unlabelled table exactly as read from disk; all original columns are kept here.
testing_file = pd.read_csv('filepath/test.csv')

# Apply the same preparation as for the training data. The encoder is only
# *applied* here (`transform`, not `fit_transform`): refitting on this file
# would rebuild the label -> integer mapping from the test data alone, which
# is not guaranteed to match the mapping the models were trained with.
# `transform` raises on a label that was never seen during training.
df_task = (
    testing_file
    .drop(columns=['Name', 'Embarked', 'Ticket', 'Cabin'])
    .assign(sex_encoded=lambda frame: encoder.transform(frame['Sex']))
    .drop(columns=['Sex'])
)

# Feature matrix for prediction; 'PassengerId' is an identifier, not a feature.
data_task = df_task.drop(columns=['PassengerId'])

In [None]:
# Choose which saved estimator produces the final predictions.
model_name = 'CatBoostClassifier'
model_path = f'{model_name}_best_model.joblib'
model = joblib.load(model_path)

# Predicted class label for every row of the prepared test features.
predictions = model.predict(data_task)
print(predictions)

[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


In [None]:
# Sanity check on the number of predicted rows; reuses the predictions that
# were already computed instead of running inference again.
print(len(predictions))

418


In [None]:
# Build the submission table from the ids actually read from the test file
# rather than a hard-coded 892..1309 range, so every id stays paired with the
# prediction made for its row. The id column uses the dataset's own
# spelling, 'PassengerId'.
df_final = pd.DataFrame({
    'PassengerId': df_task['PassengerId'].to_numpy(),
    # ravel() keeps this a flat column even if the model returns an (n, 1) array.
    'Survived': np.asarray(predictions).ravel(),
})

path = 'filepath/submision.csv'
df_final.to_csv(path, index=False)