In [1]:
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install catboost
!pip install optuna

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7
Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from catboost import CatBoostClassifier
import optuna

In [3]:
! mkdir ~/.kaggle

In [4]:
!cp /content/drive/MyDrive/CollabData/kaggle_API/kaggle.json ~/.kaggle/kaggle.json

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
! kaggle competitions download titanic

Downloading titanic.zip to /content
  0% 0.00/34.1k [00:00<?, ?B/s]
100% 34.1k/34.1k [00:00<00:00, 15.8MB/s]


In [7]:
! unzip titanic.zip

Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [8]:
# Load the data
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

# Preprocessing
def preprocess_data(df):
    # Drop irrelevant columns
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

    # Fill missing values
    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

    # One-hot encoding for categorical features
    df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)

    return df

train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

# Separate features and target
X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we

In [12]:
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 1e-9, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 10.0),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']), # Changed 'EBS' to 'Iter'
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        'verbose': False  # Set verbose to False to suppress training output
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=100)  # Add early stopping

    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)

    return accuracy  # Optuna maximizes accuracy

In [13]:
# Create and run an Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)  # Increased number of trials

# Get the best hyperparameters
best_params = study.best_params

[I 2025-02-12 18:12:08,109] A new study created in memory with name: no-name-295f6a0b-8229-4cbe-9b4b-0c3a8d0eb33f
[I 2025-02-12 18:12:08,392] Trial 0 finished with value: 0.8044692737430168 and parameters: {'iterations': 318, 'learning_rate': 0.034481464697705064, 'depth': 7, 'l2_leaf_reg': 1.1994797933979945e-06, 'border_count': 223, 'random_strength': 0.43174503180082524, 'bagging_temperature': 6.46869503911912, 'od_type': 'Iter', 'od_wait': 49}. Best is trial 0 with value: 0.8044692737430168.
[I 2025-02-12 18:12:08,578] Trial 1 finished with value: 0.7877094972067039 and parameters: {'iterations': 210, 'learning_rate': 0.0759528981379955, 'depth': 7, 'l2_leaf_reg': 0.0008824337108021507, 'border_count': 149, 'random_strength': 0.18994229501442617, 'bagging_temperature': 4.980257872269012, 'od_type': 'Iter', 'od_wait': 34}. Best is trial 1 with value: 0.7877094972067039.
[I 2025-02-12 18:12:09,772] Trial 2 finished with value: 0.8100558659217877 and parameters: {'iterations': 742, 'l

In [14]:
# Model Training and Prediction
model = CatBoostClassifier(**best_params, verbose=False)  # Use best hyperparameters
model.fit(X_train, y_train)

predictions = model.predict(test_df)

In [15]:
# Evaluate accuracy on validation set
val_predictions = model.predict(X_val)

# Calculate RMSE without 'squared' argument and take the square root manually
rmse = mean_squared_error(y_val, val_predictions)**0.5

print(f"Validation RMSE: {rmse}")

Validation RMSE: 0.43582580703557733


In [17]:
# Create submission DataFrame with 'PassengerId' from original test data
submission_df = pd.DataFrame({'PassengerId': pd.read_csv('/content/test.csv')['PassengerId'], 'Survived': predictions})
submission_df.to_csv('submission.csv', index=False)