In [None]:
# Install every pinned dependency in a single call so pip resolves the pins
# together; %pip (rather than !pip) installs into the running kernel's environment.
%pip install -q pandas==2.0.3 numpy==1.25.2 scikit-learn==1.3.0 catboost==1.2.2 optuna==3.2.0

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import optuna

In [None]:
# Create the Kaggle config directory; -p makes this a no-op (instead of an error) when it already exists.
! mkdir -p ~/.kaggle

In [None]:
# Copy the Kaggle API token file from Google Drive to ~/.kaggle/kaggle.json.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount step is visible in this notebook, confirm.
!cp /content/drive/MyDrive/CollabData/kaggle_API/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Restrict the token file to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the "titanic" competition files with the Kaggle CLI.
# NOTE(review): the saved output for this cell mentions home-data-for-ml-course.zip, so it looks stale; re-run to confirm.
! kaggle competitions download titanic

Downloading home-data-for-ml-course.zip to /content
  0% 0.00/386k [00:00<?, ?B/s]
100% 386k/386k [00:00<00:00, 62.5MB/s]


In [None]:
# Extract the downloaded archive; -o overwrites existing files so a re-run
# never blocks on unzip's interactive "replace?" prompt.
! unzip -o titanic.zip

Archive:  home-data-for-ml-course.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: sample_submission.csv.gz  
  inflating: test.csv                
  inflating: test.csv.gz             
  inflating: train.csv               
  inflating: train.csv.gz            


In [None]:
# Read the raw train/test CSVs from /content into DataFrames
train_df, test_df = map(pd.read_csv, ('/content/train.csv', '/content/test.csv'))

# Preprocessing
def preprocess_data(df):
    """Return a cleaned, one-hot-encoded copy of a passenger DataFrame.

    Steps:
      1. Drop the PassengerId, Name, Ticket and Cabin columns.
      2. Fill missing ``Age`` with the column median and missing ``Embarked``
         with the most frequent value.
      3. One-hot encode ``Sex`` and ``Embarked``, dropping the first level of each.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain PassengerId, Name, Ticket, Cabin, Age, Sex and Embarked.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The transformed frame; all other columns pass through unchanged.
    """
    # drop() returns a new frame, so the caller's DataFrame stays untouched
    cleaned = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

    # Assign the filled columns back rather than calling
    # df[col].fillna(..., inplace=True): that chained in-place form operates on
    # a temporary Series and is a silent no-op under pandas Copy-on-Write.
    cleaned['Age'] = cleaned['Age'].fillna(cleaned['Age'].median())

    # mode() is empty when every Embarked value is missing; skip the fill
    # in that case instead of failing on the [0] lookup.
    embarked_mode = cleaned['Embarked'].mode()
    if not embarked_mode.empty:
        cleaned['Embarked'] = cleaned['Embarked'].fillna(embarked_mode.iloc[0])

    # One-hot encoding for categorical features
    return pd.get_dummies(cleaned, columns=['Sex', 'Embarked'], drop_first=True)

# Keep the test-set passenger ids before preprocessing drops the column;
# they are needed to build a submission file.
test_passenger_ids = test_df['PassengerId'].copy()

train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

# Separate features and target
X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

# Train and test are one-hot encoded independently, so force the test frame
# onto the training feature columns (same set, same order). A dummy column
# missing from the test data is filled with 0.
test_df = test_df.reindex(columns=X.columns, fill_value=0)

# Split data into training and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def objective(trial):
    """Optuna objective: fit one CatBoost classifier and score it on the validation split.

    Samples a hyperparameter configuration from ``trial``, trains on
    ``X_train``/``y_train`` with ``X_val``/``y_val`` as the eval set, and
    returns the accuracy on the validation split. Higher is better, so the
    study that runs this objective should use ``direction='maximize'``.

    NOTE(review): ``fit`` receives ``early_stopping_rounds=100`` while
    ``od_type``/``od_wait`` are also being tuned. Check the CatBoost docs for
    how these interact; the sampled detector settings may be overridden.
    """
    search_space = dict(
        iterations=trial.suggest_int('iterations', 100, 1000),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        depth=trial.suggest_int('depth', 4, 10),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        border_count=trial.suggest_int('border_count', 32, 255),
        random_strength=trial.suggest_float('random_strength', 1e-9, 10.0, log=True),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0.0, 10.0),
        od_type=trial.suggest_categorical('od_type', ['IncToDec', 'EBS']),
        od_wait=trial.suggest_int('od_wait', 10, 50),
        verbose=False,  # keep per-iteration training logs out of the notebook
    )

    clf = CatBoostClassifier(**search_space)
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
    )

    # Score for this trial: validation accuracy
    return accuracy_score(y_val, clf.predict(X_val))

In [None]:
# Create and run an Optuna study.
# objective() returns validation accuracy (higher is better), so the study
# must maximize it; with direction='minimize' best_params would be the
# worst configuration found. The sampler is seeded so the search is repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Get the best hyperparameters (only the values suggested inside objective())
best_params = study.best_params

[I 2025-02-12 17:06:22,968] A new study created in memory with name: no-name-b6917131-2e33-4958-9f1a-5ff31db63e3f


In [None]:
# Refit a classifier with the tuned hyperparameters on the training split,
# then predict labels for the preprocessed test set.
model = CatBoostClassifier(verbose=False, **best_params).fit(X_train, y_train)
predictions = model.predict(test_df)

In [None]:
# Evaluate accuracy on the validation set.
# Uses the names this notebook actually defines (model, X_val, accuracy_score)
# and reports accuracy, the metric the hyperparameter search optimizes;
# RMSE is not meaningful for predicted class labels.
val_predictions = model.predict(X_val)

val_accuracy = accuracy_score(y_val, val_predictions)

print(f"Validation accuracy: {val_accuracy:.4f}")

Validation RMSE: 23627.352285010693


In [None]:
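# Titanic submissions need the columns PassengerId and Survived. PassengerId is dropped
# from test_df during preprocessing, so it is re-read from the raw CSV here.
#submission_df = pd.DataFrame({'PassengerId': pd.read_csv('/content/test.csv')['PassengerId'], 'Survived': predictions})
#submission_df.to_csv('submission.csv', index=False)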