# **1) Initiall instructions**

In [1]:
!pip install optuna
!pip install xgboost

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import optuna
from xgboost import XGBClassifier
import os

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# **2) Data preprocess**

In [4]:
data_train = pd.read_csv('/content/drive/MyDrive/train_titanic.csv')
df_train = pd.DataFrame(data_train)



data_test = pd.read_csv('/content/drive/MyDrive/test_titanic.csv')
df_test = pd.DataFrame(data_test)
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
df_train = df_train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin',])
df_test = df_test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin',])

y = df_train['Survived']
X = df_train.drop(columns=['Survived'])

In [6]:
#Overview over NaN values within dataset
df_train.isna().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


In [7]:
cat_cols = ['Sex', 'Embarked']
num_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [8]:
cat_pipe = Pipeline(steps = [
    ('simpleimputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

num_pipe = Pipeline(steps = [
    ('simpleimputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [9]:
col_transformer = ColumnTransformer(transformers=[
    ('num_pipe', num_pipe, num_cols),
    ('cat_pipe', cat_pipe, cat_cols)
], remainder='drop', n_jobs=-1
)

# **3) Estimators evaluation**

In [10]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [11]:
list_of_models = [LogisticRegression(random_state=42), SVC(random_state=42), RandomForestClassifier(random_state=42), XGBClassifier(random_state=42)]
lr_acc = []
svm_acc = []
rf_acc = []
xgb_acc = []

for elem in list_of_models:
  model = elem
  pipefinal = make_pipeline(col_transformer, model)
  scores = cross_val_score(pipefinal, X, y, cv=skf).mean()

  if isinstance(model, LogisticRegression):
    lr_acc.append(scores)
  elif isinstance(model, SVC):
    svm_acc.append(scores)
  elif isinstance(model, RandomForestClassifier):
    rf_acc.append(scores)
  else:
    xgb_acc.append(scores)

In [12]:
print(f'Random Forest Classifier accuracy: {rf_acc[0]*100:.2f} %')
print(f'Support Vector Classifier accuracy: {svm_acc[0]*100:.2f} %')
print(f'XGBoost Classifier accuracy: {xgb_acc[0]*100:.2f} %')
print(f'Logistic Regression accuracy: {lr_acc[0]*100:.2f} %')

Random Forest Classifier accuracy: 81.82 %
Support Vector Classifier accuracy: 82.60 %
XGBoost Classifier accuracy: 81.70 %
Logistic Regression accuracy: 79.24 %


# **4) Optimization**

## **4) SVC optimazation**

In [13]:
def objective(trial):
    C = trial.suggest_float('C', 1e-3, 1e2)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'sigmoid'])
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

    model = SVC(C=C,
                kernel=kernel,
                gamma=gamma,
                random_state=42
          )

    pipefinal = make_pipeline(col_transformer, model)
    score = cross_val_score(pipefinal, X, y, cv=skf, scoring='accuracy').mean()

    return score

In [14]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-11-17 19:16:02,286] A new study created in memory with name: no-name-1403ba7c-01f0-4661-b1c8-2352eb127123
[I 2024-11-17 19:16:02,855] Trial 0 finished with value: 0.6666813131630154 and parameters: {'C': 71.37367180183394, 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 0 with value: 0.6666813131630154.
[I 2024-11-17 19:16:03,510] Trial 1 finished with value: 0.8069612704789405 and parameters: {'C': 62.21728177327482, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 1 with value: 0.8069612704789405.
[I 2024-11-17 19:16:04,092] Trial 2 finished with value: 0.8080848659845584 and parameters: {'C': 63.62905685613583, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 2 with value: 0.8080848659845584.
[I 2024-11-17 19:16:05,688] Trial 3 finished with value: 0.7867553825874082 and parameters: {'C': 74.5808137441816, 'kernel': 'linear', 'gamma': 'auto'}. Best is trial 2 with value: 0.8080848659845584.
[I 2024-11-17 19:16:06,045] Trial 4 finished with value: 0.6700520996798695 and

In [15]:
best_params = study.best_params

best_model = SVC(
    C = best_params['C'],
    kernel = best_params['kernel'],
    gamma = best_params['gamma']
)

In [16]:
best_pipefinal = make_pipeline(col_transformer, best_model)
best_pipefinal.fit(X, y)

In [17]:
prediction = best_pipefinal.predict(df_test)
list_of_entries = range(892, len(y) + 892)

out_df = pd.DataFrame(data=zip(list_of_entries, prediction),
                   columns = ['PassengerId' , 'Survived'])

filepath = os.path.join(os.getcwd(), 'titanic_submission_pipeline_svm.csv')
out_df.to_csv(filepath, index=False)