# 1. Initial setup

### Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Seed for reproducibility

In [None]:
# Seed value intended for the notebook's randomized steps (e.g. as a
# `random_state` argument) so that repeated runs give the same results.
SEED = 42

# 2. Data preprocessing

### Load the Titanic dataset

In [None]:
# Location of the raw Titanic CSV (pinned to a specific commit of the repo).
data_link = "https://raw.githubusercontent.com/20161609/data_box/c8bbf4888f31bc53672161471eed3855fb76744a/titanic.csv"

# Read the CSV straight from the URL and preview the first rows.
df = pd.read_csv(filepath_or_buffer=data_link)
df.head(n=5)

In [None]:
# Swap spaces for underscores in every column name, then show the result.
df.columns = [name.replace(' ', '_') for name in df.columns]
df.columns.tolist()

In [None]:
# Correlation is only defined for numeric data, so restrict to those columns.
numeric_df = df.select_dtypes(include='number')

# Pairwise correlation matrix of the numeric columns.
correlations = numeric_df.corr()

# Correlation of every numeric column with the target, strongest positive first.
survived_corr = correlations['Survived'].sort_values(ascending=False)
print(survived_corr)


In [None]:
# Keep the target plus the three predictors used by the model.
df = df[['Survived', 'Fare', 'Sex', 'Pclass']]

# Drop incomplete rows BEFORE encoding: get_dummies() turns a missing category
# into an all-zero indicator row, which a later dropna() can no longer detect.
df = df.dropna()

# One-hot encode the categorical predictors. dtype=int keeps the indicator
# columns numeric on every pandas version (pandas >= 2.0 defaults to bool),
# so the numeric summaries and plots further down include them.
df = pd.get_dummies(df, columns=['Sex', 'Pclass'], dtype=int)
df.head()

In [None]:
# Class balance of the target: number of rows per Survived value.
survived_col = df['Survived']
survived_col.value_counts()

In [None]:
# Bar chart of how many rows fall into each Survived class.
sns.countplot(x='Survived', data=df)

In [None]:
# Number of rows that contain at least one missing value.
# (`any(axis=1)` flags each such row once; summing per-row null counts would
# instead count missing cells and over-count rows with several gaps.)
df.isnull().any(axis=1).sum()

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

In [None]:
# One box per column on a shared axis, to compare value ranges and spot outliers.
plot_size = (10, 10)
df.boxplot(figsize=plot_size)

In [None]:
# Summary statistics with one row per column (transposed), first five columns only.
df.describe().transpose().head()

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target.
x = df.drop(columns='Survived')
y = df['Survived']

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class ratio the same in both splits; the notebook-wide SEED (instead of a
# separate hard-coded value) makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=SEED
)

In [None]:
from sklearn.svm import SVC

# Baseline support-vector classifier with default hyperparameters.
# probability=True enables predict_proba; the random_state only affects that
# probability estimation, and uses the notebook-wide SEED for consistency.
model = SVC(probability=True, random_state=SEED)
model.fit(x_train, y_train)

In [None]:
# Mean accuracy of the baseline model on the held-out test split.
model.score(X=x_test, y=y_test)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import contextlib
import io

# Base estimator to tune. probability=True keeps predict_proba available on
# the tuned model; the notebook-wide SEED fixes the randomness it introduces.
model = SVC(probability=True, random_state=SEED)

# Hyperparameter candidates. `gamma` is only used by the 'rbf' and 'sigmoid'
# kernels, so the linear kernel gets its own sub-grid instead of being refit
# once for every (ignored) gamma value.
c_values = [0.1, 0.5, 1, 5, 10]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf', 'sigmoid'], 'C': c_values, 'gamma': [0.1, 0.5, 1, 5, 10]},
]

# Suppress verbose output
with contextlib.redirect_stdout(io.StringIO()):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=2)
    # Tune on the training split only, so the held-out test rows never
    # influence which hyperparameters are selected.
    grid_search.fit(x_train, y_train)

# Estimator refit with the best-scoring parameter combination
best_model = grid_search.best_estimator_


In [None]:
# Show the hyperparameter combination selected by the grid search.
print(grid_search.best_params_)

In [None]:
from sklearn.model_selection import cross_validate

# Re-evaluate the tuned estimator with 5-fold cross-validation over the full
# feature matrix and report the average of the per-fold test scores.
scores = cross_validate(estimator=best_model, X=x, y=y, cv=5)
fold_scores = scores['test_score']
print(fold_scores.mean())