<a href="https://www.kaggle.com/code/apinyacharoenchap/titanic-with-many-models?scriptVersionId=144075733" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

Thanks

[I created this notebook by following this one](https://www.kaggle.com/code/vbmokin/titanic-0-83253-comparison-20-popular-models)


[I used the preprocessing from this notebook](https://www.kaggle.com/code/furduisorinoctavian/titanic-with-neural-networks-78)

# Table of Contents
1. Loading Data
2. Preprocessing Data
3. Train Model

In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# models
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier

## 1.Loading Data

In [None]:
# All competition files live in the same Kaggle input directory. Using one
# absolute base path avoids mixing an absolute location with one that is
# relative to the current working directory.
INPUT_DIR = "/kaggle/input/titanic"

data_train = pd.read_csv(f"{INPUT_DIR}/train.csv")
data_test = pd.read_csv(f"{INPUT_DIR}/test.csv")
# Sample submission file; its PassengerId column is reused by the submission cells.
submission = pd.read_csv(f"{INPUT_DIR}/gender_submission.csv")
data_train.head()

## 2.Preprocessing Data 

Feature

1. Pclass

2.Name

3.Sex

4.Age

5.Embarked

6. Parch

### Name

In [None]:
def extract_title(names):
    """Return the honorific found between the first comma and the next period.

    The text after the first comma is cut at its first period and stripped of
    surrounding whitespace.
    """
    after_comma = names.str.split(',').str[1]
    before_period = after_comma.str.split('.').str[0]
    return before_period.str.strip()


# Derive the same "Title" feature for both frames.
for frame in (data_train, data_test):
    frame["Title"] = extract_title(frame["Name"])

In [None]:
# Frequency of each extracted title in the training data.
data_train.Title.value_counts()

In [None]:
# Fold the French honorifics into their English equivalents in both frames.
title_aliases = {'Mlle': 'Miss', 'Mme': 'Mrs'}

for frame in (data_train, data_test):
    frame['Title'] = frame['Title'].replace(title_aliases)

In [None]:
# Fit ONE encoder on the titles of both frames so that a given title maps to
# the same integer in train and test. Re-fitting on the test frame alone would
# number the test set's own label set from zero and misalign the feature.
# The union is used because `transform` raises on labels not seen during `fit`,
# and a title may occur in only one of the two frames.
le = preprocessing.LabelEncoder()
le.fit(pd.concat([data_train["Title"], data_test["Title"]]))
data_train["Title"] = le.transform(data_train["Title"])
data_test["Title"] = le.transform(data_test["Title"])

### SEX

In [None]:
# Class balance of the Sex column in the training data.
data_train.Sex.value_counts()

In [None]:
# Learn the label -> integer mapping from the training data once and reuse it
# for the test data, instead of re-fitting a second mapping on the test frame.
# LabelEncoder numbers the sorted labels, hence:
#   sex = 0 => female
#   sex = 1 => male
le = preprocessing.LabelEncoder()
data_train["Sex"] = le.fit_transform(data_train["Sex"])
data_test["Sex"] = le.transform(data_test["Sex"])
data_train.head(10)

### Age 

In [None]:
def ageCategory(age):
    """Map an age in years to an ordinal age-band code.

    Bands (lower bound inclusive, upper bound exclusive):
        0: [0, 1)    newborn
        1: [1, 6)    child 1
        2: [6, 12)   child 2
        3: [12, 18)  teenager
        4: [18, 30)  adult 1
        5: [30, 45)  adult 2
        6: [45, 65)  adult 3
        7: 65+       old people

    Returns:
        int: the band code, or -1 when the age is missing (None/NaN) or
        negative. Previously such values fell through every comparison and
        were labelled 7 ("old people").
    """
    # Missing or invalid ages get their own sentinel instead of a real band.
    if pd.isna(age) or age < 0:
        return -1

    # Exclusive upper bound of bands 0..6; anything beyond the last is band 7.
    upper_bounds = (1, 6, 12, 18, 30, 45, 65)
    for category, upper in enumerate(upper_bounds):
        if age < upper:
            return category
    return len(upper_bounds)


def addAgeCat(data):
    """Return the age-band code for every entry of ``data["Age"]``, in order.

    Each age is passed through ``ageCategory``; the result is a plain list
    with one code per row of ``data``.
    """
    return [ageCategory(age) for age in data["Age"]]


# Attach the age-band feature to both the training and the test frame.
for frame in (data_train, data_test):
    frame["AgeCat"] = addAgeCat(frame)

data_train.head(10)

### Embarked

In [None]:
# Any missing port is filled with the most frequent port of the training data
# so that the encoder only ever sees real port labels.
embarked_fill = data_train["Embarked"].mode().iloc[0]

# Use a fresh encoder (not one left over from an earlier cell), learn the
# mapping on the training data only, and reuse it unchanged for the test data.
le = preprocessing.LabelEncoder()
data_train["Embarked"] = le.fit_transform(data_train["Embarked"].fillna(embarked_fill))
data_test["Embarked"] = le.transform(data_test["Embarked"].fillna(embarked_fill))
data_train.head(10)
# Resulting codes (labels are numbered in sorted order):
# C --> 0
# Q --> 1
# S --> 2

In [None]:
# Distribution of the (now integer-encoded) Title feature.
data_train.Title.value_counts()

# 3. Preparing for modelling

In [None]:
# One shared column list keeps the training and test feature matrices aligned.
feature_columns = ["Sex", "Title", "AgeCat", "Pclass", "Parch", "Embarked"]

X = data_train[feature_columns]
y = data_train["Survived"]

X_test = data_test[feature_columns]

In [None]:
# Hold out 20% of the labelled rows for validation; the seed fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. Modelling

## 3.1 Logistic Regression 

In [None]:
# Fit logistic regression on the training split and score it on the hold-out split.
logreg = LogisticRegression().fit(X_train, y_train)

y_pred = logreg.predict(X_valid)
y_pred = y_pred.astype(int)

accuracy = accuracy_score(y_valid, y_pred)
print(accuracy)

## 0.75598 0.8156424581005587

In [None]:
# Survival labels for the Kaggle test features from the fitted logistic regression.
y_prediction = logreg.predict(X_test)
y_prediction = y_prediction.astype(int)

In [None]:
# Keep the PassengerId column and pair it with the latest test predictions.
submission = submission[["PassengerId"]].assign(Survived=y_prediction)
submission.to_csv('/kaggle/working/submission_logreg.csv', index=False)

print("Submission saved")
# Accuracy = 0.75598

## 3.2 Support Vector Machines 

In [None]:
# Fit a support vector classifier on the training split and score it on the
# hold-out split. The validation predictions must come from `svc` itself;
# using another fitted model here would report that model's accuracy instead.
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_valid).astype(int)

accuracy = accuracy_score(y_valid, y_pred)
print(accuracy)

In [None]:
# Survival labels for the Kaggle test features from the fitted SVC.
y_prediction = svc.predict(X_test)
y_prediction = y_prediction.astype(int)

In [None]:
# Keep the PassengerId column and pair it with the latest test predictions.
submission = submission[["PassengerId"]].assign(Survived=y_prediction)
submission.to_csv('/kaggle/working/submission_svm.csv', index=False)

print("Submission saved")

##0.77751

## 3.3 NN

In [None]:
# Small multi-layer perceptron (hidden layers of 15 and 2 units) with a fixed seed,
# fitted on the training split and scored on the hold-out split.
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(15, 2),
    random_state=1,
).fit(X_train, y_train)

y_pred = clf.predict(X_valid)
y_pred = y_pred.astype(int)

accuracy = accuracy_score(y_valid, y_pred)
print(accuracy)

In [None]:
# Survival labels for the Kaggle test features from the fitted MLP.
y_prediction = clf.predict(X_test)
y_prediction = y_prediction.astype(int)

In [None]:
# Keep the PassengerId column and pair it with the latest test predictions.
submission = submission[["PassengerId"]].assign(Survived=y_prediction)
submission.to_csv('/kaggle/working/submission_NN.csv', index=False)

print("Submission saved")

##0.74401

## 3.4 Decision Tree

In [None]:
# Fit a decision tree on the training split and score it on the hold-out split.
# The seed is fixed (same value as the split and the random forest) so the
# fitted tree, and therefore the printed accuracy, is the same on every run.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_valid).astype(int)
accuracy = accuracy_score(y_valid, y_pred)
print(accuracy)

In [None]:
# Survival labels for the Kaggle test features from the fitted decision tree.
y_prediction = decision_tree.predict(X_test)
y_prediction = y_prediction.astype(int)

In [None]:
# Keep the PassengerId column and pair it with the latest test predictions.
submission = submission[["PassengerId"]].assign(Survived=y_prediction)
submission.to_csv('/kaggle/working/submission_DC.csv', index=False)

print("Submission saved")
# Recorded result: 0.643

## 3.5 Random Forest

In [None]:
# Random forest of 100 trees with a fixed seed, fitted on the training split
# and scored on the hold-out split.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

y_pred = rf_classifier.predict(X_valid)
y_pred = y_pred.astype(int)

accuracy = accuracy_score(y_valid, y_pred)
print(accuracy)

In [None]:
# Survival labels for the Kaggle test features from the fitted random forest.
y_prediction = rf_classifier.predict(X_test)
y_prediction = y_prediction.astype(int)

In [None]:
# Keep the PassengerId column and pair it with the latest test predictions.
submission = submission[["PassengerId"]].assign(Survived=y_prediction)
submission.to_csv('/kaggle/working/submission_RF.csv', index=False)

print("Submission saved")

## 3.6 Gaussian Naive Bayes

In [None]:
# Fit Gaussian naive Bayes on the training split and score it on the hold-out split.
gaussian = GaussianNB().fit(X_train, y_train)

y_pred = gaussian.predict(X_valid)
y_pred = y_pred.astype(int)

accuracy = accuracy_score(y_valid, y_pred)
print(accuracy)

In [None]:
# Survival labels for the Kaggle test features from the fitted naive Bayes model.
y_prediction = gaussian.predict(X_test)
y_prediction = y_prediction.astype(int)

In [None]:
# Keep the PassengerId column and pair it with the latest test predictions.
submission = submission[["PassengerId"]].assign(Survived=y_prediction)
submission.to_csv('/kaggle/working/submission_NB.csv', index=False)

print("Submission saved")
# Recorded result: 0.39234

## 3.7 Ridge

In [None]:
# Fit a ridge classifier on the training split and score it on the hold-out split.
ridge_classifier = RidgeClassifier().fit(X_train, y_train)

y_pred = ridge_classifier.predict(X_valid)
y_pred = y_pred.astype(int)

accuracy = accuracy_score(y_valid, y_pred)
print(accuracy)

In [None]:
# Survival labels for the Kaggle test features from the fitted ridge classifier.
y_prediction = ridge_classifier.predict(X_test)
y_prediction = y_prediction.astype(int)

In [None]:
# Keep the PassengerId column and pair it with the latest test predictions.
submission = submission[["PassengerId"]].assign(Survived=y_prediction)
submission.to_csv('/kaggle/working/submission_Ridge.csv', index=False)

print("Submission saved")
# Recorded result: 0.77272

## 3.8 Bagging Classifier

In [None]:
# Bagging Classifier
# Fitted on the training split and scored on the hold-out split. Bagging
# resamples the training data at random, so the seed is fixed (same value as
# the split and the random forest) to make the printed accuracy repeatable.

bagging_classifier = BaggingClassifier(random_state=42)
bagging_classifier.fit(X_train, y_train)

y_pred = bagging_classifier.predict(X_valid).astype(int)
accuracy = accuracy_score(y_valid, y_pred)
print(accuracy)

In [None]:
# Survival labels for the Kaggle test features from the fitted bagging ensemble.
y_prediction = bagging_classifier.predict(X_test)
y_prediction = y_prediction.astype(int)

In [None]:
# Keep the PassengerId column and pair it with the latest test predictions.
submission = submission[["PassengerId"]].assign(Survived=y_prediction)
submission.to_csv('/kaggle/working/bagging.csv', index=False)

print("Submission saved")
# Recorded result: 0.77272

# The Support Vector Machine gives the best result (0.77751)

# 4. Pipeline

In [None]:
# Hyper-parameter search for the support vector classifier.
# Single-step pipeline: the SVC is registered as 'classifier', so its
# parameters are addressed as 'classifier__<name>' in the grid below.
steps = [
    ('classifier', SVC())
]

pipeline = Pipeline(steps)

# Parameter values to explore in the grid search
param_grid = {
    'classifier__C': [0.1,0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],          # SVM regularisation parameter C
    'classifier__kernel': ['linear', 'rbf'], # kernel type
    'classifier__gamma': ['scale', 'auto'],  # SVM gamma setting
}

# Build the grid search with 5-fold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# Fit the pipeline for every parameter combination on the training split
grid_search.fit(X_train, y_train)

# Best parameter combination found by the search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Mean cross-validated score of the best combination
best_score = grid_search.best_score_
print("Best Score (Accuracy):", best_score)

# Apply the best model to the Kaggle test features.
# NOTE: these predictions are not written to a submission file in this cell.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
