<a href="https://colab.research.google.com/github/Elwing-Chou/acerml/blob/main/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from urllib.request import urlretrieve
# Download the training split into the working directory.
url = "https://github.com/Elwing-Chou/acerml/raw/main/titanic/train.csv"
urlretrieve(url, filename="train.csv")
# Download the test split; `url` is deliberately rebound to the second address.
url = "https://github.com/Elwing-Chou/acerml/raw/main/titanic/test.csv"
urlretrieve(url, filename="test.csv")

In [None]:
import pandas as pd
# Load both splits from the CSV files written by the download cell.
train_df, test_df = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in ("train.csv", "test.csv")
)

In [None]:
# Stack the train rows on top of the test rows (fresh 0..n-1 index) so the
# preprocessing below is applied to both at once, then drop the identifier and
# the target column so only feature columns remain.
datas = (
    pd.concat([train_df, test_df], axis=0, ignore_index=True)
    .drop(columns=["PassengerId", "Survived"])
)

In [None]:
# Indexing a Series with a boolean mask keeps only the entries where the mask is True.
# Here: count missing values per column and show the columns that have any,
# largest count first.
s = datas.isna().sum()
has_missing = s > 0
s[has_missing].sort_values(ascending=False)

In [None]:
# Categorical column: fill missing values with the most frequent value.
embarked_counts = datas["Embarked"].value_counts()
most = embarked_counts.idxmax()
datas["Embarked"] = datas["Embarked"].fillna(most)

In [None]:
# Numeric columns: fill missing values with the column median.
# numeric_only=True is required because `datas` still holds text columns at this
# point (Name, Sex, Ticket, Cabin, ...): pandas >= 2.0 raises a TypeError when
# asked for the median of those instead of silently skipping them.
# Pclass is removed from the fill values so it is never median-imputed
# (it is one-hot encoded as a category further down).
med = datas.median(numeric_only=True).drop(["Pclass"])
datas = datas.fillna(med)
# Re-check which columns still contain missing values after imputation.
s = datas.isna().sum()
s[s > 0].sort_values(ascending=False)

In [None]:
def cabinflow(c):
    """Reduce a cabin value to its first character.

    Args:
        c: A cabin string, or a missing value (anything ``pd.isna`` treats
            as missing, e.g. NaN or None).

    Returns:
        The first character of ``c``; missing values are returned unchanged.
    """
    return c if pd.isna(c) else c[0]
# Replace every cabin value with cabinflow's result (missing values stay
# missing), then show how often each resulting value occurs.
datas["Cabin"] = datas["Cabin"].map(cabinflow)
datas["Cabin"].value_counts()

In [None]:
# Number of rows (train and test combined) that carry each ticket value.
dic = datas["Ticket"].value_counts()


def ticketflow(t):
    """Return how many rows share ticket ``t``, looked up in the ``dic`` counts."""
    return dic[t]


# Replace each ticket with its frequency, then show the distribution of those
# frequencies. NOTE: this overwrites the column, so running the cell a second
# time would count the counts rather than the original tickets.
datas["Ticket"] = datas["Ticket"].map(ticketflow)
datas["Ticket"].value_counts()

In [None]:
def nameflow(n):
    """Extract the token that sits between a comma and the first period of ``n``.

    The text before the first "." is taken, then everything after the last ","
    within it, with surrounding whitespace removed. If either separator is
    absent, the corresponding cut is simply skipped.

    Args:
        n: The name string to parse.

    Returns:
        The stripped token, e.g. "Mr" for "Braund, Mr. Owen Harris".
    """
    before_period = n.partition(".")[0]
    after_comma = before_period.rpartition(",")[2]
    return after_comma.strip()

# Frequency of every token that nameflow extracts; value_counts sorts by
# frequency, so the first four index entries are the four most common tokens.
middle = datas["Name"].apply(nameflow).value_counts()
whitelist = middle.index[:4]


def nameflow2(n):
    """Return nameflow's token for ``n`` if it is in ``whitelist``, else None."""
    token = nameflow(n)
    return token if token in whitelist else None


# Keep only whitelisted tokens in the Name column (everything else becomes
# missing), then show the resulting counts.
datas["Name"] = datas["Name"].apply(nameflow2)
datas["Name"].value_counts()

In [None]:
# One-hot encoding.
# The inner call encodes the text columns; the outer call names Pclass explicitly
# because it is numeric and would otherwise be left as-is.
datas = pd.get_dummies(pd.get_dummies(datas), columns=["Pclass"])

In [None]:
# Extra feature: the sum of the SibSp and Parch columns.
datas["Family"] = datas["SibSp"].add(datas["Parch"])

In [None]:
# Rows: iloc selects by position (usually 0, 1, 2, ...), loc selects by label.
# df.iloc -> [1st, 2nd, ...]
# train_df was concatenated first, so the first len(train_df) rows of `datas`
# are the training rows and the remainder are the test rows.
n_train = len(train_df)
x_train = datas.iloc[:n_train]
x_test = datas.iloc[n_train:]
y_train = train_df["Survived"]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Grid-search the number of trees (20..99) for a depth-7 random forest using
# 10-fold cross-validation on all available cores.
# random_state is fixed so that repeated runs build the same forests and
# therefore report the same best_score_ / best_params_; without it the winner
# changes from run to run.
clf = RandomForestClassifier(random_state=42)
params = {
    "max_depth":[7],
    "n_estimators":range(20, 100)
}
cv = GridSearchCV(clf, params, cv=10, n_jobs=-1)
cv.fit(x_train, y_train)
print(cv.best_score_)
print(cv.best_params_)

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of a random forest with hand-picked hyperparameters.
# random_state is fixed so the per-fold scores are reproducible between runs.
clf = RandomForestClassifier(n_estimators=75,
                             max_depth=6,
                             random_state=42)
scores = cross_val_score(clf, x_train, y_train, cv=10, n_jobs=-1)
print("10 times:", scores)
print("average:", np.average(scores))

In [None]:
# Fit the final random forest on the full training set and predict the test rows.
# random_state is fixed so that rf.csv is identical every time the notebook is
# re-run; an unseeded forest would write different predictions each run.
clf = RandomForestClassifier(n_estimators=75,
                             max_depth=6,
                             random_state=42)
clf.fit(x_train, y_train)
pre = clf.predict(x_test)
# Pair each test PassengerId with its predicted label and save without the index.
result = pd.DataFrame({
    "PassengerId":test_df["PassengerId"],
    "Survived":pre
})
result.to_csv("rf.csv", encoding="utf-8", index=False)
result

In [None]:
# Table pairing each feature column with the importance reported by `clf`
# (the random forest fitted in the previous cell when cells run in order).
# x_train is a row slice of `datas`, so the column order matches the model's features.
feature_names = datas.columns
importances = clf.feature_importances_
pd.DataFrame({"columns": feature_names, "importance": importances})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Survival counts in the training set, split by sex.
# The columns are passed by keyword: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(data=train_df, x="Survived", hue="Sex")

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every feature column and rebuild a DataFrame with the original
# column names, then split it back into train/test rows by position.
# NOTE(review): the scaler is fitted on the combined train+test rows, so test
# data influences the scaling ranges — confirm this is intended.
scaler = MinMaxScaler()
datas_scale = pd.DataFrame(scaler.fit_transform(datas), columns=datas.columns)
n_train_rows = len(train_df)
x_train_scale = datas_scale.iloc[:n_train_rows]
x_test_scale = datas_scale.iloc[n_train_rows:]

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Grid-search the neighbour count k (5..199) for a k-nearest-neighbours
# classifier on the scaled features, using 10-fold cross-validation.
clf = KNeighborsClassifier()
params = dict(n_neighbors=range(5, 200))
cv = GridSearchCV(estimator=clf, param_grid=params, cv=10, n_jobs=-1)
cv.fit(x_train_scale, y_train)
print(cv.best_score_)
print(cv.best_params_)

In [None]:
# Fit the final k-nearest-neighbours model (k=11) on the scaled training rows
# and predict the scaled test rows.
clf = KNeighborsClassifier(n_neighbors=11)
clf.fit(x_train_scale, y_train)
pre = clf.predict(x_test_scale)
# Attach the predictions to the test PassengerId column and save without the index.
result = test_df[["PassengerId"]].assign(Survived=pre)
result.to_csv("knn.csv", encoding="utf-8", index=False)
result