<a href="https://colab.research.google.com/github/Elwing-Chou/ml1216/blob/main/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from urllib.request import urlretrieve
# Local filename -> remote location of each CSV used by this notebook.
downloads = {
    "train.csv": "https://github.com/Elwing-Chou/ml1216/raw/main/train.csv",
    "test.csv": "https://github.com/Elwing-Chou/ml1216/raw/main/test.csv",
}
# Fetch every file into the working directory.
for fname, url in downloads.items():
    urlretrieve(url, fname)

In [None]:
import pandas as pd
# Load the two CSV files written to the working directory by the download cell.
train, test = (
    pd.read_csv(path, encoding="utf-8") for path in ("train.csv", "test.csv")
)

In [None]:
# Stack train on top of test (fresh 0..n-1 index) so that every preprocessing
# step below is applied to both, then remove the identifier and target columns.
datas = (
    pd.concat([train, test], ignore_index=True)
    .drop(columns=["PassengerId", "Survived"])
)
datas

In [None]:
# Number of missing values in each column.
s = datas.isna().sum()
# `s > 0` is a boolean Series with one True/False per entry of `s`;
# indexing with it keeps only the columns that actually have missing values.
has_missing = s > 0
s[has_missing].sort_values(ascending=False)

In [None]:
# How many rows of `datas` carry each distinct Ticket value.
count = datas["Ticket"].value_counts()


def share(tn):
    """Look up ticket value ``tn`` in the module-level ``count`` Series."""
    return count[tn]


# Replace every ticket value by the number of rows sharing that ticket.
ticket_group_size = datas["Ticket"].apply(share)
datas["Ticket"] = ticket_group_size

In [None]:
# Median of every numeric column, used below to fill missing numeric values.
# numeric_only=True is required: `datas` still contains text columns at this
# point (e.g. "Name", which is split as a string later on), and pandas >= 2.0
# raises TypeError for them instead of silently skipping them.
# "Pclass" is removed so that it is never filled with a median; it is one-hot
# encoded further down instead.
med = datas.median(numeric_only=True).drop("Pclass")
med

In [None]:
# Fill missing values column by column using the medians stored in `med`;
# columns that have no entry in `med` are left as they are.
datas = datas.fillna(value=med)

In [None]:
# Fill missing "Embarked" entries with the most frequent value of that column.
embarked_counts = datas["Embarked"].value_counts()
most = embarked_counts.idxmax()
datas["Embarked"] = datas["Embarked"].fillna(value=most)

In [None]:
def cabin(c):
    """Return the first character of cabin value ``c``, or None when ``c`` is missing."""
    return None if pd.isna(c) else c[0]
# Reduce every cabin value to its leading character (None where it is missing).
cabin_letter = datas["Cabin"].apply(cabin)
datas["Cabin"] = cabin_letter

In [None]:
def _title(n):
    """Extract the title part of a full name string.

    Takes the text after the last comma, keeps what precedes the first
    period in that text, and strips surrounding whitespace.
    """
    return n.split(",")[-1].split(".")[0].strip()


# Frequency of every title, most common first.
count = datas["Name"].apply(_title).value_counts()
# Keep the four most frequent titles: ['Mr', 'Miss', 'Mrs', 'Master']
reserved = count.index[:4]


def name(n):
    """Return the title found in ``n`` if it is one of ``reserved``, else None."""
    title = _title(n)
    return title if title in reserved else None


# Replace each full name by its (reserved) title.
datas["Name"] = datas["Name"].apply(name)

In [None]:
# Display the frame to inspect the rewritten Ticket, Cabin and Name columns.
datas

In [None]:
# Two one-hot passes: the inner call encodes the columns get_dummies selects
# by default, the outer call encodes "Pclass", which has to be named
# explicitly (presumably because it is stored as a number -- TODO confirm).
datas = pd.get_dummies(pd.get_dummies(datas), columns=["Pclass"])
datas

In [None]:
# New feature: "Family" is the sum of the "SibSp" and "Parch" columns.
family_size = datas["SibSp"] + datas["Parch"]
datas["Family"] = family_size
datas

In [None]:
# `datas` was built as the train rows followed by the test rows, so a
# positional (iloc) split at len(train) recovers the two parts.
n_train = len(train)
x_train, x_test = datas.iloc[:n_train], datas.iloc[n_train:]
y_train = train["Survived"]
x_test

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid: n_estimators 15..34 crossed with max_depth 5..8.
params = {
    "n_estimators": range(15, 35),
    "max_depth": range(5, 9),
}
# Seed the forest: without random_state every run grows different trees, so
# the cross-validated scores and the selected "best" parameters change from
# one execution to the next and the search cannot be reproduced.
clf = RandomForestClassifier(random_state=42)
# Exhaustive search with 10-fold cross-validation, using all CPU cores.
search = GridSearchCV(clf, params, n_jobs=-1, cv=10)
search.fit(x_train, y_train)
print(search.best_score_)
print(search.best_params_)

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of a forest with fixed hyper-parameters.
# random_state makes the reported scores reproducible between runs; an
# unseeded forest yields different numbers every time the cell is executed.
clf = RandomForestClassifier(n_estimators=25, max_depth=7, random_state=42)
scores = cross_val_score(clf, x_train, y_train, n_jobs=-1, cv=10)
# Per-fold scores followed by their mean.
print(scores)
print(np.average(scores))

In [None]:
# Fit the forest on all training rows and predict the test rows.
# random_state is fixed so that rerunning the notebook writes the same
# predictions to rf.csv; an unseeded forest changes them on every run.
clf = RandomForestClassifier(n_estimators=25, max_depth=7, random_state=42)
clf.fit(x_train, y_train)
pre = clf.predict(x_test)
# Two-column table (PassengerId, Survived), written without the index column.
df = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": pre,
})
df.to_csv("rf.csv", encoding="utf-8", index=False)

In [None]:
# Pair every column of `datas` with the importance reported by the fitted
# classifier and list the pairs from most to least important.
importance_table = pd.DataFrame(
    {"columns": datas.columns, "importances": clf.feature_importances_}
)
importance_table.sort_values(by="importances", ascending=False)

In [None]:
# https://seaborn.pydata.org/tutorial/color_palettes.html
import matplotlib.pyplot as plt
import seaborn as sns
# Horizontal count bars per "Sex" value, split by "Survived", using the "hls" palette.
sns.countplot(data=train, y="Sex", hue="Survived", palette="hls")

In [None]:
# Split the training rows by outcome: Survived == 0 versus everything else.
is_dead = train["Survived"] == 0
dead = train[is_dead]
alive = train[~is_dead]

# Two stacked Fare histograms (with KDE): first `dead`, then `alive`,
# each with its own upper bin limit.
plt.figure(figsize=(6, 12))
panels = ((dead["Fare"], 300), (alive["Fare"], 600))
for position, (fares, upper) in enumerate(panels, start=1):
    plt.subplot(2, 1, position)
    last_ax = sns.histplot(fares, binrange=(0, upper), kde=True)
last_ax

In [None]:
def trans(s):
    """Format an interval label such as "(a, b]" as "a-b".

    ``s`` is converted with ``str`` first; every "(" and "]" is removed and
    every ", " is replaced by "-".
    """
    without_brackets = str(s).translate(str.maketrans("", "", "(]"))
    return "-".join(without_brackets.split(", "))
# Cut "Fare" into 10 equal-width bins and relabel every interval via `trans`.
c = pd.cut(train["Fare"], bins=10).apply(trans)

# Count of rows per fare bin, split by "Survived".
plt.figure(figsize=(10, 10))
p = sns.countplot(x=c, hue=train["Survived"])
plt.title("Fare")
# Rotate the bin labels on the x axis by 45 degrees.
plt.xticks(rotation=45)
p.legend(loc="upper right")

In [None]:
from sklearn.tree import plot_tree

# Number of individual trees held by the fitted ensemble in `clf`.
print(len(clf.estimators_))
plt.figure(figsize=(15, 15))
# Draw the estimator at index 3, showing only the first levels (max_depth=2).
tree = plot_tree(
    clf.estimators_[3],
    # A plain list is passed because some scikit-learn releases only accept
    # a list (not a pandas Index) for feature_names.
    feature_names=list(datas.columns),
    class_names=["Dead", "Alive"],
    max_depth=2,
    filled=True,
)
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every column; the scaler is fitted on all rows of `datas`
# (train and test together) and the result is wrapped back into a DataFrame
# that keeps the original column names.
scaler = MinMaxScaler()
datas_scale = pd.DataFrame(scaler.fit_transform(datas), columns=datas.columns)
# Positional (iloc) split: the first len(train) rows are the training rows.
n_train = len(train)
x_train_scale, x_test_scale = datas_scale.iloc[:n_train], datas_scale.iloc[n_train:]
x_test_scale

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Candidate neighbourhood sizes: every k from 5 up to 99.
params = {"n_neighbors": range(5, 100)}
clf = KNeighborsClassifier()
# Exhaustive search with 10-fold cross-validation on the scaled features.
search = GridSearchCV(estimator=clf, param_grid=params, cv=10, n_jobs=-1)
search.fit(x_train_scale, y_train)
# Best mean CV score first, then the parameters that achieved it.
for result in (search.best_score_, search.best_params_):
    print(result)

In [None]:
# Fit k-NN (k=11) on the scaled training rows and predict the scaled test rows.
clf = KNeighborsClassifier(n_neighbors=11).fit(x_train_scale, y_train)
pre = clf.predict(x_test_scale)
# Two-column table (PassengerId, Survived), written without the index column.
submission = {"PassengerId": test["PassengerId"], "Survived": pre}
df = pd.DataFrame(submission)
df.to_csv("knn.csv", index=False, encoding="utf-8")