In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from itertools import product

# Plot theme: seaborn "talk"-sized ticks first, then matplotlib's dark
# background applied on top of it (order matters, the later call wins).
sns.set(context="talk", style="ticks")
plt.style.use("dark_background")

# Show every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the training set and replace the categorical "Embarked" column with
# one 0/1 indicator column per port code.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")

# A missing port becomes its own category "N" so it also gets an indicator.
missing_port = train_data["Embarked"].isna()
train_data.loc[missing_port, "Embarked"] = "N"

# Column creation order (S, C, Q, N) is kept so the frame layout is unchanged.
for port in ("S", "C", "Q", "N"):
    train_data["Embarked_" + port] = (train_data["Embarked"] == port).astype(int)

train_data = train_data.drop(columns="Embarked")
train_data.head()

In [None]:
# for col in ["Pclass", "Sex", "SibSp", "Parch"]:
#     sns.histplot(data=train_data, x=col, hue="Survived", multiple="dodge")
#     plt.show()

In [None]:
# sns.boxplot(data=train_data[train_data["Fare"]<400], x="Survived", y="Fare")

In [None]:
# Load the test set and replace the categorical "Embarked" column with
# one 0/1 indicator column per port code.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

# A missing port becomes its own category "N" so it also gets an indicator.
missing_port = test_data["Embarked"].isna()
test_data.loc[missing_port, "Embarked"] = "N"

# Column creation order (S, C, Q, N) is kept so the frame layout is unchanged.
for port in ("S", "C", "Q", "N"):
    test_data["Embarked_" + port] = (test_data["Embarked"] == port).astype(int)

test_data = test_data.drop(columns="Embarked")
test_data.head()

In [None]:
def na_s(dataset):
    """Print the share of missing values for each column that has any.

    For every column of ``dataset`` containing at least one NaN, one line of
    the form ``<column>: <percent>%`` is printed, where the percentage is
    relative to the number of rows and rounded to two decimals. Columns
    without missing values are skipped. Nothing is returned.
    """
    for col in dataset.columns:
        missing = dataset[col].isna().sum()
        if not missing > 0:
            # Fully populated column: nothing to report.
            continue
        pct_missing = round(100 * missing / dataset.shape[0], 2)
        print(f"{col}: {pct_missing}%")

In [None]:
# Report the percentage of missing values per column in the training set.
na_s(train_data)

In [None]:
# Report the percentage of missing values per column in the test set.
na_s(test_data)

In [None]:
# Predictor columns, in the order they are handed to pd.get_dummies and the
# model; the Embarked_* indicators are the ones built when the data is loaded.
features = [
    "Age", "Pclass", "Sex", "SibSp", "Parch", "Fare",
    "Embarked_S", "Embarked_N", "Embarked_Q", "Embarked_C",
]

# Target column to predict.
y = train_data["Survived"]

In [None]:
# Build the training design matrix: one-hot encode, scale to [0, 1], then
# fill remaining NaNs with a KNN imputer.
X_encoded = pd.get_dummies(train_data[features])

# Fit AND transform on the same DataFrame. Wrapping it in np.array() for the
# transform made sklearn warn that the input lacked the feature names the
# scaler was fitted with, and forced a mixed-dtype frame through an object
# array. MinMaxScaler ignores NaNs when fitting and passes them through.
scaler = MinMaxScaler().fit(X_encoded)
X = scaler.transform(X_encoded)

# Impute after scaling so every feature contributes on the same [0, 1] scale
# to the nearest-neighbour distance.
imputer = KNNImputer(n_neighbors=5).fit(X)
X = imputer.transform(X)
# print(X.shape)

In [None]:
# depths = np.arange(5, 11, 1)
# n_estims = np.arange(50, 150, 10)
# combos = list(product(depths, n_estims))
# scorez = np.zeros(len(combos))

# for i in range(len(combos)):
    
#     depth = combos[i][0]
#     n_est = combos[i][1]
#     clf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, random_state=1)
#     scores = cross_val_score(clf, X, y, cv=10)
#     scorez[i] = scores.mean()
#     print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
#     print(f"depth = {depth} ; # estim = {n_est} ;")

# idx = np.where(scorez == max(scorez))[0].item()
# print(idx)
# f_combo = combos[idx]
# print(f_combo)

# f_combo = (8, 90)
# model = RandomForestClassifier(n_estimators=f_combo[1], max_depth=f_combo[0], random_state=1)

In [None]:
# # n_estims = np.arange(60, 200, 20)
# n_estims = np.arange(5, 100, 5)
# # l_rates = np.array([0.5, .1, .075, .05, .025, .01, .0075, .005, .0025, .001])
# l_rates = np.arange(.05, .5, .05)

# combos = list(product(n_estims, l_rates))
# scorez = np.zeros(len(combos))

# for i in range(len(combos)):
    
#     l_rate = combos[i][1]
#     n_est = combos[i][0]
#     clf = GradientBoostingClassifier(learning_rate=l_rate, n_estimators=n_est)
#     scores = cross_val_score(clf, X, y, cv=10, scoring="roc_auc")
#     scorez[i] = scores.mean()
#     print("%0.2f ROCAUC with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
#     print(f"l rate = {l_rate}; # estim = {n_est};")

# idx = np.where(scorez == max(scorez))[0].item()
# print(idx)
# f_combo = combos[idx]
# print(f_combo)

# Hard-coded hyperparameters as (n_estimators, learning_rate), presumably the
# winner of the commented-out grid search above (TODO confirm).
f_combo = (40, 0.2)
n_estimators, learning_rate = f_combo

# random_state pins the estimator's internal randomness so Restart & Run All
# reproduces the same model; seed 1 matches the RandomForest experiment above.
model = GradientBoostingClassifier(
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    random_state=1,
)

In [None]:
# Build the test design matrix with the transformers fitted on the training data.
X_test_encoded = pd.get_dummies(test_data[features])

# get_dummies only creates columns for categories present in *this* frame, so
# align to the columns the scaler was fitted on: dummy columns absent from the
# test set are added as all-zero, unseen ones are dropped, and the order is
# forced to match. feature_names_in_ only exists when the scaler was fitted on
# a DataFrame with a recent scikit-learn, hence the guarded lookup.
train_columns = getattr(scaler, "feature_names_in_", None)
if train_columns is not None:
    X_test_encoded = X_test_encoded.reindex(columns=train_columns, fill_value=0)

# Pass the DataFrame itself (not np.array(...)) so sklearn can validate the
# feature names instead of warning that they are missing.
X_test = scaler.transform(X_test_encoded)
X_test = imputer.transform(X_test)
# print(sum(np.isnan(X_test)))

In [None]:
# Sanity check: train and test matrices must have the same number of columns.
print(X.shape, X_test.shape, sep="\n")

In [None]:
# Train on the full training matrix, then label the test passengers.
# fit() returns the estimator itself, so the two steps can be chained.
predictions = model.fit(X, y).predict(X_test)

In [None]:
# Assemble the two-column submission file (passenger id + predicted label)
# and write it without the DataFrame index.
submission_columns = {
    'PassengerId': test_data["PassengerId"],
    'Survived': predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission_titanic.csv', index=False)
print("Your submission was successfully saved!")