In [None]:
import pandas as pd
import numpy as np
# Seed NumPy's legacy global RNG so np.random-based steps are repeatable.
np.random.seed(42)

# Read the training and test CSV files from the dataset/ directory.
train_set = pd.read_csv("dataset/train.csv")
test_set = pd.read_csv("dataset/test.csv")

In [None]:
# Shuffle the training rows (frac=1 keeps every row); the fixed random_state
# makes the permutation repeatable.
train_set = train_set.sample(frac=1, random_state=42)
# Show the test frame as the cell output.
test_set

In [None]:
# One 50-bin histogram per column that DataFrame.hist can plot.
train_set.hist(figsize=(12, 8), bins=50)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the training columns.
scatter_matrix(frame=train_set, figsize=(12, 8))

In [None]:
# Pairwise correlations between the numeric columns only. The frame also holds
# text columns (e.g. "Sex"), and DataFrame.corr() raises on those in recent
# pandas releases instead of silently skipping them.
corr_matrix = train_set.select_dtypes(include=np.number).corr()
# Correlation of every numeric column with the label.
corr_matrix["Survived"]

In [None]:
# Work on a copy so the experimental ratio columns never leak into train_set.
train_set_2 = train_set.copy()
# Treat a zero Fare as missing: dividing by it would produce inf, and an
# infinite value leaves the correlation for that column undefined.
train_set_2["class per fare"] = train_set["Pclass"] / train_set["Fare"].replace(0, np.nan)
train_set_2["class per age"] = train_set["Pclass"] / train_set["Age"]
# Correlate numeric columns only; text columns make DataFrame.corr() raise in
# recent pandas releases.
train_set_2.select_dtypes(include=np.number).corr()["Survived"]
# Author's reading of the coefficients (re-check after re-running the cell):
# - lower class and older passengers are more likely to survive
# - lower class with a high fare is more likely to die (basically a class effect)
# - lower-class passengers (Pclass 3) are more likely to die

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# The `sparse=` keyword was renamed to `sparse_output=` and later removed from
# scikit-learn, so neither spelling works on every release. Keep the default
# (sparse) encoder and densify the result explicitly instead.
encoder = OneHotEncoder()
# Double brackets keep a 2-D, single-column frame, which the encoder expects.
train_cat = train_set[["Sex"]]
train_cat_prep = encoder.fit_transform(train_cat).toarray()

In [None]:
# Wrap the one-hot array in a frame that carries train_set's own index.
# train_set was shuffled, so a default RangeIndex here would make any later
# label-aligned assignment attach each row's flags to a different passenger.
new = pd.DataFrame(
    train_cat_prep,
    columns=encoder.get_feature_names_out(),
    index=train_set.index,
)

In [None]:
# Replace the text "Sex" column with its two one-hot indicator columns.
# DataFrame.drop returns a new frame, so the result has to be kept (the
# original call discarded it and "Sex" was never removed).
train_ratio = train_set.drop(columns="Sex")
# Assign by position (.to_numpy()) rather than by index label: the rows of
# `new` are in the same order as train_set, but its labels need not match
# the shuffled index of train_set.
train_ratio["Sex_female"] = new["Sex_female"].to_numpy()
train_ratio["Sex_male"] = new["Sex_male"].to_numpy()

In [None]:
# Correlate numeric columns only; any remaining text column makes
# DataFrame.corr() raise in recent pandas releases.
train_ratio.select_dtypes(include=np.number).corr()["Survived"]
# Reading the output: a positive coefficient for Sex_female (negative for
# Sex_male) means women survived more often, and vice versa.
# NOTE(review): the earlier note here read "males survive more than females";
# confirm it against the printed signs, and make sure the Sex_* columns are
# row-aligned with train_set before trusting them.

In [None]:
# Label vector: the "Survived" column. Feature frame: every other column.
y_train = train_set["Survived"]
x_train = train_set.drop("Survived", axis=1)

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

def replace(x):
    """Return the element-wise natural log of ``x`` with ``log(0)`` mapped to 0.

    Parameters
    ----------
    x : array-like of numbers (ndarray or DataFrame)
        Values to transform. The input itself is left unmodified.

    Returns
    -------
    Same container type that ``np.log`` yields for ``x``; every ``-inf``
    produced by a zero entry is replaced with ``0``.
    """
    # The previous version called np.log(x) and threw the result away, so the
    # data went through the "log" pipeline unchanged. Keep the result.
    # log(0) is expected here, so silence the divide-by-zero warning it emits.
    with np.errstate(divide="ignore"):
        logged = np.log(x)
    logged[logged == -np.inf] = 0
    return logged

# Plain numeric columns: fill gaps with the median, then standardise.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Columns routed through `replace`: median-impute, apply `replace`, standardise.
# feature_names_out="one-to-one" keeps the input column names on the output.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(replace, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Categorical columns: ordinal-encode, fill gaps with the most frequent code,
# then one-hot encode the codes.
cat_pipeline = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(),
)

In [None]:
from sklearn.compose import ColumnTransformer, make_column_selector

def column_ratio(x):
    """Divide column 0 of a 2-D array by column 1, returning a single 2-D column."""
    # List indices ([0], [1]) keep the second axis, so the result stays 2-D.
    numerator = x[:, [0]]
    denominator = x[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Feature-name callback for the ratio transformer.

    Both arguments are ignored; the single output column is always
    called "ratio".
    """
    output_names = ["ratio"]
    return output_names

def ratio_pipeline():
    """Build a fresh pipeline: median-impute, take column 0 / column 1, standardise."""
    steps = (
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler(),
    )
    return make_pipeline(*steps)

# Route column groups through their pipelines; the outputs are concatenated
# in the order listed.
# NOTE(review): the "num" selector takes every numeric column of the fitted
# frame — presumably including PassengerId, and Age/Fare a second time —
# confirm that this is intended.
preprocessing = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, ["Sex", "Embarked"]),
        ("log", log_pipeline, ["Age", "Fare"]),
        ("class_per_age", ratio_pipeline(), ["Pclass", "Age"]),
        ("num", num_pipeline, make_column_selector(dtype_include=np.number)),
    ]
)

In [None]:
# Fit the column transformer on the training features and transform them in one pass.
x_train_prep = preprocessing.fit_transform(X=x_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Boolean target: True where Survived equals 1.
y_train_prep = y_train.eq(1)

# k-nearest-neighbours classifier fitted on the preprocessed training features.
model = KNeighborsClassifier(n_neighbors=11, leaf_size=30)
model.fit(x_train_prep, y_train_prep)

In [None]:
# Predictions on the same rows the model was fitted on (a sanity check, not a score).
model.predict(X=x_train_prep)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated scores for the k-NN model (default scorer of the estimator).
kn_scores = cross_val_score(
    estimator=model,
    X=x_train_prep,
    y=y_train_prep,
    cv=10,
)

In [None]:
# Average of the per-fold k-NN scores.
np.mean(kn_scores)

In [None]:
#svc
from sklearn.svm import SVC
# Support-vector classifier, scored with the same 10-fold cross-validation.
svm_clf = SVC(gamma="auto")
svm_scores = cross_val_score(
    estimator=svm_clf,
    X=x_train_prep,
    y=y_train_prep,
    cv=10,
)
# Average of the per-fold scores.
svm_scores.mean()

In [None]:
# Names of the columns produced by the fitted column transformer, in output order.
preprocessing.get_feature_names_out()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees; the fixed random_state makes the fit repeatable.
forest_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
forest_clf.fit(X=x_train_prep, y=y_train_prep)

In [None]:
# 10-fold cross-validated scores for the random forest.
forest_scores = cross_val_score(
    estimator=forest_clf,
    X=x_train_prep,
    y=y_train_prep,
    cv=10,
)
# Average of the per-fold scores.
forest_scores.mean()

In [None]:
# Column preprocessing followed by a random forest, bundled as one estimator.
# NOTE(review): nothing in the visible cells fits or calls this pipeline.
preprocess = Pipeline(
    steps=[
        ("fit", preprocessing),
        ("forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

In [None]:
# Reuse the transformer that was fitted on x_train. Calling fit_transform here
# would re-learn the imputation values, scaling statistics and category
# encodings from the test rows, so the resulting features would no longer
# match the ones forest_clf was trained on.
test_set_prep = preprocessing.transform(test_set)
# Sanity check: the column count should equal that of x_train_prep.
test_set_prep.shape

In [None]:
# The forest was trained on a boolean target, so cast its predictions to int.
predictions = forest_clf.predict(test_set_prep).astype(int)

In [None]:
# Two-column frame: passenger id plus the predicted label.
submission = pd.DataFrame(
    {
        "PassengerId": test_set["PassengerId"],
        "Survived": predictions,
    }
)
# Preview the first rows.
submission.head()