In [10]:
import pandas as pd

# Load the raw training and test tables from CSV files in the working directory
# (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

In [11]:
# ---------------------------------------------------------------------------
# Cleaning
# ---------------------------------------------------------------------------
# Rows without a label cannot be used for supervised training.
train_df = train_df.dropna(subset=["Survived"])

# PassengerId is dropped from the feature frames at the end of this cell, so
# keep a handle on it for the submission file.
test_passenger_ids = test_df["PassengerId"]

# Impute BOTH frames here, before any feature is derived. Every fill value is
# learned from the training frame only. If the test frame were imputed after
# feature engineering, columns derived from Age/Fare (AgeGroup, FareBin,
# Age_Fare) would keep NaN for every test row whose raw value was missing.
for col in ["Pclass", "Age", "SibSp", "Fare", "Parch"]:
    train_median = train_df[col].median()
    train_df[col] = train_df[col].fillna(train_median)
    test_df[col] = test_df[col].fillna(train_median)

for col in ["Sex", "Embarked"]:
    train_mode = train_df[col].mode()[0]
    train_df[col] = train_df[col].fillna(train_mode)
    test_df[col] = test_df[col].fillna(train_mode)

# ---------------------------------------------------------------------------
# Feature engineering
# ---------------------------------------------------------------------------
# Title: the text between the comma and the first period in Name.
# expand=False makes str.extract return a Series rather than a one-column frame.
title_pattern = r",\s*([^\.]+)\."
train_df["Title"] = train_df["Name"].str.extract(title_pattern, expand=False)
test_df["Title"] = test_df["Name"].str.extract(title_pattern, expand=False)

# Collapse infrequent titles into a single "Rare" level. The pattern captures
# everything up to the first period, so a name written as
# "..., the Countess. of ..." yields "the Countess"; list that spelling too or
# it is never grouped.
rare_titles = ["Lady", "Countess", "the Countess", "Capt", "Col", "Don", "Dr",
               "Major", "Rev", "Sir", "Jonkheer", "Dona"]
train_df["Title"] = train_df["Title"].replace(rare_titles, "Rare")
test_df["Title"] = test_df["Title"].replace(rare_titles, "Rare")

# Family size = siblings/spouses + parents/children + the passenger themself.
train_df["FamilySize"] = train_df["SibSp"] + train_df["Parch"] + 1
test_df["FamilySize"] = test_df["SibSp"] + test_df["Parch"] + 1

train_df["IsAlone"] = (train_df["FamilySize"] == 1).astype(int)
test_df["IsAlone"] = (test_df["FamilySize"] == 1).astype(int)

# Age groups: fixed, right-closed bins shared by both frames.
bins = [0, 12, 18, 35, 60, 120]
labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']

train_df["AgeGroup"] = pd.cut(train_df["Age"], bins=bins, labels=labels)
test_df["AgeGroup"] = pd.cut(test_df["Age"], bins=bins, labels=labels)

# Fare quartiles: learn the edges on the training fares and reuse them for the
# test fares, so a given label means the same fare range in both frames.
# Running qcut separately on the test frame would give it its own edges.
fare_labels = [1, 2, 3, 4]
train_df["FareBin"], fare_edges = pd.qcut(
    train_df["Fare"], 4, labels=fare_labels, retbins=True
)
# Open the outer edges so test fares outside the training range still get a bin.
fare_edges[0] = float("-inf")
fare_edges[-1] = float("inf")
test_df["FareBin"] = pd.cut(test_df["Fare"], bins=fare_edges, labels=fare_labels)

# Interaction features
train_df["Age_Fare"] = train_df["Age"] * train_df["Fare"]
train_df["Pclass_FamilySize"] = train_df["Pclass"] * train_df["FamilySize"]

test_df["Age_Fare"] = test_df["Age"] * test_df["Fare"]
test_df["Pclass_FamilySize"] = test_df["Pclass"] * test_df["FamilySize"]

# Drop the identifier / free-text columns that are not fed to the model.
cols_to_drop = ["PassengerId", "Name", "Ticket", "Cabin"]
train_df = train_df.drop(columns=[c for c in cols_to_drop if c in train_df.columns])
test_df = test_df.drop(columns=[c for c in cols_to_drop if c in test_df.columns])

In [12]:
# Fill values for the test frame are taken from the training frame: medians for
# the numeric columns, the most frequent value for the categorical ones.
age_median, fare_median, sibsp_median, parch_median, pclass_median = (
    train_df[numeric_col].median()
    for numeric_col in ("Age", "Fare", "SibSp", "Parch", "Pclass")
)
sex_mode, embarked_mode = (
    train_df[categorical_col].mode()[0] for categorical_col in ("Sex", "Embarked")
)

# Column -> replacement for missing entries, applied to the test frame in order.
test_fill_values = {
    "Age": age_median,
    "Fare": fare_median,
    "SibSp": sibsp_median,
    "Parch": parch_median,
    "Pclass": pclass_median,
    "Sex": sex_mode,
    "Embarked": embarked_mode,
}
for fill_col, fill_value in test_fill_values.items():
    test_df[fill_col] = test_df[fill_col].fillna(fill_value)

# Remove any of the non-feature columns that are still present; labels that
# are already gone are skipped.
test_df = test_df.drop(columns=cols_to_drop, errors="ignore")

# Preview the first five rows of the prepared test frame.
test_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,AgeGroup,FareBin,Age_Fare,Pclass_FamilySize
0,3,male,34.5,0,0,7.8292,Q,Mr,1,1,YoungAdult,1,270.1074,3
1,3,female,47.0,1,0,7.0,S,Mrs,2,0,Adult,1,329.0,6
2,2,male,62.0,0,0,9.6875,Q,Mr,1,1,Senior,2,600.625,2
3,3,male,27.0,0,0,8.6625,S,Mr,1,1,YoungAdult,2,233.8875,3
4,3,female,22.0,1,1,12.2875,S,Mrs,3,0,YoungAdult,2,270.325,9


In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns routed to each branch of the ColumnTransformer. Any column listed in
# neither group is discarded, because ColumnTransformer defaults to
# remainder="drop". Pclass is cleaned upstream but was missing from both lists,
# so the model never saw it; it is included here as an ordinal numeric feature.
num_features = ["Pclass", "Age", "SibSp", "Parch", "Fare", "FamilySize", "IsAlone",
                "Age_Fare", "Pclass_FamilySize"]
cat_features = ["Sex", "Embarked", "Title", "AgeGroup", "FareBin"]

# Numeric branch: standardise each column.
num_transformer = Pipeline([
    ("scaler", StandardScaler()),
])

# Categorical branch: dense one-hot encoding with the first level of each
# feature dropped. handle_unknown="ignore" encodes a category unseen during fit
# as all zeros.
# NOTE(review): with drop="first", that all-zero row presumably coincides with
# the dropped reference level - confirm this is acceptable.
cat_transformer = Pipeline([
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first"))
])

# Apply each branch to its own column group and concatenate the results.
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
])

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# Target vector and feature frame. The preprocessor selects the columns it
# needs by name, so X can carry every remaining column.
y = train_df["Survived"]
X = train_df.drop(columns=["Survived"])

# Preprocessing and classifier in one estimator, so each CV fold fits the
# scaler/encoder on its own training split only.
# random_state is fixed: without it the forest is re-seeded on every run, so
# the selected hyper-parameters and the final predictions are not reproducible.
pipeline = Pipeline([
    ("processor", preprocessor),
    ("model", RandomForestClassifier(random_state=42))
])

# 3 * 3 * 3 * 3 * 2 = 162 candidate settings.
param_grid = {
    "model__n_estimators": [100, 200, 300],
    "model__min_samples_leaf": [1, 3, 5],
    "model__max_depth": [10, 20, 30],
    "model__min_samples_split": [3, 5, 8],
    "model__max_features": ["log2", "sqrt"]
}

# Exhaustive search scored by accuracy with 5-fold CV (162 * 5 = 810 fits).
# n_jobs=-1 spreads the fits over all available cores; with the seed fixed
# above this does not change which candidate wins.
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

In [15]:
# Run the hyper-parameter search; fit() returns the fitted search object, so
# the best estimator found can be read straight off the call.
best_model = grid.fit(X, y).best_estimator_

# Predict labels for the prepared test frame with that estimator.
y_pred = best_model.predict(test_df)



In [17]:
# Pair each saved test PassengerId with its predicted label.
submission_columns = {"PassengerId": test_passenger_ids, "Survived": y_pred}
submission = pd.DataFrame(submission_columns)

# Write the two-column result without the DataFrame index.
submission.to_csv("submission.csv", index=False)