In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Load the raw training set.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")

# Columns removed from both the train and test frames before modelling.
columns_to_drop = ['Name', 'Ticket', 'Cabin']
# Integer codes for the embarkation port. Not referenced by the cells shown
# here (Embarked is one-hot encoded below); kept in case other cells use it.
embarked_map = {'S': 0, 'C': 1, 'Q': 2}

# `columns=` already selects the column axis, so no `axis=1` is needed, and
# assigning the result (instead of inplace=True) keeps the step explicit.
train_data = train_data.drop(columns=columns_to_drop)

# Impute missing ages with the training mean; `age_mean` is reused for the test set.
age_mean = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(age_mean)

# Impute missing embarkation ports with the most frequent training value;
# `embarked` is reused for the test set.
embarked = train_data['Embarked'].mode()[0]
train_data['Embarked'] = train_data['Embarked'].fillna(embarked)

# One-hot encode the categorical columns.
train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked'])

# Call head() so the cell displays the first rows; the bare attribute
# `train_data.head` only shows the bound-method repr.
train_data.head()

In [None]:
# Mean fare of the training set, used to impute missing test fares.
fare_mean = train_data['Fare'].mean()

# Load the test set, drop the same columns as for training, and fill the gaps
# with statistics computed on the training data.
test_data = (
    pd.read_csv("/kaggle/input/titanic/test.csv")
    .drop(columns=columns_to_drop)
    .fillna({'Age': age_mean, 'Embarked': embarked, 'Fare': fare_mean})
)

# One-hot encode the same categorical columns as the training frame.
test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked'])

# Give the test frame exactly the training columns (in training order);
# columns absent from the test frame are created and filled with 0.
train_data, test_data = train_data.align(
    test_data, join='left', axis=1, fill_value=0
)

In [None]:
# Every column except the identifier and the target is a model feature.
non_feature_columns = ('PassengerId', 'Survived')
features = [name for name in train_data.columns if name not in non_feature_columns]

X, y = train_data[features], train_data["Survived"]

# Hold out 20% of the rows for validation, stratified on the target.
train_X, val_X, train_Y, val_Y = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

def eval_dt(max_leaf_nodes):
    """Return the validation accuracy of a decision tree limited to `max_leaf_nodes` leaves.

    The tree is fit on the module-level train_X/train_Y split and scored on
    val_X/val_Y with accuracy_score.
    """
    tree = DecisionTreeClassifier(random_state=42, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_Y)
    return accuracy_score(val_Y, tree.predict(val_X))

def eval_rf(max_leaf_nodes):
    """Return the validation accuracy of a random forest limited to `max_leaf_nodes` leaves per tree.

    The forest is fit on the module-level train_X/train_Y split and scored on
    val_X/val_Y with accuracy_score.
    """
    forest_params = {
        'n_estimators': 500,
        'max_leaf_nodes': max_leaf_nodes,
        'max_depth': None,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'class_weight': 'balanced',
        'random_state': 42,
        'n_jobs': -1,
    }
    forest = RandomForestClassifier(**forest_params)
    forest.fit(train_X, train_Y)
    return accuracy_score(val_Y, forest.predict(val_X))

In [None]:
# Candidate max_leaf_nodes values: 4 through 80 inclusive.
leaf_grid = range(4, 81)

# Score every candidate for each model family as (leaf_count, accuracy) pairs.
dt_scores = list(zip(leaf_grid, map(eval_dt, leaf_grid)))
rf_scores = list(zip(leaf_grid, map(eval_rf, leaf_grid)))


def _by_accuracy(pair):
    """Sort key: the accuracy component of a (leaf_count, accuracy) pair."""
    return pair[1]


# max() returns the first pair with the top accuracy, i.e. the smallest leaf count on ties.
best_dt = max(dt_scores, key=_by_accuracy)
best_rf = max(rf_scores, key=_by_accuracy)
print(f"Best DT: {best_dt[0]} leaves, val ACC={best_dt[1]:.4f}")
print(f"Best RF: {best_rf[0]} leaves, val ACC={best_rf[1]:.4f}")

In [None]:
# Refit the forest on the full training data with the best leaf count from the sweep.
final_rf_params = {
    'n_estimators': 500,
    'max_leaf_nodes': best_rf[0],
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,
}
best_rf_model = RandomForestClassifier(**final_rf_params)
best_rf_model.fit(X, y)

In [None]:
# Predict on the test rows using the same feature columns the model was fit on.
test_X = test_data.loc[:, features]
predictions = best_rf_model.predict(test_X)

# Pair each passenger id with its predicted label and write the submission file.
output = test_data[['PassengerId']].assign(Survived=predictions)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")