In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the training and test CSVs from the Kaggle input directory
train_df, test_df = map(
    pd.read_csv,
    ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv'),
)

# Feature Engineering
#
# Adds FamilySize, IsAlone, Title, FareBin and AgeBin columns to both frames
# in place.
#
# The fill values and the fare quartile edges are computed once from the
# training frame and reused for the test frame. Running pd.qcut separately on
# each frame would give each its own quartile edges, so the same FareBin code
# would describe different fare ranges in train and test.
age_fill = train_df['Age'].median()
fare_fill = train_df['Fare'].median()
_, fare_edges = pd.qcut(train_df['Fare'].fillna(fare_fill), 4, labels=False, retbins=True)
# Open the outer edges so the lowest training fare lands in bin 0 (as qcut
# does) and fares outside the training range still receive a bin.
fare_edges[0], fare_edges[-1] = -np.inf, np.inf

# Infrequent honorifics that are collapsed into a single 'Other' category.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

for df in [train_df, test_df]:
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    # Title = the word that precedes the first '.' after a space in Name.
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    df['Title'] = (
        df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        .replace(['Mlle', 'Ms'], 'Miss')
        .replace(['Mme'], 'Mrs')
        .replace(rare_titles, 'Other')
    )
    df['FareBin'] = pd.cut(df['Fare'].fillna(fare_fill), bins=fare_edges, labels=False)
    df['AgeBin'] = pd.cut(df['Age'].fillna(age_fill), bins=[0, 12, 20, 40, 60, 100], labels=False)

# Label column to predict, and the columns used as model inputs
target = 'Survived'
features = [
    'Pclass',
    'Sex',
    'AgeBin',
    'SibSp',
    'Parch',
    'FareBin',
    'Embarked',
    'Title',
    'FamilySize',
    'IsAlone',
]

# Stack the training rows on top of the test rows so both pass through the
# same preprocessing steps
data = pd.concat([frame[features] for frame in (train_df, test_df)], axis=0).copy()

# Replace each missing entry with the most frequent value of its column
imputer = SimpleImputer(strategy='most_frequent')
filled_values = imputer.fit_transform(data)
data = pd.DataFrame(filled_values, columns=data.columns)

# Convert categorical variables to numerical
#
# One-hot encodes Sex, Embarked and Title (dropping the first level of each)
# and swaps the original columns for the encoded ones.
#
# The encoder is left at its default output and densified explicitly: the
# `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4, so passing either spelling fails on some versions.
categorical_features = ['Sex', 'Embarked', 'Title']
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
encoded_matrix = encoder.fit_transform(data[categorical_features])
if hasattr(encoded_matrix, 'toarray'):
    # Sparse result -> plain ndarray so it can back a regular DataFrame.
    encoded_matrix = encoded_matrix.toarray()
data_encoded = pd.DataFrame(
    encoded_matrix,
    columns=encoder.get_feature_names_out(categorical_features),
)
# Reset the index so the column-wise concat below aligns row-for-row.
data = data.drop(columns=categorical_features).reset_index(drop=True)
data = pd.concat([data, data_encoded], axis=1)

# Undo the stacking: the first len(train_df) rows are the training rows,
# everything after them belongs to the test set
y_train = train_df[target]
X_train, X_test = data.iloc[:len(train_df)], data.iloc[len(train_df):]

# Fit a random forest with fixed hyperparameters and a fixed seed
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=7,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
).fit(X_train, y_train)

# Predict a label for every test row
y_pred = model.predict(X_test)

# Pair each PassengerId with its prediction and write the result as CSV
# (row index omitted)
submission = test_df[['PassengerId']].assign(Survived=y_pred)
submission.to_csv('/kaggle/working/submission.csv', index=False)

print("Submission file 'submission.csv' has been created and saved in /kaggle/working/")




Submission file 'submission.csv' has been created and saved in /kaggle/working/
