In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score

# Load dataset
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
data = pd.read_csv(url)

# Feature Engineering
data['FamilySize'] = data['SibSp'] + data['Parch']
data['IsAlone'] = (data['FamilySize'] == 0).astype(int)
data['Fare'] = data['Fare'].fillna(data['Fare'].median())
data['Nickname'] = data['Name'].apply(lambda x:x.split()[1][:-1])


# Define features and target variable
X = data.drop(['Survived', 'Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)
y = data['Survived']

# Define categorical and numerical columns
categorical_features = ['Sex', 'Embarked','Nickname']
numerical_features = ['Age', 'Fare', 'FamilySize', 'Pclass', 'SibSp', 'Parch']

# Preprocessing steps for each column type
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')), 
            ('scaler', StandardScaler())]), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_features)
    ])

# Model pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning with GridSearch
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 300}
Test Accuracy: 82.12%


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Load dataset
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
data = pd.read_csv(url)

# Feature Engineering
data['FamilySize'] = data['SibSp'] + data['Parch']
data['IsAlone'] = (data['FamilySize'] == 0).astype(int)
data['Fare'] = data['Fare'].fillna(data['Fare'].median())

# Simplify titles
data['Title'] = data['Name'].apply(lambda x:x.split()[1][:-1])
data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 
                                       'Sir', 'Jonkheer', 'Dona'], 'Rare')
data['Title'] = data['Title'].replace(['Mlle', 'Ms'], 'Miss')
data['Title'] = data['Title'].replace('Mme', 'Mrs')

# Bin ages into categories (optional)
data['AgeBin'] = pd.cut(data['Age'], bins=[0, 12, 18, 35, 60, 100], labels=['Child', 'Teen', 'Young Adult', 'Adult', 'Senior'])

# Drop unnecessary columns
X = data.drop(['Survived', 'Name', 'Ticket', 'Cabin', 'PassengerId', 'Age'], axis=1)
y = data['Survived']

# Define categorical and numerical columns
categorical_features = ['Sex', 'Embarked', 'Title', 'AgeBin']
numerical_features = ['Fare', 'FamilySize', 'Pclass', 'SibSp', 'Parch', 'IsAlone']

# Preprocessing steps for each column type
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')), 
            ('scaler', StandardScaler())]), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_features)
    ])

# Define models
rf_clf = RandomForestClassifier(random_state=42, n_estimators=200, max_depth=10)
gb_clf = GradientBoostingClassifier(random_state=42, n_estimators=200, max_depth=5)
xgb_clf = XGBClassifier(random_state=42, n_estimators=200, max_depth=5, use_label_encoder=False, eval_metric='logloss')

# Ensemble with Voting Classifier
ensemble_model = VotingClassifier(estimators=[
    ('rf', rf_clf), ('gb', gb_clf), ('xgb', xgb_clf)], voting='soft')

# Model pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ensemble_model)
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model_pipeline.fit(X_train, y_train)

# Evaluate model
y_pred = model_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 83.24%


Parameters: { "use_label_encoder" } are not used.

