In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Read the original training data
train_df = pd.read_csv("F:\\Titanic\\train.csv")

# Feature engineering
# Title from Name
train_df['Title'] = train_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
train_df['Title'] = train_df['Title'].replace(['Lady', 'Countess', 'Dona'], 'Royalty')
train_df['Title'] = train_df['Title'].replace(['Mme'], 'Mrs')
train_df['Title'] = train_df['Title'].replace(['Mlle', 'Ms'], 'Miss')
train_df['Title'] = train_df['Title'].replace(['Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer'], 'Special')

# Family Size
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1

# Is Alone
train_df['IsAlone'] = 0
train_df.loc[train_df['FamilySize'] == 1, 'IsAlone'] = 1

# Age Group
train_df['AgeGroup'] = pd.cut(train_df['Age'], bins=[0, 12, 18, 60, 200], labels=['Child', 'Teenager', 'Adult', 'Elderly'])

# Fare per Person
train_df['FarePerPerson'] = train_df['Fare'] / train_df['FamilySize']

# Cabin Deck
train_df['Deck'] = train_df['Cabin'].str[:1]
train_df['Deck'].fillna('Unknown', inplace=True)

# Convert categorical variables into dummy/indicator variables
train_df = pd.get_dummies(train_df, columns=['Sex', 'Embarked', 'Title', 'AgeGroup', 'Deck'])

# Drop unnecessary columns
train_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
train_df_imputed = pd.DataFrame(imputer.fit_transform(train_df), columns=train_df.columns)

# Split the data into features and target variable
X = train_df_imputed.drop('Survived', axis=1)
y = train_df_imputed['Survived']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)
y_pred_rf = random_forest_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest")
print("Accuracy:", accuracy_rf)
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("\n")

# Logistic Regression
logistic_regression_model = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), LogisticRegression(max_iter=1000, random_state=42))
logistic_regression_model.fit(X_train, y_train)
y_pred_lr = logistic_regression_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression")
print("Accuracy:", accuracy_lr)
print("Classification Report:")
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))
print("\n")

# Support Vector Machine
svm_model = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), SVC(random_state=42))
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Support Vector Machine")
print("Accuracy:", accuracy_svm)
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))
print("\n")

# Gradient Boosting Classifier
gradient_boosting_model = GradientBoostingClassifier(random_state=42)
gradient_boosting_model.fit(X_train, y_train)
y_pred_gb = gradient_boosting_model.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Classifier")
print("Accuracy:", accuracy_gb)
print("Classification Report:")
print(classification_report(y_test, y_pred_gb))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb))
print("\n")

Random Forest
Accuracy: 0.8324022346368715
Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.87      0.86       105
         1.0       0.81      0.78      0.79        74

    accuracy                           0.83       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.83      0.83      0.83       179

Confusion Matrix:
[[91 14]
 [16 58]]


Logistic Regression
Accuracy: 0.8212290502793296
Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85       105
         1.0       0.78      0.78      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.82      0.82      0.82       179

Confusion Matrix:
[[89 16]
 [16 58]]


Support Vector Machine
Accuracy: 0.7988826815642458
Classification Report:
              precision    recall  f1-score   support

