In [1]:
# Load libraries and data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Read the train and test CSV files (paths are relative to the working directory)
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Report the dimensions of each frame, then preview the training rows
for caption, frame in (('Training data shape:', train_df), ('Test data shape:', test_df)):
    print(caption, frame.shape)
train_df.head()

Training data shape: (891, 12)
Test data shape: (418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Data preprocessing and feature engineering
def preprocess_data(df, age_fill=None, fare_fill=None, embarked_fill='S'):
    """Return a cleaned, feature-engineered copy of a passenger DataFrame.

    The input frame is never modified. The following columns are produced:

    * ``Age`` / ``Fare``  - missing values replaced by ``age_fill`` / ``fare_fill``.
    * ``Embarked``        - missing values replaced by ``embarked_fill``, then
                            encoded as S=0, C=1, Q=2.
    * ``Sex``             - encoded as male=0, female=1.
    * ``FamilySize``      - ``SibSp + Parch + 1`` (the passenger counts too).
    * ``IsAlone``         - 1 when ``FamilySize == 1``, otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Age``, ``Fare``, ``Embarked``, ``SibSp``, ``Parch``
        and ``Sex`` columns.
    age_fill, fare_fill : float, optional
        Values used to impute missing ``Age`` / ``Fare``. When omitted, the
        median of the corresponding column of ``df`` itself is used. Pass
        statistics computed on the training frame when processing
        validation/test data so that every split is imputed consistently.
    embarked_fill : str, optional
        Port code used for missing ``Embarked`` values (default ``'S'``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns (re-encoded where noted) plus
        ``FamilySize`` and ``IsAlone`` appended at the end.

    Notes
    -----
    ``Series.map`` turns any category that is not listed in the encoding
    dictionaries into NaN, so unexpected ``Sex`` / ``Embarked`` values show up
    as missing values in the result.
    """
    # Fall back to the frame's own medians only when the caller supplies none.
    if age_fill is None:
        age_fill = df['Age'].median()
    if fare_fill is None:
        fare_fill = df['Fare'].median()

    family_size = df['SibSp'] + df['Parch'] + 1

    # assign() works on a copy: existing columns are replaced in place and the
    # two new columns are appended in the order given here.
    return df.assign(
        Age=df['Age'].fillna(age_fill),
        Fare=df['Fare'].fillna(fare_fill),
        Embarked=df['Embarked'].fillna(embarked_fill).map({'S': 0, 'C': 1, 'Q': 2}),
        FamilySize=family_size,
        IsAlone=(family_size == 1).astype('int64'),
        Sex=df['Sex'].map({'male': 0, 'female': 1}),
    )

# Run the shared preprocessing over each split.
# NOTE(review): preprocess_data is called without fill values here, so each
# frame is imputed from its own Age/Fare medians (test is not filled with
# training statistics).
train_processed, test_processed = (
    preprocess_data(frame) for frame in (train_df, test_df)
)

# Sanity check: count the nulls that remain per column in the training frame
print('Missing values in processed training data:')
print(train_processed.isna().sum())
train_processed.head()

Missing values in processed training data:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
FamilySize       0
IsAlone          0
dtype: int64


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,2,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0,2,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,1,1


In [3]:
# Columns handed to the models: original fields plus the two engineered
# family features created during preprocessing
feature_columns = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
    'Fare', 'Embarked', 'FamilySize', 'IsAlone',
]

# Target vector and feature matrix, both taken from the processed training frame
y = train_processed['Survived']
X = train_processed[feature_columns]

print('Features selected:', feature_columns, sep='\n')
print('\nFeature matrix shape:', X.shape)
print('Target variable shape:', y.shape)

Features selected:
['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize', 'IsAlone']

Feature matrix shape: (891, 9)
Target variable shape: (891,)


In [4]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# partition repeatable between runs.
# NOTE(review): the split is not stratified - the recorded run shows survival
# rates of ~0.376 (train) vs ~0.413 (validation). Consider stratify=y.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

for caption, features in (('Training set size:', X_train), ('Validation set size:', X_val)):
    print(caption, features.shape)
for caption, target in (('Training survival rate:', y_train), ('Validation survival rate:', y_val)):
    print(caption, target.mean())

Training set size: (712, 9)
Validation set size: (179, 9)
Training survival rate: 0.37640449438202245
Validation survival rate: 0.4134078212290503


In [5]:
# Fit a 100-tree random forest on the training split (fit() returns the estimator)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluate on the held-out validation split
rf_pred = rf_model.predict(X_val)
rf_accuracy = accuracy_score(y_val, rf_pred)

print(
    'Random Forest Results:',
    f'Accuracy: {rf_accuracy:.4f}',
    '\nClassification Report:',
    classification_report(y_val, rf_pred),
    sep='\n',
)

Random Forest Results:
Accuracy: 0.8212

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       105
           1       0.79      0.77      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



In [6]:
# Fit the logistic-regression baseline; max_iter is raised to 1000 iterations
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
)
lr_model.fit(X_train, y_train)

# Evaluate on the same validation split used for the random forest
lr_pred = lr_model.predict(X_val)
lr_accuracy = accuracy_score(y_val, lr_pred)

print(
    'Logistic Regression Results:',
    f'Accuracy: {lr_accuracy:.4f}',
    '\nClassification Report:',
    classification_report(y_val, lr_pred),
    sep='\n',
)

Logistic Regression Results:
Accuracy: 0.7989

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [7]:
# Validation accuracy of the two fitted models, side by side
print(
    'Model Comparison:',
    f'Random Forest Accuracy: {rf_accuracy:.4f}',
    f'Logistic Regression Accuracy: {lr_accuracy:.4f}',
    sep='\n',
)

# Pair every feature name with the forest's importance score, then rank the
# rows from most to least important (the original row labels are retained)
importance_by_feature = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_by_feature.sort_values(by='importance', ascending=False)

print('\nFeature Importance (Random Forest):')
print(feature_importance)

Model Comparison:
Random Forest Accuracy: 0.8212
Logistic Regression Accuracy: 0.7989

Feature Importance (Random Forest):
      feature  importance
1         Sex    0.279290
5        Fare    0.257920
2         Age    0.242859
0      Pclass    0.086490
7  FamilySize    0.042812
6    Embarked    0.032882
3       SibSp    0.029642
4       Parch    0.019079
8     IsAlone    0.009026


In [8]:
# Refit a random forest on every labelled row (training + validation splits)
final_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Score the processed test passengers using the same feature columns
X_test = test_processed[feature_columns]
test_predictions = final_model.predict(X_test)

print(
    f'Made predictions for {len(test_predictions)} passengers in test set',
    f'Predicted survival rate: {test_predictions.mean():.4f}',
    sep='\n',
)

Made predictions for 418 passengers in test set
Predicted survival rate: 0.3828


In [9]:
# Two-column submission frame: passenger ids from the raw test file paired
# positionally with the predicted labels
submission = test_df[['PassengerId']].assign(Survived=test_predictions)

# Write it out without the index column
submission.to_csv('titanic_submission.csv', index=False)

print('Submission file created: titanic_submission.csv')
print('\nFirst 10 predictions:', submission.head(10), sep='\n')
print(f'\nTotal predictions: {len(submission)}')

Submission file created: titanic_submission.csv

First 10 predictions:
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         1
4          896         1
5          897         0
6          898         0
7          899         0
8          900         1
9          901         0

Total predictions: 418


In [10]:
# Model evaluation summary
validation_scores = {
    'Random Forest': rf_accuracy,
    'Logistic Regression': lr_accuracy,
}
# Derive the winner from the measured validation scores instead of hard-coding
# it. max() returns the first maximal key, so Random Forest wins an exact tie.
best_model_name = max(validation_scores, key=validation_scores.get)

print('=== MODEL EVALUATION SUMMARY ===')
for model_name, score in validation_scores.items():
    print(f'{model_name} Validation Accuracy: {score:.4f}')
print(f'\nBest Model: {best_model_name}')
# The two lines below describe test_predictions as produced by an earlier cell.
print(f'Test Set Predictions: {len(test_predictions)} passengers')
print(f'Predicted Survival Rate: {test_predictions.mean():.4f}')
print('\nMost Important Features:')
# itertuples avoids building a Series per row just to read two fields
for row in feature_importance.head(5).itertuples(index=False):
    print(f'{row.feature}: {row.importance:.4f}')

=== MODEL EVALUATION SUMMARY ===
Random Forest Validation Accuracy: 0.8212
Logistic Regression Validation Accuracy: 0.7989

Best Model: Random Forest
Test Set Predictions: 418 passengers
Predicted Survival Rate: 0.3828

Most Important Features:
Sex: 0.2793
Fare: 0.2579
Age: 0.2429
Pclass: 0.0865
FamilySize: 0.0428
