In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Load final_train.csv into the working frame and preview its first rows.
df = pd.read_csv("final_train.csv")
df.head()

In [None]:
df_2 = pd.read_csv("train.csv")

# Copy three columns from train.csv under lower-case names.
# Assignment aligns on the row index, so both files are assumed to list the
# same passengers in the same order — TODO confirm.
for source_col in ('Sex', 'Embarked', 'Survived'):
    df[source_col.lower()] = df_2[source_col]

Descriptions


In [None]:
# Structural overview: dimensions, column names, summary statistics and
# distinct-value counts; the per-column dtypes are shown as the cell output.
print(df.shape)
print(df.columns.tolist())
print(df.describe())
print(df.nunique())
df.dtypes

Cleaning column names

In [None]:
# Normalise the column labels: trim whitespace, lower-case, turn spaces into
# underscores, then drop every remaining non-word character.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
    .str.replace(r'[^\w]', "", regex=True)
)

Converting data types

In [None]:
# Inspect the current dtypes before converting any of them.
df.dtypes

In [None]:
# Store the target as boolean and the passenger id as text.
df['survived'] = df['survived'].astype(bool)
df['passengerid'] = df['passengerid'].astype('str')

# Treat the low-cardinality columns as pandas categoricals.
categories = ['pclass', 'sex', 'embarked']
df[categories] = df[categories].astype('category')


Null values

In [None]:
# Missing values per column: absolute counts (printed) and percentages (cell output).
print(df.isnull().sum())
df.isnull().mean() * 100

Missing values in the age column were filled by a prediction model based on:
pclass, sibsp, parch, fare and embarked.

In [None]:
# Fill missing embarkation ports with the most frequent port.
most_common_port = df['embarked'].mode()[0]
df['embarked'] = df['embarked'].fillna(most_common_port)

Missing values in cabin were handled by extracting the deck letter (the first character of the cabin number) from the known cabins and training a model on it, using features such as pclass, fare, sibsp, parch and embarked.

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

No duplicates

In [None]:
# Re-check the per-column missing-value counts after the fills above.
df.isnull().sum()


Exporting the cleaned dataset

In [None]:
# Write the cleaned frame to CSV without the index column.
# Note: CSV does not preserve the category dtypes set earlier.
df.to_csv('titanic_data_cleaning.csv', index=False)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Reload the cleaned dataset written by the export step and preview it.
df = pd.read_csv("titanic_data_cleaning.csv")
df.head()

Univariate Analysis

1) numerical columns

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

num_cols = ["age", "fare", "sibsp", "parch"]

# One histogram (with KDE overlay) followed by summary statistics per numeric column.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax.grid(True)
    plt.show()

    print(f"\n Summary Stats for {col}:\n{df[col].describe()}\n")


2) categorical columns

In [None]:
cat_cols = ["sex", "pclass", "embarked", "survived", "deck"]

# One count plot followed by the value counts (NaN included) per categorical column.
for col in cat_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x=col, ax=ax)
    ax.set_title(f"Count of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    ax.grid(True)
    plt.show()

    print(f"\n🔢 Value Counts for {col}:\n{df[col].value_counts(dropna=False)}\n")


Bivariate and Multivariate analysis

1) Deck vs Survival

In [None]:
# Passenger counts per deck, split by survival.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x="deck", hue="survived", ax=ax)
ax.set_title("Survival Count by Deck")
ax.set_xlabel("Deck")
ax.set_ylabel("Passenger Count")
# The legend labels are positional: they rely on the hue order being False, then True.
ax.legend(title="Survived", labels=["No", "Yes"])
ax.grid(True)
plt.show()


If we can figure out what kind of passengers were staying on deck F, we may uncover some insights, but keep in mind that most of the deck values are predicted rather than observed.

2) pclass vs survival

In [None]:
# Passenger counts per class, split by survival.
plt.figure(figsize=(8,5))
sns.countplot(data=df, x='pclass', hue='survived')
plt.title('Survival Count by Passenger Class')
plt.xlabel("Passenger Class")
plt.ylabel("Count")
# The legend labels are positional: they rely on the hue order being False, then True.
plt.legend(title="Survived", labels=["No", "Yes"])
plt.grid(True)
# plt.show must be called; the bare name only echoes the function object.
plt.show()

Most casualties were in 3rd class; this makes sense because first class may have been prioritized.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Reload the cleaned training data and load the cleaned test data.
df = pd.read_csv("titanic_data_cleaning.csv")
df_2 = pd.read_csv('test_titanic_data_cleaning.csv')

# df is the training dataset and df_2 is the testing dataset

Adding some new features

In [None]:
# Family size = siblings/spouses + parents/children + the passenger.
for frame in (df, df_2):
    frame['family_size'] = frame['sibsp'] + frame['parch'] + 1


In [None]:
# Flag passengers travelling without family (family_size of exactly 1).
for frame in (df, df_2):
    frame['is_alone'] = np.where(frame['family_size'] == 1, 1, 0)


In [None]:
# np.select takes the first matching condition, so anyone aged 14 or under is a
# 'child' regardless of sex; remaining females are 'woman', everyone else 'man'.
is_child = df['age'] <= 14
is_female = df['sex'] == 'female'

conditions = [is_child, is_female]
choices = ['child', 'woman']
df['category'] = np.select(conditions, choices, default='man')

In [None]:
# Same rule for the test frame: the first matching condition wins, so anyone
# aged 14 or under is a 'child'; remaining females are 'woman', the rest 'man'.
is_child = df_2['age'] <= 14
is_female = df_2['sex'] == 'female'

conditions = [is_child, is_female]
choices = ['child', 'woman']
df_2['category'] = np.select(conditions, choices, default='man')

Numerical Scaling

Starting with Fare, first checking outliers

In [None]:
# Box plot of fare to eyeball outliers, followed by its summary statistics.
sns.boxplot(data=df, x='fare')
plt.show()

df['fare'].describe()

In [None]:
# Flag fares outside the 1.5 * IQR fences.
fare = df['fare']
Q1, Q3 = fare.quantile(0.25), fare.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

is_outlier = (fare < lower_bound) | (fare > upper_bound)
outliers = df[is_outlier]
print ("num of outliers :" , len(outliers))
print (outliers)

Since fare has some outliers and its standard deviation is higher than its mean, we will use robust scaling.

In [None]:
from sklearn.preprocessing import RobustScaler
import joblib
# Fit the robust scaler on the training fares only, persist it, then apply the
# same fitted transform to both frames.
scaler = RobustScaler()
scaler.fit(df[['fare']])
joblib.dump(scaler, 'scaler.fare')

df['fare_scaled'] = scaler.transform(df[['fare']])
df_2['fare_scaled'] = scaler.transform(df_2[['fare']])



Analyzing outliers for other numerical columns and applying scaling

In [None]:

num_cols = ['sibsp', 'parch', 'age']

# Box plot plus summary statistics for each remaining numeric column.
for col in num_cols:
    ax = sns.boxplot(data=df, x=col)
    ax.set_title(col)
    plt.show()
    print(df[col].describe())

Outlier handling for age

In [None]:
# Flag ages outside the 1.5 * IQR fences.
age = df['age']
Q1, Q3 = age.quantile(0.25), age.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

is_outlier = (age < lower_bound) | (age > upper_bound)
outliers = df[is_outlier]
print ("num of outliers :" , len(outliers))
print (outliers)

In [None]:
# Cap ages at 60. The same cap is applied to the test frame so that both sets
# reach the age scaler with identically pre-processed values; clipping only the
# training frame would feed the model test ages outside the range it was fit on.
AGE_CAP = 60
df['age'] = df['age'].clip(upper=AGE_CAP)
df_2['age'] = df_2['age'].clip(upper=AGE_CAP)
print(df['age'])

checking for outliers again

In [None]:
# Box plot and summary statistics of age after clipping.
ax = sns.boxplot(data=df, x="age")
ax.set_title('age_clipped')
plt.show()
print(df['age'].describe())

Applying Z score standardization

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the z-score scaler on the training ages only, persist it, then overwrite
# 'age' in both frames with the standardised values.
# Note: this cell is not idempotent — re-running it scales already-scaled ages.
scaler = StandardScaler()
scaler.fit(df[['age']])
joblib.dump(scaler, 'scaler.age')

df['age'] = scaler.transform(df[['age']])
df_2['age'] = scaler.transform(df_2[['age']])


Now we analyze outliers for sibsp and parch, and handle them as well

In [None]:
# Flag sibsp values outside the 1.5 * IQR fences.
sibsp = df['sibsp']
Q1, Q3 = sibsp.quantile(0.25), sibsp.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

is_outlier = (sibsp < lower_bound) | (sibsp > upper_bound)
outliers = df[is_outlier]
print ("num of outliers :" , len(outliers))
print (outliers)

We can't drop our outliers in sibsp because these are important and may give us information later, so we are going to use robust scaling

In [None]:
# Fit the robust scaler on the training sibsp only, persist it, then overwrite
# 'sibsp' in both frames with the scaled values.
# Note: this cell is not idempotent — re-running it scales already-scaled values.
scaler = RobustScaler()
scaler.fit(df[['sibsp']])
joblib.dump(scaler, 'scaler.sibsp')

df['sibsp'] = scaler.transform(df[['sibsp']])
df_2['sibsp'] = scaler.transform(df_2[['sibsp']])

Now parch, first outliers then scaling

In [None]:
# Flag parch values outside the 1.5 * IQR fences.
parch = df['parch']
Q1, Q3 = parch.quantile(0.25), parch.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

is_outlier = (parch < lower_bound) | (parch > upper_bound)
outliers = df[is_outlier]
print ("num of outliers :" , len(outliers))
print (outliers)

In [None]:
# Fit the robust scaler on the training parch only, persist it, then overwrite
# 'parch' in both frames with the scaled values.
# Note: this cell is not idempotent — re-running it scales already-scaled values.
scaler = RobustScaler()
scaler.fit(df[['parch']])
joblib.dump(scaler, 'scaler.parch')

df['parch'] = scaler.transform(df[['parch']])
df_2['parch'] = scaler.transform(df_2[['parch']])

Binary Encoding sex

In [None]:
# Binary-encode sex in both frames; any value outside the mapping becomes NaN.
sex_codes = {'male': 0, 'female': 1}
for frame in (df, df_2):
    frame['sex'] = frame['sex'].map(sex_codes)

Dropping unnecessary columns

In [None]:
# Drop identifier / free-text columns that are not used as features.
# 'cabin_filled' is dropped from the training frame only.
df = df.drop(columns=['passengerid', 'name', 'ticket', 'cabin_filled'])
df_2 = df_2.drop(columns=['passengerid', 'name', 'ticket'])

Finding correlation with survived

In [None]:
from sklearn.preprocessing import LabelEncoder


# Work on a copy so the label encoding below does not touch df.
df_encoded = df.copy()

# Categorical columns to label-encode before computing correlations.
categorical_cols = ['embarked', 'deck', 'category']

label_encoders = {}
for col in categorical_cols:
    if col in df_encoded.columns:
        le = LabelEncoder()
        # LabelEncoder cannot handle NaN, so missing values get their own label.
        df_encoded[col] = df_encoded[col].fillna('Unknown')
        df_encoded[col] = le.fit_transform(df_encoded[col])
        label_encoders[col] = le  # Store encoder in case you need it later

# 1. Start with domain knowledge features
core_features = ['pclass', 'sex', 'age', 'fare_scaled', 'embarked']

# 2. Add engineered features
engineered_features = ['family_size', 'is_alone']

# 3. Report the absolute correlation of each candidate with the target.
#    numeric_only=True keeps corr() from raising if any non-numeric column is
#    still present in the frame (boolean columns such as 'survived' are kept).
final_features = []
correlation_with_target = df_encoded.corr(numeric_only=True)['survived'].abs()

for feature in core_features + engineered_features:
    if feature in correlation_with_target.index:
        print(f"{feature}: {correlation_with_target[feature]:.3f}")
        final_features.append(feature)

print("Final feature set:", final_features)








Only selecting final features

In [None]:
# Columns kept for modelling; the target is appended for the training frame only.
final_features = [
    'pclass', 'sex', 'age', 'fare_scaled', 'embarked',
    'sibsp', 'parch', 'family_size', 'is_alone',
]

# Training frame: features plus the 'survived' target.
df_final = df.loc[:, final_features + ['survived']].copy()

# Test frame: features only.
df_2_final = df_2.loc[:, final_features].copy()

print("Training set shape:", df_final.shape)
print("Test set shape:", df_2_final.shape)
print("Training columns:", df_final.columns.tolist())
print("Test columns:", df_2_final.columns.tolist())

Categorical encoding

In [None]:
# Build the feature matrix X (numeric columns, then one-hot columns) and target y
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ['embarked']
df_cleaned = df_final.copy()
excluded = set(categorical_cols) | {'survived'}
numerical_cols = [col for col in df_cleaned.columns if col not in excluded]

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(df_cleaned[categorical_cols])

X = np.hstack([df_cleaned[numerical_cols].to_numpy(), encoded])
y = df_cleaned['survived'].to_numpy()

# Hold out a stratified 20% of the labelled data for evaluation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Run your comparison


Now applying models

In [None]:
# Import required libraries
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def evaluate_models(X_train, y_train, X_test, y_test):
    """
    Fit a fixed set of classifiers and collect their scores.

    Each candidate is cross-validated on the training split (stratified,
    shuffled 5-fold accuracy), then fit on the whole training split and scored
    on the test split. A candidate that raises is reported on stdout and left
    out of both returned dicts.

    Returns:
        (results, trained_models): ``results`` maps model name to a dict with
        the keys CV_Mean, CV_Std, Test_Accuracy, Precision, Recall, F1_Score
        and CV_Scores; ``trained_models`` maps model name to the fitted
        estimator.
    """
    candidates = {
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42)
    }

    results, trained_models = {}, {}

    banner = "=" * 80
    print(banner)
    print("MODEL PERFORMANCE COMPARISON")
    print(banner)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for name, estimator in candidates.items():
        print(f"\nTesting {name}...")

        try:
            fold_scores = cross_val_score(estimator, X_train, y_train, cv=folds, scoring='accuracy')

            estimator.fit(X_train, y_train)
            predictions = estimator.predict(X_test)

            scores = {
                'CV_Mean': fold_scores.mean(),
                'CV_Std': fold_scores.std(),
                'Test_Accuracy': accuracy_score(y_test, predictions),
                'Precision': precision_score(y_test, predictions, average='weighted'),
                'Recall': recall_score(y_test, predictions, average='weighted'),
                'F1_Score': f1_score(y_test, predictions, average='weighted'),
                'CV_Scores': fold_scores
            }
        except Exception as e:
            # Best effort: report the failure and move on to the next candidate.
            print(f"  Error with {name}: {str(e)}")
        else:
            results[name] = scores
            trained_models[name] = estimator

            print(f"  Cross-Val Accuracy: {scores['CV_Mean']:.4f} (+/- {scores['CV_Std']*2:.4f})")
            print(f"  Test Accuracy: {scores['Test_Accuracy']:.4f}")
            print(f"  F1-Score: {scores['F1_Score']:.4f}")

    return results, trained_models

def plot_model_comparison(results):
    """
    Draw a 2x2 summary figure for a results mapping.

    Panels: cross-validation accuracy (with standard-deviation error bars),
    test accuracy, F1-score, and a heatmap of all scalar metrics per model.

    Args:
        results: mapping of model name -> metrics dict containing at least
            CV_Mean, CV_Std, Test_Accuracy, Precision, Recall and F1_Score.
            A CV_Scores entry, if present, is ignored.
    """
    import pandas as pd
    df_results = pd.DataFrame(results).T

    # CV_Scores holds one array per model and cannot be plotted as a scalar.
    if 'CV_Scores' in df_results.columns:
        df_results = df_results.drop(columns=['CV_Scores'])

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

    # (axes, metric column, title, y label, extra bar() keyword arguments)
    bar_panels = [
        (axes[0,0], 'CV_Mean', 'Cross-Validation Accuracy', 'Accuracy',
         {'yerr': df_results['CV_Std'], 'capsize': 5}),
        (axes[0,1], 'Test_Accuracy', 'Test Set Accuracy', 'Accuracy',
         {'color': 'orange'}),
        (axes[1,0], 'F1_Score', 'F1-Score Comparison', 'F1-Score',
         {'color': 'green'}),
    ]
    for ax, column, title, ylabel, bar_kwargs in bar_panels:
        ax.bar(df_results.index, df_results[column], alpha=0.7, **bar_kwargs)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True, alpha=0.3)

    metrics_df = df_results[['CV_Mean', 'Test_Accuracy', 'Precision', 'Recall', 'F1_Score']]

    # Force convert to numeric: invalid entries are coerced to NaN, then NaN is
    # replaced with 0 so the heatmap can annotate every cell.
    metrics_df = metrics_df.apply(pd.to_numeric, errors='coerce').fillna(0)

    sns.heatmap(metrics_df.T, annot=True, fmt='.3f', cmap='YlOrRd',
                ax=axes[1,1], cbar_kws={'label': 'Score'})
    axes[1,1].set_title('All Metrics Heatmap')
    axes[1,1].set_xlabel('Models')

    plt.tight_layout()
    plt.show()


def get_top_models(results, top_n=3):
    """
    Rank models by test accuracy, print a summary and return the best names.

    Args:
        results: mapping of model name -> metrics dict with the keys
            Test_Accuracy, CV_Mean, CV_Std, F1_Score, Precision and Recall.
        top_n: how many of the best models to report and return.

    Returns:
        List of up to ``top_n`` model names, highest test accuracy first.
    """
    ranking = list(results.items())
    ranking.sort(key=lambda entry: entry[1]['Test_Accuracy'], reverse=True)
    leaders = ranking[:top_n]

    rule = '=' * 50
    print(f"\n{rule}")
    print(f"TOP {top_n} PERFORMING MODELS")
    print(f"{rule}")

    for rank, (name, metrics) in enumerate(leaders, start=1):
        print(f"\n{rank}. {name}")
        print(f"   Test Accuracy: {metrics['Test_Accuracy']:.4f}")
        print(f"   CV Accuracy: {metrics['CV_Mean']:.4f} (+/- {metrics['CV_Std']*2:.4f})")
        print(f"   F1-Score: {metrics['F1_Score']:.4f}")
        print(f"   Precision: {metrics['Precision']:.4f}")
        print(f"   Recall: {metrics['Recall']:.4f}")

    return [name for name, _ in leaders]

def detailed_model_analysis(model_name, model, X_test, y_test, feature_names=None):
    """
    Print and plot a detailed evaluation of one fitted model.

    Shows the classification report, a confusion-matrix heatmap and, when the
    model exposes ``feature_importances_``, a bar chart and table of the ten
    most important features.

    Args:
        model_name: label used in headings and plot titles.
        model: fitted estimator providing ``predict``.
        X_test: feature matrix to predict on.
        y_test: true labels for ``X_test``.
        feature_names: optional sequence of labels, one per model feature.
            When omitted, or when its length does not match the number of
            importances, generic ``Feature_<i>`` labels are used.
    """
    import pandas as pd
    y_pred = model.predict(X_test)

    print(f"\n{'='*60}")
    print(f"DETAILED ANALYSIS: {model_name}")
    print(f"{'='*60}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Not Survived', 'Survived'],
                yticklabels=['Not Survived', 'Survived'])
    plt.title(f'Confusion Matrix: {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        # Use caller-supplied labels only when they line up with the model's
        # features; otherwise fall back to positional names.
        if feature_names is not None and len(feature_names) == len(importances):
            labels = list(feature_names)
        else:
            labels = [f'Feature_{i}' for i in range(len(importances))]
        importance_df = pd.DataFrame({
            'Feature': labels,
            'Importance': importances
        }).sort_values('Importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(data=importance_df.head(10), x='Importance', y='Feature')
        plt.title(f'Feature Importance: {model_name}')
        plt.xlabel('Importance')
        plt.show()

        print("\nTop 10 Most Important Features:")
        print(importance_df.head(10))

def create_ensemble_model(top_models, trained_models, X_train, y_train):
    """
    Build and fit a soft-voting ensemble from the best-ranked models.

    Args:
        top_models: model names ordered best first; at most the first three
            are used.
        trained_models: mapping of model name -> estimator.
        X_train: training feature matrix.
        y_train: training labels.

    Returns:
        The fitted ``VotingClassifier``.
    """
    print(f"\n{'='*50}")
    print("CREATING ENSEMBLE MODEL")
    print(f"{'='*50}")

    ensemble_estimators = [(name, trained_models[name]) for name in top_models[:3]]

    ensemble_model = VotingClassifier(
        estimators=ensemble_estimators,
        voting='soft'  # Use probabilities for voting
    )

    ensemble_model.fit(X_train, y_train)

    print(f"Ensemble created with models: {[name for name, _ in ensemble_estimators]}")

    return ensemble_model

def run_model_comparison(X_train, y_train, X_test, y_test, feature_names=None):
    """
    Run the complete model comparison and recommend a model.

    Evaluates the candidate models, plots the comparison, prints the ranking,
    analyses the best single model in detail, builds a voting ensemble and
    recommends the ensemble only if its test accuracy is strictly higher than
    that of the best single model.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels used for scoring.
        feature_names: optional labels for the feature columns, forwarded to
            the detailed analysis so feature importances are readable.

    Returns:
        (results, recommended_model, ensemble_model)

    Raises:
        RuntimeError: if none of the candidate models could be evaluated.
    """
    print("Starting Model Comparison Analysis...")
    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")

    results, trained_models = evaluate_models(X_train, y_train, X_test, y_test)

    # evaluate_models skips any model that raised; without at least one result
    # there is nothing to plot, rank or recommend.
    if not results:
        raise RuntimeError("No model could be evaluated; see the errors printed above.")

    plot_model_comparison(results)

    top_models = get_top_models(results, top_n=3)

    best_model_name = top_models[0]
    best_model = trained_models[best_model_name]
    detailed_model_analysis(best_model_name, best_model, X_test, y_test,
                            feature_names=feature_names)

    ensemble_model = create_ensemble_model(top_models, trained_models, X_train, y_train)

    ensemble_pred = ensemble_model.predict(X_test)
    ensemble_accuracy = accuracy_score(y_test, ensemble_pred)
    ensemble_f1 = f1_score(y_test, ensemble_pred, average='weighted')

    print(f"\n{'='*50}")
    print("ENSEMBLE MODEL PERFORMANCE")
    print(f"{'='*50}")
    print(f"Ensemble Accuracy: {ensemble_accuracy:.4f}")
    print(f"Ensemble F1-Score: {ensemble_f1:.4f}")

    print(f"\n{'='*60}")
    print("FINAL RECOMMENDATIONS")
    print(f"{'='*60}")
    print(f"Best Single Model: {best_model_name}")
    print(f"Best Single Model Accuracy: {results[best_model_name]['Test_Accuracy']:.4f}")
    print(f"Ensemble Model Accuracy: {ensemble_accuracy:.4f}")

    # Ties go to the simpler single model.
    if ensemble_accuracy > results[best_model_name]['Test_Accuracy']:
        print("✓ Recommendation: Use the Ensemble Model")
        recommended_model = ensemble_model
    else:
        print("✓ Recommendation: Use the Best Single Model")
        recommended_model = best_model

    return results, recommended_model, ensemble_model

# Usage example:
# results, best_model, ensemble_model = run_model_comparison(X_train, y_train, X_test, y_test)


In [None]:
# Run the full comparison on the hold-out split. The three return values are the
# per-model metrics, the recommended model and the voting ensemble.
results, best_model, ensemble_model = run_model_comparison(X_train, y_train, X_test, y_test)


Making actual predictions


In [None]:
# Check the test feature frame for missing values before predicting.
df_2_final.isnull().sum()

Categorical encoding for df_2_final which is the testing dataset with the final features


In [None]:
from sklearn.preprocessing import OneHotEncoder




categorical_cols = ['embarked']

# 'survived' is the target and exists only in the training frame. Leaving it in
# the numeric columns would leak the label into x_train and give x_train one
# more column than x_test.
numerical_cols = [col for col in df_final.columns if col not in categorical_cols + ['survived']]
numerical_cols_test = [col for col in df_2_final.columns if col not in categorical_cols]

# Encode categorical
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(df_final[categorical_cols])

# Save the encoder
joblib.dump(encoder, 'encoder_titanic.pkl')

X_test_encoded = encoder.transform(df_2_final[categorical_cols])

# Combine features: numeric columns first, then the one-hot columns
x_train = np.concatenate([
    df_final[numerical_cols].values,
    X_train_encoded
], axis=1)

x_test = np.concatenate([
    df_2_final[numerical_cols_test].values,
    X_test_encoded
], axis=1)

# Both matrices must have the same feature layout.
assert numerical_cols == numerical_cols_test, "train/test feature columns differ"
assert x_train.shape[1] == x_test.shape[1], "train/test feature counts differ"





best_model is the model chosen in the model comparison; it was trained on the training dataset.

In [None]:
# Predict labels for the encoded test features with the recommended model.
y_test_pred = best_model.predict(x_test)





In [None]:
# Persist the recommended model to disk with joblib.
joblib.dump(best_model, 'titanic_model.pkl')


In [None]:

# Pair each PassengerId from the raw test file with its predicted label and
# save the result. Assumes test.csv rows are in the same order as the rows
# used to build x_test — TODO confirm.
df_original_test = pd.read_csv('test.csv')

p_id = df_original_test['PassengerId']
survived = y_test_pred

new_df = pd.DataFrame({'id': p_id, 'survived': survived})
new_df.to_csv('test_predictions.csv', index=False)
