In [5]:
# ===========================
# Logistic Regression: Titanic Survival Prediction
# ===========================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------
# 1. Load Datasets
# ---------------------------
train_df = pd.read_csv(r'C:\CJ\All Models\Logistic Regression\Dataset\train.csv')
test_df = pd.read_csv(r'C:\CJ\All Models\Logistic Regression\Dataset\test.csv')
submission_df = pd.read_csv(r'C:\CJ\All Models\Logistic Regression\Dataset\gender_submission.csv')

# ---------------------------
# 2. Preprocessing Function
# ---------------------------
def preprocess(df, is_train=True):
    # Fill missing values
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    
    # Encode categorical features
    le = LabelEncoder()
    for col in ['Sex', 'Embarked']:
        if col in df.columns:
            df[col] = le.fit_transform(df[col])
    
    # Keep PassengerId separately for test set
    passenger_ids = None
    if 'PassengerId' in df.columns:
        passenger_ids = df['PassengerId'].copy()
    
    # Drop unnecessary columns
    drop_cols = ['Cabin', 'Name', 'Ticket', 'PassengerId']
    df = df.drop(columns=drop_cols, errors='ignore')
    
    return df, passenger_ids

# ---------------------------
# 3. Preprocess train and test
# ---------------------------
train_processed, _ = preprocess(train_df, is_train=True)
test_processed, test_ids = preprocess(test_df, is_train=False)

# ---------------------------
# 4. Split Train Data
# ---------------------------
X_train = train_processed.drop('Survived', axis=1)
y_train = train_processed['Survived']

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(test_processed)

# ---------------------------
# 5. Train Logistic Regression
# ---------------------------
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# ---------------------------
# 6. Evaluate on Training Set
# ---------------------------
y_train_pred = model.predict(X_train_scaled)
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_train, y_train_pred))
print("\nClassification Report:\n", classification_report(y_train, y_train_pred))

# ---------------------------
# 7. Feature Importance
# ---------------------------
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\nFeature Importance:\n", feature_importance)

# Plot feature importance
plt.figure(figsize=(8,5))
sns.barplot(x='Coefficient', y='Feature', data=feature_importance)
plt.title("Feature Importance in Logistic Regression")
plt.tight_layout()
plt.savefig(r"C:\CJ\All Models\Logistic Regression\Output\Survivors.png", dpi=300)
plt.show()

# ---------------------------
# 8. Predict on Test Set
# ---------------------------
y_test_pred = model.predict(X_test_scaled)
y_test_prob = model.predict_proba(X_test_scaled)[:, 1]

# ---------------------------
# 9. Prepare Submission
# ---------------------------
submission_df['Survived'] = y_test_pred
submission_df.to_csv(r'C:\CJ\All Models\Logistic Regression\Output\titanic_submission.csv', index=False)
print("Submission file saved as 'titanic_submission.csv'")

# ---------------------------
# 10. Sample Predictions with Probabilities
# ---------------------------
pred_df = pd.DataFrame({
    'PassengerId': test_ids,
    'Predicted_Survived': y_test_pred,
    'Survival_Probability': y_test_prob
})
print(pred_df.head())

# ---------------------------
# 11. Optional: Plot Survival Probabilities
# ---------------------------
plt.figure(figsize=(10,5))
sns.histplot(y_test_prob, bins=20, kde=True)
plt.title("Predicted Survival Probabilities (Test Set)")
plt.xlabel("Probability of Survival")
plt.ylabel("Number of Passengers")
plt.tight_layout()
plt.savefig(r"C:\CJ\All Models\Logistic Regression\Output\Survival_Probabilities.png", dpi=300)
plt.show()


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- PassengerId
