In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# ===== 1) Load dataset =====
# The CSV location may be overridden with the TITANIC_CSV environment variable;
# when it is unset, the original hard-coded location is used.
path = os.environ.get("TITANIC_CSV", r"C:\Users\Ankur\Desktop\Titanic-Dataset.csv")
# Generated files are written to the dataset's folder. abspath() keeps this a
# real directory even when `path` is a bare file name.
output_dir = os.path.dirname(os.path.abspath(path))
df = pd.read_csv(path)

# Quick sanity check of what was loaded: dimensions, first rows, null counts.
print("Shape:", df.shape)
print(df.head())
print("\nMissing values:\n", df.isnull().sum())

# ===== 2) Feature engineering =====
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)
# IsAlone is 1 when FamilySize equals 1, otherwise 0.
df['IsAlone'] = df['FamilySize'].eq(1).astype(int)

def extract_title(name):
    """Return the honorific embedded in a passenger name.

    Names are expected to look like ``"Surname, Title. Given names"``. The
    title is the last word before the first period in the segment that
    follows the first comma, so a multi-word prefix such as
    ``"the Countess."`` yields ``"Countess"`` rather than ``"the"``.

    Parameters
    ----------
    name : str or missing value
        Raw passenger name. Anything that is not a string (NaN, None, ...)
        is treated as missing.

    Returns
    -------
    str
        The title without its trailing period, or ``"Unknown"`` when the
        name is missing, has no comma, or has nothing usable after the comma.
    """
    if not isinstance(name, str):
        return "Unknown"

    parts = name.split(',')
    if len(parts) < 2:
        return "Unknown"

    # Only the segment between the first and second comma is inspected.
    segment = parts[1].strip()
    before_period, period, _ = segment.partition('.')
    if period:
        # "the Countess" -> ["the", "Countess"]; the title is the last word.
        words = before_period.split()
    else:
        # No period at all: fall back to the first word after the comma.
        words = segment.split()[:1]

    return words[-1] if words else "Unknown"

# Derive a Title per passenger, then fold every title that occurs fewer than
# 10 times into a single 'Misc' bucket.
df['Title'] = df['Name'].map(extract_title)
title_counts = df['Title'].value_counts()
rare_titles = title_counts.loc[title_counts < 10].index
df['Title'] = df['Title'].replace(rare_titles, 'Misc')

# Missing Embarked values take the most frequent value of the column.
most_common_embarked = df['Embarked'].mode().iloc[0]
df['Embarked'] = df['Embarked'].fillna(most_common_embarked)

# AgeGroup (used for EDA): Age bucketed by the edges below, one label per bucket.
age_group_edges = [0, 12, 18, 30, 50, 80]
age_group_labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=age_group_edges, labels=age_group_labels)

# ===== 3) EDA Visualizations =====
# Bar chart: mean of Survived for each Pclass value, saved as a PNG.
surv_by_pclass = df.groupby('Pclass')['Survived'].mean()
pclass_tick_labels = surv_by_pclass.index.astype(str)
plt.bar(pclass_tick_labels, surv_by_pclass.values)
plt.gca().set(
    xlabel="Pclass",
    ylabel="Survival Rate",
    title="Survival Rate by Passenger Class",
    ylim=(0, 1),
)
pclass_png = os.path.join(output_dir, "survival_by_pclass.png")
plt.savefig(pclass_png)
plt.close()

# Bar chart: mean of Survived for each Sex value, saved as a PNG.
surv_by_sex = df.groupby('Sex')['Survived'].mean()
plt.bar(surv_by_sex.index, surv_by_sex.values)
plt.gca().set(
    xlabel="Sex",
    ylabel="Survival Rate",
    title="Survival Rate by Sex",
    ylim=(0, 1),
)
sex_png = os.path.join(output_dir, "survival_by_sex.png")
plt.savefig(sex_png)
plt.close()

# Survival rate by Age Group
# AgeGroup is categorical (it comes from pd.cut). observed=False is passed
# explicitly so every age-group label keeps a bar and the result no longer
# depends on the pandas default for categorical groupers, which pandas warns
# is changing.
surv_by_agegroup = df.groupby('AgeGroup', observed=False)['Survived'].mean()
plt.bar(surv_by_agegroup.index.astype(str), surv_by_agegroup.values)
plt.xlabel("Age Group")
plt.ylabel("Survival Rate")
plt.title("Survival Rate by Age Group")
plt.ylim(0, 1)
plt.savefig(os.path.join(output_dir, "survival_by_agegroup.png"))
plt.close()

# Age distribution (Survived vs Not)
# Both histograms use one shared set of 25 bins computed from every known
# age. With bins=25 passed to each call separately, each subset would get its
# own edges and widths, and the overlaid bars would not be comparable.
shared_age_bins = np.histogram_bin_edges(df['Age'].dropna(), bins=25)
survived_ages = df.loc[df['Survived'] == 1, 'Age'].dropna()
not_survived_ages = df.loc[df['Survived'] == 0, 'Age'].dropna()
plt.hist(survived_ages, bins=shared_age_bins, alpha=0.7, label="Survived")
plt.hist(not_survived_ages, bins=shared_age_bins, alpha=0.7, label="Not Survived")
plt.xlabel("Age")
plt.ylabel("Count")
plt.title("Age Distribution by Survival")
plt.legend()
plt.savefig(os.path.join(output_dir, "age_distribution.png"))
plt.close()

# Line plot: mean of Survived for each FamilySize value, saved as a PNG.
surv_by_famsize = df.groupby('FamilySize')['Survived'].mean()
plt.plot(surv_by_famsize.index, surv_by_famsize.values, marker='o')
plt.gca().set(
    xlabel="Family Size",
    ylabel="Survival Rate",
    title="Survival Rate by Family Size",
    ylim=(0, 1),
)
famsize_png = os.path.join(output_dir, "survival_by_family_size.png")
plt.savefig(famsize_png)
plt.close()

# Report the folder that received the PNG files.
print(f"EDA plots saved to: {output_dir}")

# ===== 4) Prepare modeling DataFrame =====
# Work on a copy of df without the columns that are not used for modeling.
df_model = df.drop(columns=['Cabin', 'Ticket', 'AgeGroup'])

# Missing ages: first the median age of the passenger's Title group, then the
# overall median for anything still missing.
# NOTE(review): both medians are computed over all rows, i.e. before the
# train/test split — confirm this is acceptable for the evaluation.
title_median_age = df_model.groupby('Title')['Age'].transform('median')
df_model['Age'] = df_model['Age'].fillna(title_median_age)
overall_median_age = df_model['Age'].median()
df_model['Age'] = df_model['Age'].fillna(overall_median_age)

# Features & target
features = [
    'Pclass', 'Sex', 'Age', 'Fare',
    'Embarked', 'Title', 'FamilySize', 'IsAlone',
]
y = df_model['Survived']
X = df_model[features]

# ===== 5) Preprocessing =====
# Column groups handled by each branch of the ColumnTransformer.
numeric_features = ['Age', 'Fare', 'FamilySize']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone']

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by dense one-hot
# encoding; handle_unknown='ignore' is set for categories unseen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# ===== 6) Train/test split =====
# 20% of the rows are held out; the split is stratified on y and seeded.
split_parts = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# ===== 7) Logistic Regression =====
# Preprocessing and classifier are chained so both are fitted on X_train only.
log_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
]
log_clf = Pipeline(steps=log_steps)
log_clf.fit(X_train, y_train)
y_pred_log = log_clf.predict(X_test)

# Report the held-out metrics one per line, then the full per-class report.
print("\nLogistic Regression Metrics:")
for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
):
    print(f"{metric_name}:", metric_fn(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))

# ===== 8) Random Forest =====
# Pipeline stores the step objects it is given without copying them. Passing
# `preprocessor` itself would make this pipeline share one transformer
# instance with any other pipeline built from it, so refitting either would
# silently change the other. clone() gives this pipeline its own unfitted
# copy with identical parameters.
rf_clf = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100))
])
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

# Held-out metrics for the positive class, followed by the per-class report.
print("\nRandom Forest Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1:", f1_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# ===== 9) Save model =====
# Serialise the fitted Random Forest pipeline into the output folder.
model_path = os.path.join(output_dir, "titanic_rf_model.pkl")
with open(model_path, mode="wb") as model_file:
    pickle.dump(rf_clf, model_file)

print(f"\nRandomForest model saved to: {model_path}")


Shape: (891, 12)
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   

  surv_by_agegroup = df.groupby('AgeGroup')['Survived'].mean()


EDA plots saved to: C:\Users\Ankur\Desktop

Logistic Regression Metrics:
Accuracy: 0.8435754189944135
Precision: 0.8153846153846154
Recall: 0.7681159420289855
F1: 0.7910447761194029
              precision    recall  f1-score   support

           0       0.86      0.89      0.88       110
           1       0.82      0.77      0.79        69

    accuracy                           0.84       179
   macro avg       0.84      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179


Random Forest Metrics:
Accuracy: 0.8100558659217877
Precision: 0.7777777777777778
Recall: 0.7101449275362319
F1: 0.7424242424242424
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       110
           1       0.78      0.71      0.74        69

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179


RandomForest model saved to: C