# Explore here

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib # For saving the model and preprocessors
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [8]:
# --- 1.1: Load the Titanic dataset ---
print("--- 1.1: Loading the Titanic dataset ---")
# Using a direct URL for convenience, similar to previous projects
titanic_url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'

try:
    df = pd.read_csv(titanic_url)
    print("Titanic dataset loaded successfully.")
    print("DataFrame head:")
    print(df.head())
    print("\nDataFrame info:")
    df.info()
    print("\nSurvival (target) value counts:")
    print(df['Survived'].value_counts())
except Exception as e:
    print(f"Error loading Titanic dataset: {e}")
    print("Please ensure the URL is correct or check your internet connection.")

--- 1.1: Loading the Titanic dataset ---
Titanic dataset loaded successfully.
DataFrame head:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0      

In [9]:
# --- 1.2: Basic EDA and Preprocessing ---
print("\n--- 1.2: Basic EDA and Preprocessing ---")

# Drop irrelevant columns for this basic model
# 'PassengerId', 'Name', 'Ticket', 'Cabin' are often dropped or require complex processing
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
print("Dropped 'PassengerId', 'Name', 'Ticket', 'Cabin' columns.")

# Handle missing values
# 'Age': Fill with median
df['Age'].fillna(df['Age'].median(), inplace=True)
# 'Embarked': Fill with mode (most frequent value)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
print("Handled missing values in 'Age' (median) and 'Embarked' (mode).")

print("\nMissing values after handling:")
print(df.isnull().sum())

# Define features (X) and target (y)
X = df.drop('Survived', axis=1)
y = df['Survived']
print(f"\nShape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# Identify numerical and categorical features for preprocessing pipelines
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch']
categorical_features = ['Pclass', 'Sex', 'Embarked'] # Pclass is numerical but treated as categorical here

# Create preprocessing pipelines for numerical and categorical features
# Numerical pipeline: Impute (if not already done) and Scale
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical pipeline: One-hot encode
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')) # handle_unknown='ignore' for unseen categories in test set
])

# Create a preprocessor using ColumnTransformer
# This applies different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

print("\nPreprocessing pipelines defined.")


--- 1.2: Basic EDA and Preprocessing ---
Dropped 'PassengerId', 'Name', 'Ticket', 'Cabin' columns.
Handled missing values in 'Age' (median) and 'Embarked' (mode).

Missing values after handling:
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

Shape of X: (891, 7)
Shape of y: (891,)

Preprocessing pipelines defined.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [10]:
# --- 1.3: Train a simple classification model (Logistic Regression) ---
print("\n--- 1.3: Training a Logistic Regression model ---")

# Combine preprocessing and model into a single pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000)) # Increased max_iter for convergence
])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

# Train the model
print("Training the model...")
model_pipeline.fit(X_train, y_train)
print("Model training complete.")




--- 1.3: Training a Logistic Regression model ---
Shape of X_train: (712, 7)
Shape of X_test: (179, 7)
Training the model...
Model training complete.


In [11]:
# --- 1.4: Evaluate the model ---
print("\n--- 1.4: Evaluating the model ---")
y_pred = model_pipeline.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


--- 1.4: Evaluating the model ---
Accuracy: 0.8045
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.89      0.85       110
           1       0.79      0.67      0.72        69

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

Confusion Matrix:
 [[98 12]
 [23 46]]


In [12]:
# --- 1.5: Save the model and preprocessors ---
print("\n--- 1.5: Saving the model and preprocessors ---")

# Ensure the 'models' directory exists
models_dir = 'models/'
os.makedirs(models_dir, exist_ok=True)

# Save the entire pipeline (which includes the preprocessor and classifier)
model_filename = os.path.join(models_dir, 'titanic_survival_predictor_pipeline.joblib')
joblib.dump(model_pipeline, model_filename)
print(f"Full model pipeline saved to {model_filename}")

# Note: When saving the entire pipeline, you don't need to save scaler/onehotencoder separately,
# as they are part of the pipeline.

print("\nModel training and saving complete. You can now proceed to Step 2 to build your Streamlit app.")



--- 1.5: Saving the model and preprocessors ---
Full model pipeline saved to models/titanic_survival_predictor_pipeline.joblib

Model training and saving complete. You can now proceed to Step 2 to build your Streamlit app.
