<a href="https://colab.research.google.com/github/CaroMusangi1/InternIntelligence_EndtoEndMachineLearningPipeline/blob/master/Task5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# 📦 Step 1: Import Basic Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# 🔧 Step 2: Install Required Packages (Google Colab Only)

!pip install optuna xgboost scikit-learn

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [7]:
# 📊 Step 3: Load and Preprocess the Titanic Dataset

# Load selected columns and drop missing values
df = sns.load_dataset("titanic")[['age', 'fare', 'sex', 'class', 'survived']].dropna()

# Convert categorical variables to numeric
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df['class'] = df['class'].map({'First': 1, 'Second': 2, 'Third': 3})

# Define features and target
X = df.drop('survived', axis=1)
y = df['survived']

In [8]:
# 🧱 Step 4: Define the Preprocessing and Modeling Pipeline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer

# Select numeric features
num_features = ['age', 'fare', 'class']

# Create a numeric transformer pipeline
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Combine transformers in a ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features)
])

# Create the full pipeline with a Logistic Regression model
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

In [9]:
# 🧪 Step 5: Train/Test Split and Model Training

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
clf.fit(X_train, y_train)

In [10]:
# 📈 Step 6: Make Predictions and Evaluate the Model

# Predict on the test set
y_pred = clf.predict(X_test)

# Print classification metrics
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.80      0.74        87
           1       0.60      0.45      0.51        56

    accuracy                           0.66       143
   macro avg       0.64      0.63      0.63       143
weighted avg       0.65      0.66      0.65       143



In [11]:
### 🧾 Example Output (May Vary Slightly)

              precision    recall  f1-score   support

           0       0.69      0.80      0.74        87
           1       0.60      0.45      0.51        56

    accuracy                           0.66       143
   macro avg       0.64      0.63      0.63       143
weighted avg       0.65      0.66      0.65       143


In [12]:
# 💾 Step 7: Save the Trained Pipeline Model

import joblib
joblib.dump(clf, "ml_pipeline_model.pkl")

['ml_pipeline_model.pkl']