# Pipeline 

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Load dataset
data = load_iris()
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Data scaling
    ('rf', RandomForestClassifier(random_state=42))  # Step 2: Random Forest Classifier
])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'rf__n_estimators': [50, 100],  # Number of trees in the forest
    'rf__max_depth': [10, 20],      # Maximum depth of the trees
    'rf__min_samples_split': [2, 5]  # Minimum samples required to split a node
}

# Perform Grid Search with Cross Validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5)  # 5-fold cross-validation

# Train the model using the training data
grid_search.fit(X_train, y_train)

# Best model after hyperparameter tuning
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Best Model Parameters: {grid_search.best_params_}')
print(f'Test Set Accuracy: {accuracy:.4f}')


Best Model Parameters: {'rf__max_depth': 10, 'rf__min_samples_split': 2, 'rf__n_estimators': 50}
Test Set Accuracy: 1.0000


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Load dataset
data = load_iris()
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Data scaling
    ('rf', RandomForestClassifier(random_state=42))  # Step 2: Random Forest Classifier
])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'rf__n_estimators': [50, 100],  # Number of trees in the forest
    'rf__max_depth': [10, 20],      # Maximum depth of the trees
    'rf__min_samples_split': [2, 5]  # Minimum samples required to split a node
}

# Perform Grid Search with Cross Validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5)  # 5-fold cross-validation

# Train the model using the training data
grid_search.fit(X_train, y_train)

# Best model after hyperparameter tuning
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Best Model Parameters: {grid_search.best_params_}')
print(f'Test Set Accuracy: {accuracy:.4f}')


Selecting best model in Pipeline

In [2]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Load the Titanic dataset from Seaborn
titanic_data = sns.load_dataset('titanic')

# Select features and target variable
X = titanic_data[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = titanic_data['survived']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a list of models to evaluate
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
]

best_model = None
best_accuracy = 0.0

# Iterate over the models and evaluate their performance
for name, model in models:
    # Create a pipeline for each model
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])
    
    # Perform cross-validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    
    # Calculate mean accuracy
    mean_accuracy = scores.mean()
    
    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = pipeline.predict(X_test)
    
    # Calculate accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    
    # Print the performance metrics
    print("Model:", name)
    print("Cross-validation Accuracy:", mean_accuracy)
    print("Test Accuracy:", accuracy)
    print()
    
    # Check if the current model has the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

# Retrieve the best model
print("Best Model:", best_model)

Model: Random Forest
Cross-validation Accuracy: 0.7991529597163399
Test Accuracy: 0.8379888268156425

Model: Gradient Boosting
Cross-validation Accuracy: 0.8061952132374668
Test Accuracy: 0.7988826815642458

Best Model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])


Why Random Forest is the Best Model:

    Performance Metrics:
        Cross-validation Accuracy: 0.7991, which is slightly lower than Gradient Boosting (0.8062).
        Test Accuracy: 0.8379, higher than Gradient Boosting (0.7989).
        Random Forest performed better on the test set, indicating that it generalizes better to unseen data.

    Generalization:
        Gradient Boosting had better cross-validation accuracy, but it underperformed on the test set. This suggests that it may have overfit the training data slightly during cross-validation.
        Random Forest strikes a better balance between training and test performance, making it a more reliable choice.

    Pipeline Steps:
        Imputation: Handles missing values with the most frequent value in the dataset, ensuring that the model can process all data.
        Encoding: One-hot encoding ensures that categorical variables are transformed into numerical format without introducing bias.
        Model: Random Forest is robust to overfitting due to its ensemble nature, especially on datasets with a mix of numerical and categorical features.

    Practicality:
        Random Forest is less sensitive to hyperparameter tuning compared to Gradient Boosting, making it easier to implement and optimize for real-world problems.

Conclusion:

    Random Forest was chosen because of its higher test accuracy and better generalization.
    While Gradient Boosting had better cross-validation performance, its test accuracy indicates it might not generalize as well as Random Forest in this case.
    Random Forest's robustness and ability to handle a mix of features effectively make it the superior choice for this problem.



The code evaluates two machine learning models (Random Forest and Gradient Boosting) on the Titanic dataset using a pipeline that includes:

    Preprocessing Steps:
        SimpleImputer: Fills missing values in the dataset using the most frequent value.
        OneHotEncoder: Encodes categorical variables into numerical format using one-hot encoding.

    Model Training:
        Uses both Random Forest and Gradient Boosting models.

    Cross-Validation:
        Calculates the average accuracy across 5 folds to evaluate model stability.

    Test Performance:
        Evaluates each model on a held-out test set to calculate accuracy.

    Best Model Selection:
        The model with the highest test accuracy is stored as the best_model.

Performance Output:

For each model, it prints:

    Cross-validation Accuracy: Average accuracy across the 5 folds.
    Test Accuracy: Accuracy on the test data.

Finally, it identifies and prints the best-performing model based on test accuracy.