# **ML Pipeline with Scikit-learn**

### **Install Libraries**

In [None]:
%pip install pandas scikit-learn joblib

### **Load and Explore the Dataset**

In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv("Telco-Customer-Churn.csv")

# Quick look
print(df.shape)
print(df.head())


(7043, 21)
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Co

### **Data Preprocessing with Pipelines**

**Separate features and target**

In [3]:
X = df.drop('Churn', axis=1)
y = df['Churn'].map({'Yes': 1, 'No': 0})  # Convert to binary


**Identify column types**

In [4]:
from sklearn.compose import make_column_selector as selector

categorical_features = selector(dtype_include='object')(X)
numerical_features = selector(dtype_exclude='object')(X)

print("Categorical:", categorical_features)
print("Numerical:", numerical_features)


Categorical: ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges']
Numerical: ['SeniorCitizen', 'tenure', 'MonthlyCharges']


**Define preprocessing pipelines**

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Preprocessing for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessors
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features)
])


### **Build Pipelines for ML Models**

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Logistic Regression pipeline
lr_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Random Forest pipeline
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier())
])


### **Train & Evaluate the Models**

In [7]:
from sklearn.model_selection import train_test_split, cross_val_score

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
lr_pipeline.fit(X_train, y_train)
rf_pipeline.fit(X_train, y_train)

# Evaluate
print("LR Accuracy:", lr_pipeline.score(X_test, y_test))
print("RF Accuracy:", rf_pipeline.score(X_test, y_test))


LR Accuracy: 0.8261178140525195
RF Accuracy: 0.7970191625266146


### **Hyperparameter Tuning with GridSearchCV**

In [8]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [5, 10, None]
}

# Use only with rf_pipeline
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=3, n_jobs=-1, scoring="accuracy")
grid_search.fit(X_train, y_train)

print("Best Params:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)


Best Params: {'classifier__max_depth': None, 'classifier__n_estimators': 200}
Best Accuracy: 0.7949946751863685


### **Export the Best Pipeline**

In [9]:
import joblib

# Save the best pipeline
joblib.dump(grid_search.best_estimator_, "churn_pipeline.pkl")


['churn_pipeline.pkl']