In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Build a synthetic classification problem: 1000 rows and 10 numeric features,
# 5 of which are informative. The fixed random_state keeps the generated data
# identical from run to run.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    random_state=42,
)


In [3]:
# Wrap the feature matrix in a DataFrame with columns feature_0 .. feature_9
# and append the labels as a 'target' column, so later cells can address
# columns by name when injecting missing values and extra features.
df = pd.DataFrame(
    data=X,
    columns=[f"feature_{idx}" for idx in range(10)],
).assign(target=y)

In [4]:
# Simulate missing values: blank out every 10th row of feature_1 so the
# numeric imputer further down has something to fill.
df.loc[::10, 'feature_1'] = np.nan

# Add two synthetic categorical columns (for demonstration).
# A locally seeded Generator is used instead of the unseeded global
# np.random state so that Restart & Run All produces the same City/Gender
# values every time, in line with the random_state=42 used for data generation
# and the train/test split.
rng = np.random.default_rng(42)
n_rows = len(df)  # follow the frame's length rather than repeating 1000
df['City'] = rng.choice(['Delhi', 'Mumbai', 'Chennai'], size=n_rows)
df['Gender'] = rng.choice(['Male', 'Female'], size=n_rows)

In [5]:
# Separate the predictors from the label column.
# Note: this rebinds X and y, replacing the raw arrays returned by
# make_classification with their DataFrame / Series counterparts.
X = df.drop(columns='target')
y = df.loc[:, 'target']

# Hold out 20% of the rows for the final evaluation; random_state pins the
# shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Column groups routed to the two preprocessing branches.
# The categorical names are listed by hand; the numeric ones are picked up by
# dtype (the string-valued City/Gender columns do not match the filter).
cat_features = ['City', 'Gender']
num_features = list(X.select_dtypes(include=['float64', 'int']).columns)

# Numeric branch: fill missing values with the column mean, then standardise.
num_processor = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)


In [7]:
# Categorical branch: replace missing entries with the literal 'missing'
# category, then one-hot encode. handle_unknown='ignore' means a category not
# seen during fit does not raise at transform time.
cat_processor = Pipeline(
    steps=[
        ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [8]:
# Route each column group through its own branch and concatenate the results:
# num_features -> num_processor, cat_features -> cat_processor.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_processor, num_features),
        ('cat', cat_processor, cat_features),
    ]
)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score


In [10]:
# End-to-end estimator: preprocessing followed by logistic regression.
# Keeping both steps in one Pipeline means the imputer/scaler/encoder are
# re-fit inside every cross-validation fold rather than on the full data.
# The step names ('preprocessing', 'classifier') are what the
# 'classifier__...' hyperparameter keys refer to later on.
model_pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000)),
    ]
)


In [11]:
# Baseline: 5-fold cross-validated accuracy of the untuned pipeline,
# computed on the training split only.
cv_scores = cross_val_score(
    estimator=model_pipeline,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Hyperparameter tuning: try four regularisation strengths C with the lbfgs
# solver. The 'classifier__' prefix targets the 'classifier' pipeline step.
param_grid = dict(
    classifier__C=[0.01, 0.1, 1, 10],
    classifier__solver=['lbfgs'],
)

grid = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid.fit(X_train, y_train)


Cross-validation scores: [0.84375 0.83125 0.825   0.80625 0.8    ]
Mean CV accuracy: 0.82125


In [12]:
# Report the parameter combination that scored best in the grid search.
best_params = grid.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'classifier__C': 0.01, 'classifier__solver': 'lbfgs'}


In [13]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay


In [14]:
# Evaluate with the best pipeline found by the grid search.
best_model = grid.best_estimator_

# Hard class labels for the held-out rows.
y_pred = best_model.predict(X_test)

# Scores for ROC AUC: keep only column index 1 of predict_proba, i.e. the
# second class in the estimator's class order (label 1 for this 0/1 target).
y_proba = best_model.predict_proba(X_test)[:, 1]


In [15]:
# Compute the held-out metrics first, then print them.
# The report and confusion matrix use the hard labels; ROC AUC uses the
# predicted probabilities.
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)
print("ROC AUC Score:", auc)

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.82      0.84       112
           1       0.79      0.84      0.81        88

    accuracy                           0.83       200
   macro avg       0.83      0.83      0.83       200
weighted avg       0.83      0.83      0.83       200

Confusion Matrix:
 [[92 20]
 [14 74]]
ROC AUC Score: 0.9073660714285715
