In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Build a synthetic binary-classification problem: 1000 samples, 20 features,
# with a fixed seed so the generated data is the same on every run.
from sklearn.datasets import make_classification
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_classes=2,
    random_state=42,
)

# Wrap the feature matrix in a DataFrame so columns can be selected by name/dtype.
feature_names = [f'feature_{i}' for i in range(20)]
X = pd.DataFrame(X, columns=feature_names)

# Step 2: Preprocess the data
# Partition the columns by dtype so each group gets its own preprocessing branch.
num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object']).columns

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown='ignore' keeps transform from failing on
# categories that were not present when the encoder was fitted.
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_steps, num_features),
    ('cat', categorical_steps, cat_features),
])

# Step 3: Hold out 20% of the rows as a test set; the fixed seed keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Define the models and hyperparameters for Grid Search
# Candidate estimators, keyed by a display name that is also the key into
# `param_grid` below.
#
# Estimators that consume random numbers while fitting are given an explicit
# seed so that grid-search winners and test scores are repeatable after a
# kernel restart (42 matches the seed used for data generation and the split):
#   - DecisionTreeClassifier permutes features at each split,
#   - RandomForestClassifier bootstraps samples and sub-samples features,
#   - LogisticRegression uses the seed with the 'liblinear' solver searched below.
# SVC only uses its seed when probability=True and KNeighborsClassifier has no
# random_state parameter, so both are left at their defaults.
models = {
    'LogisticRegression': LogisticRegression(random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

# Hyperparameter grids, one per entry in `models`. Every key carries the
# `model__` prefix because the estimator is searched as the pipeline step named
# 'model', and GridSearchCV addresses nested parameters as `<step>__<param>`.
param_grid = {
    'LogisticRegression': {
        'model__C': [0.01, 0.1, 1],
        'model__solver': ['liblinear']
    },
    'DecisionTree': {
        'model__max_depth': [10, 20],
        'model__min_samples_split': [2, 5]
    },
    'RandomForest': {
        'model__n_estimators': [10, 50],
        'model__max_depth': [10, 20],
        'model__min_samples_split': [2, 5]
    },
    'SVM': {
        'model__C': [0.1, 1],
        'model__kernel': ['linear']
    },
    'KNN': {
        'model__n_neighbors': [3, 5],
        'model__weights': ['uniform']
    }
}

# Step 5: Perform Grid Search and evaluate models
# For every candidate: tune it with 5-fold cross-validated grid search on the
# training data, keep the refitted best pipeline, then score it on the held-out
# test set and record the outcome.
best_models = {}
results = {}
for model_name, estimator in models.items():
    print(f"Performing Grid Search for {model_name}...")

    # Preprocessing and estimator are searched together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator),
    ])
    grid_search = GridSearchCV(
        pipeline,
        param_grid[model_name],
        cv=5,
        scoring='accuracy',
    )
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_

    # Held-out evaluation with the best configuration found by the search.
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = dict(
        best_params=grid_search.best_params_,
        accuracy=accuracy,
        classification_report=classification_report(y_test, y_pred, output_dict=True),
    )

    # Human-readable progress report for this model.
    report_text = classification_report(y_test, y_pred)
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy}")
    print(report_text)

# Step 6: Compare results
# Print, for each model, its winning hyperparameters, test accuracy and the
# per-class metrics laid out with one row per class/average.
print("Summary of best models and their performance:")
for model_name, result in results.items():
    report_table = pd.DataFrame(result['classification_report']).T
    print(f"{model_name}:")
    print(f"Best parameters: {result['best_params']}")
    print(f"Accuracy: {result['accuracy']}")
    print(report_table)
    print()


Performing Grid Search for LogisticRegression...
Best parameters for LogisticRegression: {'model__C': 0.01, 'model__solver': 'liblinear'}
Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.78      0.94      0.85        93
           1       0.93      0.78      0.85       107

    accuracy                           0.85       200
   macro avg       0.86      0.86      0.85       200
weighted avg       0.86      0.85      0.85       200

Performing Grid Search for DecisionTree...
Best parameters for DecisionTree: {'model__max_depth': 10, 'model__min_samples_split': 2}
Accuracy: 0.86
              precision    recall  f1-score   support

           0       0.82      0.90      0.86        93
           1       0.91      0.82      0.86       107

    accuracy                           0.86       200
   macro avg       0.86      0.86      0.86       200
weighted avg       0.86      0.86      0.86       200

Performing Grid Search for RandomForest...
Be