In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# --- Step 1: data -----------------------------------------------------------
# Iris features/labels; 20% of the rows are held out for the final test score.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Step 2: model ----------------------------------------------------------
# Standardize -> project with PCA -> classify with an SVM, chained as one
# estimator so the search below tunes all steps together.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', SVC()),
])

# --- Step 3: search space ---------------------------------------------------
# Keys follow the "<step name>__<parameter>" convention used by Pipeline.
param_grid = dict(
    pca__n_components=[2, 3],
    classifier__C=[0.1, 1, 10],
    classifier__kernel=['linear', 'rbf'],
)

# --- Step 4: exhaustive grid search (GridSearchCV's default CV splitting) ----
grid = GridSearchCV(estimator=pipe, param_grid=param_grid)
grid.fit(X_train, y_train)

# --- Step 5: report ---------------------------------------------------------
cv_score = grid.best_score_
test_score = grid.score(X_test, y_test)
print("Best parameters found:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(cv_score))
print("Test set score: {:.2f}".format(test_score))


Check 3-fold, 5-fold, and 7-fold cross-validation.

Replace the SVC classifier with RandomForestClassifier, LogisticRegression, Perceptron, and KNN (KNeighborsClassifier).

Update the param_grid accordingly (e.g., for RandomForestClassifier, use n_estimators, max_depth, etc.)

Also replace GridSearchCV with RandomizedSearchCV.

Replace the dataset with your own CSV dataset using the code below:

In [19]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import randint
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier

# 1. Load dataset and clean columns
df = pd.read_csv('datasets_ICP5/hypertension_dataset.csv')
# Strip surrounding whitespace from the header names so the column lookups
# below match even if the CSV header is padded.
df.columns = df.columns.str.strip()

# 2. Features and target split
X = df.drop('Has_Hypertension', axis=1)
y = df['Has_Hypertension']

# 3. Column types: numeric and categorical columns for data preprocessing
numerical_cols = ['Age', 'Salt_Intake', 'Stress_Score', 'Sleep_Duration', 'BMI']
categorical_cols = ['BP_History', 'Medication', 'Family_History', 'Exercise_Level', 'Smoking_Status']

# Fail early, with a readable message, if the CSV lacks a column that the
# preprocessor selects by name.
missing_cols = [col for col in numerical_cols + categorical_cols if col not in X.columns]
assert not missing_cols, f"Columns missing from dataset: {missing_cols}"

# 4. Preprocessing using ColumnTransformer
# Numeric columns are standardized and categorical columns are one-hot encoded
# (categories unseen during fit are ignored at transform time). Columns of X
# that appear in neither list are dropped (ColumnTransformer's default).
# sparse_threshold=0 forces a dense output: the one-hot block is sparse, and
# the PCA step that follows the preprocessor in the pipelines may not accept
# sparse input.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], sparse_threshold=0)

# Candidate models and their hyperparameter search spaces.
# Each entry maps a display name to (estimator, parameter distributions); the
# 'classifier__' prefix targets the pipeline step named 'classifier'.
models = {
    # random_state is fixed so repeated runs build the same forests.
    "RandomForest": (RandomForestClassifier(random_state=42), {
        'classifier__n_estimators': randint(50, 200),  # integers in [50, 200)
        'classifier__max_depth': randint(3, 20)        # integers in [3, 20)
    }),
    "LogisticRegression": (LogisticRegression(max_iter=1000), {
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__penalty': ['l2']
    }),
    "Perceptron": (Perceptron(max_iter=1000), {
        'classifier__penalty': ['l2', 'elasticnet'],
        'classifier__alpha': [0.0001, 0.001, 0.01]
    }),
    "KNN": (KNeighborsClassifier(), {
        'classifier__n_neighbors': [3, 5, 7, 9],
        'classifier__weights': ['uniform', 'distance']
    })
}

# 5. Train-test split
# This must happen before the search loop: the loop fits on X_train / y_train,
# so splitting afterwards makes the cell depend on variables left over from an
# earlier cell or a previous run (and fail on a fresh kernel).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

# 6. Hyperparameter tuning with RandomizedSearchCV for several CV fold counts
for cv_fold in [3, 5, 7]:
    print(f"\n=== Cross-validation: {cv_fold}-fold ===")
    for name, (clf, param_dist) in models.items():
        # Preprocess -> PCA -> classifier, tuned as a single estimator.
        pipe = Pipeline([
            ('preprocessor', preprocessor),
            ('pca', PCA()),
            ('classifier', clf)
        ])
        # The PCA dimensionality is searched together with the
        # model-specific parameters; 10 candidates are sampled per search.
        search = RandomizedSearchCV(pipe, {
            'pca__n_components': [2, 3, 5],
            **param_dist
        }, n_iter=10, cv=cv_fold, random_state=42)
        search.fit(X_train, y_train)
        print(f"\n{name} | Best Params: {search.best_params_}")
        print(f"{name} | CV Score: {search.best_score_:.2f}")
        print(f"{name} | Test Score: {search.score(X_test, y_test):.2f}")

# 7. Untuned pipeline with PCA and SVM, fitted on the same split
pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('pca', PCA(n_components=5)),
    ('classifier', SVC())
])

# 8. Fit pipeline
pipe.fit(X_train, y_train)

# 9. Predict and evaluate
y_pred = pipe.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


=== Cross-validation: 3-fold ===

RandomForest | Best Params: {'classifier__max_depth': 9, 'classifier__n_estimators': 171, 'pca__n_components': 5}
RandomForest | CV Score: 0.65
RandomForest | Test Score: 0.65

LogisticRegression | Best Params: {'pca__n_components': 5, 'classifier__penalty': 'l2', 'classifier__C': 0.01}
LogisticRegression | CV Score: 0.64
LogisticRegression | Test Score: 0.64

Perceptron | Best Params: {'pca__n_components': 5, 'classifier__penalty': 'elasticnet', 'classifier__alpha': 0.0001}
Perceptron | CV Score: 0.62
Perceptron | Test Score: 0.61

KNN | Best Params: {'pca__n_components': 5, 'classifier__weights': 'uniform', 'classifier__n_neighbors': 5}
KNN | CV Score: 0.63
KNN | Test Score: 0.61

=== Cross-validation: 5-fold ===

RandomForest | Best Params: {'classifier__max_depth': 6, 'classifier__n_estimators': 153, 'pca__n_components': 5}
RandomForest | CV Score: 0.65
RandomForest | Test Score: 0.65

LogisticRegression | Best Params: {'pca__n_components': 5, 'cl