In [45]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Define column names for the dataset
column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

# Load the Pima Indians Diabetes dataset.
dataset = pd.read_csv('../data/pima-indians-diabetes.csv', names=column_names, header=0)

In [46]:
dataset.shape

(767, 9)

In [47]:
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [49]:
from sklearn.neighbors import KNeighborsClassifier

# Split the dataset into features (X) and the target variable (y)
X = dataset.drop('Outcome', axis=1)
y = dataset['Outcome']

# Split the data into a training set and a testing set (e.g., 70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the logistic regression model
model = LogisticRegression(solver='liblinear', random_state=42)
# model = KNeighborsClassifier()
# model.fit(X_train, y_train)

pipeline = Pipeline(steps=[('t', trans), ('m', model)])

# evaluate the pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report pipeline performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.770 (0.040)


You can use `PolynomialFeatures` from scikit-learn to generate polynomial features for your dataset. The code below provides an example. In this code, we first apply `PolynomialFeatures` to generate polynomial features of the desired degree (in this case, 2) and then proceed with the standard scaling and logistic regression as before. The polynomial features allow you to capture non-linear relationships in your data, which can be helpful in certain scenarios. Adjust the `degree` parameter in `PolynomialFeatures` as needed based on your specific dataset and problem requirements.

In [40]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Split the dataset into features (X) and the target variable (y)
X = dataset.drop('Outcome', axis=1)
y = dataset['Outcome']

# Split the data into a training set and a testing set (e.g., 70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and fit the PolynomialFeatures transformer to generate polynomial features
poly = PolynomialFeatures(degree=9)  # You can change the degree as needed
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

# Create and train the logistic regression model
# model = LogisticRegression(solver='lbfgs', random_state=42)
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", classification_rep)

Accuracy: 70.56%
Confusion Matrix:
 [[126  25]
 [ 43  37]]
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.83      0.79       151
           1       0.60      0.46      0.52        80

    accuracy                           0.71       231
   macro avg       0.67      0.65      0.65       231
weighted avg       0.69      0.71      0.70       231



In [42]:
# evaluate knn on the sonar dataset with polynomial features transform
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from matplotlib import pyplot
# load dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv"
dataset = read_csv(url, header=None)
data = dataset.values
# separate into input and output columns
X, y = data[:, :-1], data[:, -1]
# ensure inputs are floats and output is an integer label
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
# define the pipeline
trans = PolynomialFeatures(degree=3)
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])
# evaluate the pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report pipeline performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.800 (0.077)


In [54]:
np.logspace(-4, 1, 6)

array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01])

In [66]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

# Split the dataset into features (X) and the target variable (y)
X = dataset.drop('Outcome', axis=1)
y = dataset['Outcome']

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('reduce_dims', PCA()),
        ('clf', SVC())])

param_grid = dict(reduce_dims__n_components=[4,6,8],
                  clf__C=np.logspace(-4, 1, 6),
                  clf__kernel=['rbf','linear'])

grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, n_jobs=1, verbose=2, scoring= 'accuracy')
grid.fit(X, y)

# print("Best Hyperparameters: ", best_params)
print("Best Accuracy: ", grid.best_score_)
print("Best params: ", grid.best_params_)

# print(grid.best_score_)
# print(grid.cv_results_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END clf__C=0.0001, clf__kernel=rbf, reduce_dims__n_components=4; total time=   0.0s
[CV] END clf__C=0.0001, clf__kernel=rbf, reduce_dims__n_components=4; total time=   0.0s
[CV] END clf__C=0.0001, clf__kernel=rbf, reduce_dims__n_components=4; total time=   0.0s
[CV] END clf__C=0.0001, clf__kernel=rbf, reduce_dims__n_components=6; total time=   0.0s
[CV] END clf__C=0.0001, clf__kernel=rbf, reduce_dims__n_components=6; total time=   0.0s
[CV] END clf__C=0.0001, clf__kernel=rbf, reduce_dims__n_components=6; total time=   0.0s
[CV] END clf__C=0.0001, clf__kernel=rbf, reduce_dims__n_components=8; total time=   0.0s
[CV] END clf__C=0.0001, clf__kernel=rbf, reduce_dims__n_components=8; total time=   0.0s
[CV] END clf__C=0.0001, clf__kernel=rbf, reduce_dims__n_components=8; total time=   0.0s
[CV] END clf__C=0.0001, clf__kernel=linear, reduce_dims__n_components=4; total time=   0.0s
[CV] END clf__C=0.0001, clf__kernel=linear, r

In [56]:
pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloading SQLAlchemy-2.0.23-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting tqdm (from optuna)
  Downloading tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hCollecting PyYAML (from optuna)
  Downloading PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m2.4 MB/s[0m eta [3

In [63]:
import optuna
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.metrics import make_scorer

# Split the dataset into features (X) and the target variable (y)
X = dataset.drop('Outcome', axis=1)
y = dataset['Outcome']

def objective(trial):
    # Define the hyperparameters to search for
    reduce_dims_n_components = trial.suggest_int('reduce_dims__n_components', 2, 8)
    clf_C = trial.suggest_loguniform('clf__C', 1e-4, 1e1)
    clf_kernel = trial.suggest_categorical('clf__kernel', ['rbf', 'linear'])

    # Create the pipeline with the suggested hyperparameters
    pipe = Pipeline([
        ('scale', StandardScaler()),
        ('reduce_dims', PCA(n_components=reduce_dims_n_components)),
        ('clf', SVC(C=clf_C, kernel=clf_kernel))])

    # Use cross-validation to evaluate the pipeline's performance
    scoring = make_scorer(accuracy_score)  # You can change the scoring method here
    scores = cross_val_score(pipe, X, y, cv=3, scoring=scoring)

    # Return the negative mean accuracy, as Optuna minimizes the objective function
    return -scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

# Print the best hyperparameters and the corresponding accuracy
best_params = study.best_params
best_score = -study.best_value
print("Best Hyperparameters: ", best_params)
print("Best Accuracy: ", best_score)


[I 2023-11-07 11:29:08,454] A new study created in memory with name: no-name-b1e5cea2-186d-4849-9248-40fd2ba4af28
  clf_C = trial.suggest_loguniform('clf__C', 1e-4, 1e1)
[I 2023-11-07 11:29:08,509] Trial 0 finished with value: -0.7640369689542483 and parameters: {'reduce_dims__n_components': 6, 'clf__C': 1.8996478873044031, 'clf__kernel': 'linear'}. Best is trial 0 with value: -0.7640369689542483.
  clf_C = trial.suggest_loguniform('clf__C', 1e-4, 1e1)
[I 2023-11-07 11:29:08,554] Trial 1 finished with value: -0.7157832924836601 and parameters: {'reduce_dims__n_components': 2, 'clf__C': 0.07699879975341176, 'clf__kernel': 'rbf'}. Best is trial 1 with value: -0.7157832924836601.
  clf_C = trial.suggest_loguniform('clf__C', 1e-4, 1e1)
[I 2023-11-07 11:29:08,601] Trial 2 finished with value: -0.766641135620915 and parameters: {'reduce_dims__n_components': 8, 'clf__C': 0.18418048862295408, 'clf__kernel': 'rbf'}. Best is trial 1 with value: -0.7157832924836601.
  clf_C = trial.suggest_loguni

Best Hyperparameters:  {'reduce_dims__n_components': 3, 'clf__C': 0.0017944133277856945, 'clf__kernel': 'linear'}
Best Accuracy:  0.6505872140522876
