In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

"""
Logistic Regression is a classification algorithm 
used to predict the probability of categorical outcomes based on input features.
It is commonly used when the target variable is binary (two classes),
but it can also be extended to handle multi-class classification problems.

What makes it logistic regression is the use of the logistic function (also known as the sigmoid function) 
to model the relationship between the input features and the probability of belonging to a specific class. 
The logistic function maps any real-valued number to a value between 0 and 1, 
representing the probability of the input belonging to the positive class.
"""

# Load the wine dataset

data = load_wine()

# Split the dataset into features (X) and target variable (y)
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a LogisticRegression model
model = LogisticRegression()

# Create a LogisticRegression model with different parameters
# model = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


"""
The cross_val_score function splits the data into multiple folds, 
trains the model on different subsets of the data, 
and evaluates the model's performance on the remaining fold. 
It returns an array of scores, one for each fold. 
We print the cross-validation scores and calculate the average score using the mean() method.

Cross-validation helps to provide a more reliable estimate of the model's performance 
by evaluating it on multiple subsets of the data. 
It gives you an idea of how the model performs on unseen data 
"""

# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)  # 5-fold cross-validation

# Print the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Average Score:", cv_scores.mean())

"""
After the grid search is complete, 
we can print the best hyperparameters and the corresponding best score 
using best_params_ and best_score_ attributes, respectively. 
We can also access the best model 
using best_estimator_ 
and evaluate its performance on the test set using score
"""

# Define the hyperparameters to tune
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the model on the test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test Score:", test_score)

# Create an SVC model
model = SVC()

# Define the hyperparameters to tune
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5) # 5-fold cross validation
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and score
print("Best Hyperparameters for SVC:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the model on the test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test Score:", test_score)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.9722222222222222


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-Validation Scores: [0.88888889 0.94444444 0.94444444 1.         1.        ]
Average Score: 0.9555555555555555




Best Hyperparameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score: 0.9507389162561577
Test Score: 0.9722222222222222
Best Hyperparameters for SVC: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best Score: 0.9576354679802955
Test Score: 1.0


In [2]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
import numpy as np

# Create a SVC classifier
svc = SVC()

# Define a range of hyperparameters and their possible values
param_dist = {
    'C': np.logspace(-3, 3, 7),       # Range of values for the regularization parameter
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Different kernel types
    'gamma': ['scale', 'auto'] + list(np.logspace(-3, 3, 7)),  # Values for gamma
    'degree': [2, 3, 4],              # Degree for polynomial kernel
}

# Create a RandomizedSearchCV instance
# Number of jobs to run in parallel : -1 means using all available CPU cores for parallel computation
random_search = RandomizedSearchCV(svc, param_distributions=param_dist, n_iter=100, cv=5, n_jobs=-1)

# Fit the random search to your training data
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", random_search.best_params_)

# Evaluate the model with the best hyperparameters on the test data
best_svc = random_search.best_estimator_
accuracy = best_svc.score(X_test, y_test)
print("Test Accuracy:", accuracy)


Best Hyperparameters: {'kernel': 'linear', 'gamma': 0.001, 'degree': 4, 'C': 0.1}
Test Accuracy: 1.0


In [4]:
import pandas as pd

# Create a DataFrame from the diabetes dataset for the features and target variable
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Display the DataFrame
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [5]:
df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,0.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,0.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,0.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,1.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,2.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,2.0
