In [1]:
#Importing Dependencies 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Loading the data from SQL databases for Red and White Wines
# Connect to SQLite database and retrieve red wine data
conn = sqlite3.connect('red_wine_quality.db')
red_wine_df = pd.read_sql_query("SELECT * FROM red_wine_quality", conn)
conn.close()

# Connect to SQLite database and retrieve white wine data
conn = sqlite3.connect('white_wine_quality.db')
white_wine_df = pd.read_sql_query("SELECT * FROM white_wine_quality", conn)
conn.close()

In [3]:
# Defining a function to Train, predict and extract the final metrics (Confusion matrix & Classification Report)
def train_and_evaluate(df):
    # Setting up a binary identifier for quality
    df['quality'] = df['quality'].apply(lambda x: 1 if x >= 7 else 0)

    # Separating features and target
    X = df.drop(['quality', 'type'], axis=1)
    y = df['quality']

    # Splitting the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scaling the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Training initial model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred_initial = model.predict(X_test)

    # Evaluating initial model
    conf_matrix_initial = confusion_matrix(y_test, y_pred_initial)
    class_report_initial = classification_report(y_test, y_pred_initial, output_dict=True)
    accuracy_initial = accuracy_score(y_test, y_pred_initial)

    # Hyperparameter tuning
    param_grid = {
        'n_estimators': [100, 200, 300, 400, 500],
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth': [4, 6, 8, 10, 12],
        'criterion': ['gini', 'entropy']
    }
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_

    # Training optimized model
    best_model = RandomForestClassifier(**best_params, random_state=42)
    best_model.fit(X_train, y_train)
    y_pred_optimized = best_model.predict(X_test)

    # Evaluating optimized model
    conf_matrix_optimized = confusion_matrix(y_test, y_pred_optimized)
    class_report_optimized = classification_report(y_test, y_pred_optimized, output_dict=True)
    accuracy_optimized = accuracy_score(y_test, y_pred_optimized)

    return {
        'initial': {
            'conf_matrix': conf_matrix_initial,
            'class_report': class_report_initial,
            'accuracy': accuracy_initial
        },
        'optimized': {
            'conf_matrix': conf_matrix_optimized,
            'class_report': class_report_optimized,
            'accuracy': accuracy_optimized
        }
    }

In [4]:
# Another Function to extract and define which metrics are recorded
def extract_metrics(conf_matrix, class_report, accuracy):
    metrics = {
        'Precision': class_report['weighted avg']['precision'],
        'Recall': class_report['weighted avg']['recall'],
        'F1-Score': class_report['weighted avg']['f1-score'],
        'Support': class_report['weighted avg']['support'],
        'Accuracy': accuracy,
        'Predicted Positive Actual Positive': conf_matrix[1, 1],
        'Predicted Positive Actual Negative': conf_matrix[0, 1],
        'Predicted Negative Actual Positive': conf_matrix[1, 0],
        'Predicted Negative Actual Negative': conf_matrix[0, 0]
    }
    return metrics

In [5]:
#Process the Red and White Wine data

# Process Red Wine
red_wine_results = train_and_evaluate(red_wine_df)

# Process White Wine
white_wine_results = train_and_evaluate(white_wine_df)

Fitting 5 folds for each of 150 candidates, totalling 750 fits


250 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
175 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\james\anaconda3\envs\dev\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\james\anaconda3\envs\dev\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\james\anaconda3\envs\dev\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\james\anaconda3\envs\dev\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter

Fitting 5 folds for each of 150 candidates, totalling 750 fits


250 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
178 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\james\anaconda3\envs\dev\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\james\anaconda3\envs\dev\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\james\anaconda3\envs\dev\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\james\anaconda3\envs\dev\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter

In [6]:
#Creating a dataframe for the results
# Extract metrics for initial and optimized models for red wine
red_initial_metrics = extract_metrics(red_wine_results['initial']['conf_matrix'], red_wine_results['initial']['class_report'], red_wine_results['initial']['accuracy'])
red_optimized_metrics = extract_metrics(red_wine_results['optimized']['conf_matrix'], red_wine_results['optimized']['class_report'], red_wine_results['optimized']['accuracy'])

# Extract metrics for initial and optimized models for white wine
white_initial_metrics = extract_metrics(white_wine_results['initial']['conf_matrix'], white_wine_results['initial']['class_report'], white_wine_results['initial']['accuracy'])
white_optimized_metrics = extract_metrics(white_wine_results['optimized']['conf_matrix'], white_wine_results['optimized']['class_report'], white_wine_results['optimized']['accuracy'])

# Create DataFrame
results_df = pd.DataFrame({
    'Red Wine Initial': red_initial_metrics,
    'Red Wine Optimized': red_optimized_metrics,
    'White Wine Initial': white_initial_metrics,
    'White Wine Optimized': white_optimized_metrics
}).T

print(results_df)

                      Precision    Recall  F1-Score  Support  Accuracy  \
Red Wine Initial       0.891574  0.900000  0.892500    320.0  0.900000   
Red Wine Optimized     0.902702  0.909375  0.901983    320.0  0.909375   
White Wine Initial     0.888151  0.890816  0.884969    980.0  0.890816   
White Wine Optimized   0.882145  0.885714  0.879978    980.0  0.885714   

                      Predicted Positive Actual Positive  \
Red Wine Initial                                    24.0   
Red Wine Optimized                                  25.0   
White Wine Initial                                 145.0   
White Wine Optimized                               144.0   

                      Predicted Positive Actual Negative  \
Red Wine Initial                                     9.0   
Red Wine Optimized                                   7.0   
White Wine Initial                                  25.0   
White Wine Optimized                                29.0   

                      Predi

In [9]:
results_df.to_csv('wine_quality_results_RFC.csv')