In [1]:
#Author: Michael Elgin (melgin@uwyo.edu)
#2023_10_11

#Notebook for ML Algorithm Selection

In [2]:
#Modules

#Python 3.11.2
import numpy as np #1.26.0
import pandas as pd #1.5.3
from sklearn.model_selection import train_test_split #1.3.0

In [3]:
#Part 1 - Algorithm selection for regression

In [4]:
#Wine color shall be chosen based on which dataset has larger N

df_red = pd.read_csv("data/winequality-red.csv", sep=";")
df_white = pd.read_csv("data/winequality-white.csv", sep=";")

print("N for red wine: {0}".format(len(df_red)))
print("N for white wine: {0}".format(len(df_white)))

N for red wine: 1599
N for white wine: 4898


In [5]:
#All algorithms up for selection will now be evaluated on the white wine dataset

In [6]:
#Next the white wine data will be split into training and test sets
#For fairness in algorithm selection, these will be the same sets used to train and test all models
all_white_wine = df_white.to_numpy()
train, test = train_test_split(all_white_wine, test_size=0.2, random_state=0) #Remember - also shuffles
X_train = train[:, 0:-1] #Features
y_train = train[:, -1] #Target
X_test = test[:, 0:-1] #Features
y_test = test[:, -1] #Target

In [7]:
#The performance metric here will be accuracy defined by rounding the regression score to the nearest whole number,
#if that rounded number matches the quality, it is considered correct.
def evaluate_regressions(predictions:np.ndarray, y_test:np.ndarray) -> float:
    """
    predictions is numpy array of regression values predicted by a model
    returns the percentage of these rounded values that matched that real wine quality
    """
    predictions = np.round(predictions)
    matches = predictions == y_test
    return matches.sum()/len(matches) * 100 #Percent correct

In [8]:
#First a baseline "model" will be created
#This will essentially be a model that merely predicts the dataset mode (most frequent value)
from scipy import stats #1.10.1
mode = stats.mode(y_test).mode[0] #Just the mode of the test set
y_base = np.array([mode for sample in X_test])
acc_base = evaluate_regressions(y_base, y_test)
print("Baseline accuracy for regression: {0:.3f}%".format(acc_base))

Baseline accuracy for regression: 41.735%


  mode = stats.mode(y_test).mode[0] #Just the mode of the test set


In [9]:
#The above percentage is what every model should seek to beat

In [10]:
#Model 1 - linear model
from sklearn.linear_model import LinearRegression

#Training
LR_model = LinearRegression()
LR_model.fit(X_train, y_train)

#Evaluation
y_pred_LR = LR_model.predict(X_test)
acc_LR = evaluate_regressions(y_pred_LR, y_test)
print("Linear model's accuracy for regression: {0:.3f}%".format(acc_LR))

Linear model's accuracy for regression: 48.163%


In [11]:
#Model 2 - Decision Tree
from sklearn.tree import DecisionTreeRegressor

#Training
DTR_model = DecisionTreeRegressor(random_state=0)
DTR_model.fit(X_train, y_train)

#Evaluation
y_pred_DTR = DTR_model.predict(X_test)
acc_DTR = evaluate_regressions(y_pred_DTR, y_test)
print("Decision tree's accuracy for regression: {0:.3f}%".format(acc_DTR))

Decision tree's accuracy for regression: 57.959%


In [12]:
#Model 3 - Random Forest
from sklearn.ensemble import RandomForestRegressor

#Training
RFR_model = RandomForestRegressor(random_state=0)
RFR_model.fit(X_train, y_train)

#Evaluation
y_pred_RFR = RFR_model.predict(X_test)
acc_RFR = evaluate_regressions(y_pred_RFR, y_test)
print("Random Forest's accuracy for regression: {0:.3f}%".format(acc_RFR))

Random Forest's accuracy for regression: 63.673%


In [13]:
#Model 4 - Generalized Additive Model
from pygam import LinearGAM #0.9.0
from pygam import s #This is the smoothing function (cubic spline) to be used for continuous features

#Training
GAM_model = LinearGAM(
    s(0) + 
    s(1) +
    s(2) +
    s(3) +
    s(4) +
    s(5) +
    s(6) +
    s(7) +
    s(8) +
    s(9),
    n_splines=50#Each feature is allowed a max amt of this many splines
)
GAM_model.fit(X_train, y_train)

#Evaluation
y_pred_GAM = GAM_model.predict(X_test)
acc_GAM = evaluate_regressions(y_pred_GAM, y_test)
print("GAM's accuracy for regression: {0:.3f}%".format(acc_GAM))

GAM's accuracy for regression: 50.816%


In [23]:
#Part 2 - Algorithm selection for classification

In [15]:
#Construct Classification Dataset

#Target must now become red or white, not the score
df_red = df_red.drop(columns="quality")
df_red['color'] = 0 #0 means red
df_white = df_white.drop(columns="quality")
df_white['color'] = 1 #1 means white

all_data = np.vstack((df_red.to_numpy(), df_white.to_numpy()))

train, test = train_test_split(all_data, test_size=0.2, random_state=0)
X_train = train[:, 0:-1] #Features
y_train = train[:, -1] #Target
X_test = test[:, 0:-1] #Features
y_test = test[:, -1] #Target

In [16]:
#The performance metric here will be accuracy defined by the amount of correct classifications divided by the total.
def evaluate_classifications(predictions:np.ndarray, y_test:np.ndarray) -> float:
    """
    predictions is numpy array of classification values predicted by a model
    returns the percentage of these values that matched that real wine color
    """
    matches = predictions == y_test
    return matches.sum()/len(matches) * 100 #Percent correct

In [17]:
#Model 1 - Support Vector Classifier
from sklearn.svm import SVC

#Training
SVC_model = SVC(random_state=0)
SVC_model.fit(X_train, y_train)

#Evaluation
y_pred_SVC = SVC_model.predict(X_test)
acc_SVC = evaluate_classifications(y_pred_SVC, y_test)
print("SVC's accuracy for classification: {0:.3f}%".format(acc_SVC))

SVC's accuracy for classification: 93.538%


In [18]:
#Model 2 - Logistic regression
from sklearn.linear_model import LogisticRegression

#Training
Log_model = LogisticRegression(random_state=0)
Log_model.fit(X_train, y_train)

#Evaluation
y_pred_Log = Log_model.predict(X_test)
acc_Log = evaluate_classifications(y_pred_Log, y_test)
print("Logistic regression accuracy for classification: {0:.3f}%".format(acc_Log))

Logistic regression accuracy for classification: 98.692%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
#Model 3 - Decision Tree classifier
from sklearn.tree import DecisionTreeClassifier

#Training
DTC_model = DecisionTreeClassifier(random_state=0)
DTC_model.fit(X_train, y_train)

#Evaluation
y_pred_DTC = DTC_model.predict(X_test)
acc_DTC = evaluate_classifications(y_pred_DTC, y_test)
print("Decision tree accuracy for classification: {0:.3f}%".format(acc_DTC))

Decision tree accuracy for classification: 98.231%


In [20]:
#Model 4 - K-nearest neighbor
from sklearn.neighbors import KNeighborsClassifier

#Training
KNN_model = KNeighborsClassifier()
KNN_model.fit(X_train, y_train)

#Evaluation
y_pred_KNN = KNN_model.predict(X_test)
acc_KNN = evaluate_classifications(y_pred_KNN, y_test)
print("KNN accuracy for classification: {0:.3f}%".format(acc_KNN))

KNN accuracy for classification: 94.923%


In [21]:
#Model 5 - Naive Bayes
from sklearn.naive_bayes import GaussianNB

#Training
GNB_model = GaussianNB()
GNB_model.fit(X_train, y_train)

#Evaluation
y_pred_GNB = GNB_model.predict(X_test)
acc_GNB = evaluate_classifications(y_pred_GNB, y_test)
print("Gaussian Naive Bayes accuracy for classification: {0:.3f}%".format(acc_GNB))

Gaussian Naive Bayes accuracy for classification: 97.615%


In [22]:
#Model 6 - Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

#Training
RFC_model = RandomForestClassifier(random_state=0)
RFC_model.fit(X_train, y_train)

#Evaluation
y_pred_RFC = RFC_model.predict(X_test)
acc_RFC = evaluate_classifications(y_pred_RFC, y_test)
print("Random Forest's accuracy for classification: {0:.3f}%".format(acc_RFC))

Random Forest's accuracy for classification: 99.538%
