## Cmput 466 project


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

split_ratio = 0.4

attribute = ["fixed acidity","volatile acidity","citric acid","residual sugar","chlorides","free sulfur dioxide","total sulfur dioxide","density","pH","sulphates","alcohol"]
point = 'quality'

def getData(wine_type):
    if wine_type == "red":
        dataset = pd.read_csv("winequality-red.csv")
        dataset = dataset.sample(frac=1)
        
        X_all = dataset.drop("quality", axis=1)
        y_all = dataset['quality']
         
        X_train, X_temp, y_train, y_temp = train_test_split(X_all, y_all, test_size=0.3, random_state=42)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    elif wine_type == "white":
        dataset = pd.read_csv("winequality-white.csv")
        dataset = dataset.sample(frac=1)
        X_all = dataset[attribute].values.reshape(-1, len(attritube))
        y_all = dataset[point].values

    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X_all, y_all, test_size=0.2, random_state=42
    )
    
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.25, random_state=42
    )
    return X_train, X_val, X_test, y_train, y_val, y_test


def accuracy(pred, y_test):
    correct = sum(1 for p, y in zip(pred, y_test) if p == y)
    acc = correct / len(pred)
    return acc

def close_accuracy(pred, y_test):
    close = sum(1 for p, y in zip(pred, y_test) if p == y or p + 1 == y or p - 1 == y)
    acc = close / len(pred)
    return acc

## Linear regression

In [2]:
import numpy as np
import math
import utils
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from matplotlib import pyplot as plt

def train_model(X_train, y_train):
    LR = LinearRegression()
    LR.fit(X_train, y_train)
    return LR

def evaluate_model(model, X, y, dataset_name):
    pred = model.predict(X)
    pred_rounded = np.rint(pred)
    y_rounded = np.rint(np.array(y))

    mse = mean_squared_error(y_rounded, pred_rounded)
    mae = mean_absolute_error(y_rounded, pred_rounded)
    accuracy = utils.accuracy(pred_rounded, y_rounded)

    print(f"{dataset_name} Evaluation:")
    print(f"Mean squared error: {mse}")
    print(f"Mean absolute error: {mae}")
    print(f"Accuracy: {accuracy}")
    if dataset_name == "Testing":
        print("Close Accuracy (+-1 score): " + str(utils.close_accuracy(pred_rounded, y_rounded)))

def main():
    X_train, X_val, X_test, y_train, y_val, y_test = utils.getData("red")
    
    # Training
    model = train_model(X_train, y_train)
    pred_train = model.predict(X_train)
    pred_train_rounded = np.rint(pred_train)
    y_train_rounded = np.rint(np.array(y_train))
    
    print("Training:")
    evaluate_model(model, X_train, y_train, "Training")

    # Validation
    best_err = float('inf')
    best_model = None
    num_batches = 10  # Replace with your actual number of batches
    batch_size = len(X_val) // num_batches

    for i in range(num_batches):
        X_batch = X_val[i * batch_size:(i + 1) * batch_size]
        y_batch = y_val[i * batch_size:(i + 1) * batch_size]

        model = train_model(X_batch, y_batch)
        pred_val = model.predict(X_batch)
        pred_val_rounded = np.rint(pred_val)
        y_batch_rounded = np.rint(np.array(y_batch))

        err = mean_squared_error(y_batch_rounded, pred_val_rounded)
        if err < best_err:
            best_err = err
            best_model = model

    # Testing
    print("\nTesting:")
    evaluate_model(best_model, X_test, y_test, "Testing")

if __name__ == "__main__":
    main()

Training:
Training Evaluation:
Mean squared error: 0.5109489051094891
Mean absolute error: 0.44421272158498437
Accuracy: 0.5881126173096975

Testing:
Testing Evaluation:
Mean squared error: 0.690625
Mean absolute error: 0.528125
Accuracy: 0.5375
Close Accuracy (+-1 score): 0.946875


## logistic regression

In [3]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
import utils

def train_logistic_regression(X, y):
    logreg = LogisticRegression(max_iter=10000)  # Increase max_iter to avoid convergence warning
    logreg.fit(X, y)
    return logreg

def evaluate_logistic_regression(model, X, y, dataset_name):
    pred_prob = model.predict_proba(X)[:, 1]  # Probability of the positive class
    pred_binary = np.round(pred_prob)
    
    mse = mean_squared_error(y, pred_binary)
    mae = mean_absolute_error(y, pred_binary)
    accuracy = accuracy_score(y, pred_binary)

    print(f"{dataset_name} Evaluation:")
    print(f"Mean squared error: {mse}")
    print(f"Mean absolute error: {mae}")
    print(f"Accuracy: {accuracy}")

def main():
    X_train, X_val, X_test, y_train, y_val, y_test = utils.getData("red")
    
    # Training Logistic Regression
    logreg_model = train_logistic_regression(X_train, y_train)
    pred_train_prob = logreg_model.predict_proba(X_train)[:, 1]  # Probability of the positive class
    pred_train_binary = np.round(pred_train_prob)
    
    print("Training:")
    evaluate_logistic_regression(logreg_model, X_train, y_train, "Training")

    # Validation
    best_err = float('inf')
    best_logreg_model = None
    num_batches = 10  # Replace with your actual number of batches
    batch_size = len(X_val) // num_batches

    for i in range(num_batches):
        X_batch = X_val[i * batch_size:(i + 1) * batch_size]
        y_batch = y_val[i * batch_size:(i + 1) * batch_size]

        logreg_model = train_logistic_regression(X_batch, y_batch)
        pred_val_prob = logreg_model.predict_proba(X_batch)[:, 1]  # Probability of the positive class
        pred_val_binary = np.round(pred_val_prob)

        err = mean_squared_error(y_batch, pred_val_binary)
        if err < best_err:
            best_err = err
            best_logreg_model = logreg_model

    # Testing
    print("\nTesting:")
    evaluate_logistic_regression(best_logreg_model, X_test, y_test, "Testing")

if __name__ == "__main__":
    main()
    

Training:
Training Evaluation:
Mean squared error: 32.24504692387904
Mean absolute error: 5.6193952033368095
Accuracy: 0.0

Testing:
Testing Evaluation:
Mean squared error: 29.378125
Mean absolute error: 5.321875
Accuracy: 0.0


## Polynomial Regression

In [4]:
import numpy as np
import utils
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

def main():

    # Hyperparameters
    degree = 2  
    
    # training
    X_train, X_val, X_test, y_train, y_val, y_test = utils.getData("red")

    # Polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)
    X_test_poly = poly.transform(X_test)

    # Linear regression with polynomial features
    poly_reg = LinearRegression()
    poly_reg.fit(X_train_poly, y_train)

    # Training evaluation
    pred_train = poly_reg.predict(X_train_poly)
    pred_train = np.rint(pred_train)
    y_train = np.rint(np.array(y_train))
    print("Training:")
    print("Mean squared error: " + str(mean_squared_error(y_train, pred_train)))
    print("Mean absolute error: " + str(mean_absolute_error(y_train, pred_train)))
    print("Accuracy: " + str(utils.accuracy(pred_train, y_train)))

    # Validation evaluation
    pred_val = poly_reg.predict(X_val_poly)
    pred_val = np.rint(pred_val)
    y_val_rounded = np.rint(np.array(y_val))
    print("\nValidation:")
    print("Mean squared error: " + str(mean_squared_error(y_val_rounded, pred_val)))
    print("Mean absolute error: " + str(mean_absolute_error(y_val_rounded, pred_val)))
    print("Accuracy: " + str(utils.accuracy(pred_val, y_val)))

    # Testing evaluation
    pred_test = poly_reg.predict(X_test_poly)
    pred_test = np.rint(pred_test)
    y_test_rounded = np.rint(np.array(y_test))
    print("\nTesting:")
    print("Mean squared error: " + str(mean_squared_error(y_test_rounded, pred_test)))
    print("Mean absolute error: " + str(mean_absolute_error(y_test_rounded, pred_test)))
    print("Accuracy: " + str(utils.accuracy(pred_test, y_test)))
    print("Close Accuracy (+-1 score): " + str(utils.close_accuracy(pred_test, y_test)))

if __name__ == "__main__":
    main()

Training:
Mean squared error: 0.42752867570385816
Mean absolute error: 0.3920750782064651
Accuracy: 0.6256517205422315

Validation:
Mean squared error: 0.690625
Mean absolute error: 0.453125
Accuracy: 0.615625

Testing:
Mean squared error: 0.621875
Mean absolute error: 0.478125
Accuracy: 0.58125
Close Accuracy (+-1 score): 0.953125
