# Predicting Wine Quality Score

**Authors**: Xander Dawson, Zackarya Hamza, Sid Ahuja

In [7]:
library(tidyverse, quietly = TRUE)
library(tidymodels, quietly = TRUE)
# add any used packages here and delete this line before submitting report

## Summary

blank

## Introduction

blank

## Methods and Results

### Data

Discuss Data Here

### Analysis

Discuss analysis methods here

In [2]:
# data downloaded as zip from UCI Machine Learning Repository and unzipped file is moved to /data
url <- "https://archive.ics.uci.edu/static/public/186/wine+quality.zip"
temp <- tempfile()
download.file(url, temp)
unzip(temp, exdir = "./data/raw")

In [3]:
# preprocessing data
# read csv file and replace column header
white_wine <- read_delim("data/raw/winequality-white.csv", delim = ";", show_col_types = FALSE)
colnames(white_wine) <- c("fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", 
                          "chlorides", "free_sulfur_dioxide", "total_sulfur_dioxide", "density", 
                          "pH", "sulphates", "alcohol", "quality_score")

# changing quality_score to quality_category
white_wine <- white_wine %>% 
    mutate(quality_category = case_when(
        quality_score <= 3 ~ "poor",
        quality_score <= 7 ~ "average",
        quality_score <= 10 ~ "excellent")) %>%
    select(-quality_score)

head(white_wine)

fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality_category
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,average
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,average
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,average
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,average
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,average
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,average


In [4]:
# creating the train, test and cross-validation splits
set.seed(123)
wine_split <- initial_split(white_wine, prop = 0.70, strata = quality_category)
wine_train <- training(wine_split)
wine_test <- testing(wine_split)

write_csv(wine_train, "data/processed/white_wine_train.csv")
write_csv(wine_test, "data/processed/white_wine_test.csv")

wine_vfold <- vfold_cv(wine_train, v = 5, strata = quality_category)

In [5]:
# feature scaling recipe
wine_preprocess_recipe <- recipe(quality_category ~ ., data = wine_train) %>%
    step_scale(all_predictors()) %>%
    step_center(all_predictors())

prep_preprocess_recipe <- prep(wine_preprocess_recipe, training = wine_train)

scaled_wine_train <- bake(prep_preprocess_recipe, new_data = wine_train)
scaled_wine_test <- bake(prep_preprocess_recipe, new_data = wine_test)

write_csv(scaled_wine_train, "data/processed/scaled_white_wine_train.csv")
write_csv(scaled_wine_test, "data/processed/scaled_white_wine_test.csv")

# note for model training

In [6]:
# note for model training: use wine_preprocess_recipe (don't use prep_preprocess_recipe)

## Discussion

blank

## References

blank