In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Fetch the white-wine quality data from the UCI repository.
# NOTE: the remote file is semicolon-delimited, so with the default comma
# separator each row is loaded as one single string column; the cells
# below split that column by hand.
dataset = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
)
# Keep a local copy of the data exactly as it was loaded
dataset.to_csv("../data/wine-quality-original.csv", index=False)

# Preview the first 5 rows of the table
dataset.head(5)

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
1,6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9...
2,8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;1...
3,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...
4,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...


In [3]:
# The lone header cell holds every column name joined by ';' and wrapped in
# double quotes: break it apart, then drop the quotes from each name.
columns = [name.replace('"', "") for name in dataset.columns[0].split(";")]
columns

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [4]:
# Parse every semicolon-delimited row string into a row of floats and
# stack the rows into one 2-D array.
values = np.array(
    [np.fromstring(row, sep=';') for row in dataset.iloc[:, 0]]
)
values[:2]

array([[7.000e+00, 2.700e-01, 3.600e-01, 2.070e+01, 4.500e-02, 4.500e+01,
        1.700e+02, 1.001e+00, 3.000e+00, 4.500e-01, 8.800e+00, 6.000e+00],
       [6.300e+00, 3.000e-01, 3.400e-01, 1.600e+00, 4.900e-02, 1.400e+01,
        1.320e+02, 9.940e-01, 3.300e+00, 4.900e-01, 9.500e+00, 6.000e+00]])

In [5]:
# Build the DataFrame from the parsed value array, labelling it with the
# column names recovered from the header.
wine_dataset = pd.DataFrame(data=values, columns=columns)
wine_dataset.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6.0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6.0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0


In [6]:
# Hold out 15% of the table as a test set to be used to evaluate the model.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    wine_dataset.drop(columns=["quality"]),
    wine_dataset["quality"],
    test_size=0.15,
    # Stratify on the label so both splits keep the same quality proportions.
    stratify=wine_dataset["quality"],
    # Fixed seed: without it every Restart & Run All produces a different
    # split, and therefore different files in the save step.
    random_state=42,
)
# Re-attach the label so the training table carries features and target
# together. `assign` returns a new frame (aligned on the index) instead of
# writing into the slice returned by train_test_split, which can trigger
# pandas' SettingWithCopyWarning.
x_train = x_train.assign(quality=y_train)
y_test.value_counts()

6.0    330
5.0    219
7.0    132
8.0     26
4.0     24
3.0      3
9.0      1
Name: quality, dtype: int64

In [7]:
# Write the splits to disk. The row index is never written (index=False),
# so the shuffled index left by the split needs no resetting beforehand.
x_train.to_csv("../data/wine-model-training-data.csv", index=False)  # features + quality label
x_test.to_csv("../data/wine-testing-data.csv", index=False)          # test features only
y_test.to_csv("../data/wine-quality.csv", index=False)               # test labels