In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the red-wine quality data from the UCI repository.
# The file is semicolon-delimited; `sep` is passed by keyword because
# pandas 2.0+ only accepts the file path as a positional argument.
red_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    sep=';',
)

In [4]:
# Load the white-wine quality data from the UCI repository.
# The file is semicolon-delimited; `sep` is passed by keyword because
# pandas 2.0+ only accepts the file path as a positional argument.
white_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv',
    sep=';',
)

# Data preparation and analysis

In [5]:
# There are two separate datasets, one for red and one for white wine, both produced by a winery
# in the same region where the consulting company wishes to establish itself; hence this analysis.
# The first step is to merge the data, adding a new categorical variable that identifies the type of wine:
# "red" or "white".

print(red_wine)

      fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
0               7.4             0.700         0.00  ...       0.56      9.4        5
1               7.8             0.880         0.00  ...       0.68      9.8        5
2               7.8             0.760         0.04  ...       0.65      9.8        5
3              11.2             0.280         0.56  ...       0.58      9.8        6
4               7.4             0.700         0.00  ...       0.56      9.4        5
...             ...               ...          ...  ...        ...      ...      ...
1594            6.2             0.600         0.08  ...       0.58     10.5        5
1595            5.9             0.550         0.10  ...       0.76     11.2        6
1596            6.3             0.510         0.13  ...       0.75     11.0        6
1597            5.9             0.645         0.12  ...       0.71     10.2        5
1598            6.0             0.310         0.47  ...       0.6

In [6]:
# Add a 'wine_type' column ("red" here, "white" for the other frame) so that both
# frames can later be concatenated into ONE dataframe without losing their origin.

red_wine = red_wine.assign(wine_type='red')

In [7]:
# Tag every white-wine row with its wine type.
white_wine = white_wine.assign(wine_type='white')

In [8]:
# Both datasets already provide a quality score for each wine produced.
# Per the original notes the scale runs from 1 to 9, where 9 is the highest and 1 the lowest quality.
# NOTE(review): the counts below only show scores 3-8 for red wine; confirm the scale bounds
# against the dataset documentation.
# To simplify the analysis and the creation of a prediction model, the quality scores are regrouped
# into "low", "medium" and "high" quality labels.

red_wine['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [9]:
# Bucket the numeric quality scores into qualitative labels:
#   score <= 5 -> 'low', score <= 7 -> 'medium', anything higher -> 'high'.

red_wine['quality_label'] = red_wine['quality'].apply(
    lambda score: 'low' if score <= 5 else ('medium' if score <= 7 else 'high')
)

In [10]:
# Store the red-wine labels as a categorical dtype with an explicit
# category list: low, medium, high.
quality_levels = pd.CategoricalDtype(categories=['low', 'medium', 'high'])
red_wine['quality_label'] = red_wine['quality_label'].astype(quality_levels)

In [11]:
# Bucket the white-wine quality scores into qualitative labels:
#   score <= 5 -> 'low', score <= 7 -> 'medium', anything higher -> 'high'.
white_wine['quality_label'] = white_wine['quality'].apply(
    lambda score: 'low' if score <= 5 else ('medium' if score <= 7 else 'high')
)

In [12]:
# Store the white-wine labels as a categorical dtype with an explicit
# category list: low, medium, high.
quality_levels = pd.CategoricalDtype(categories=['low', 'medium', 'high'])
white_wine['quality_label'] = white_wine['quality_label'].astype(quality_levels)

# We convert the new column from an object dtype to a categorical dtype with an explicit category list,
# so that the labels are not sorted alphabetically. The categorical is left unordered, so all labels
# are treated as having "the same value".

In [13]:
# Combine the two datasets into one: "wines".
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# carrying over the overlapping row labels of the two source frames.

wines = pd.concat([red_wine, white_wine], ignore_index=True)

In [14]:
# Shuffle all records (fixed seed, so the order is reproducible),
# then renumber the index from zero.
shuffled = wines.sample(frac=1, random_state=42)
wines = shuffled.reset_index(drop=True)

In [15]:
# Check, column by column, whether any value is missing.
wines.isna().any()


fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
wine_type               False
quality_label           False
dtype: bool

In [16]:
# Total number of missing values across the whole frame.
wines.isna().sum().sum()

0

In [19]:
# Persist each prepared dataframe, and the new combined one, to CSV.
# The row index is written as well (the to_csv default, made explicit here).
red_wine.to_csv('red_wine.csv', index=True)


In [20]:
# Persist the prepared white-wine dataframe; the row index is written too (to_csv default).
white_wine.to_csv('white_wine.csv', index=True)

In [None]:
# Persist the combined, shuffled dataframe; the row index is written too (to_csv default).
wines.to_csv('wines.csv', index=True)