# EDA Wines

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Show every column when a wide DataFrame is displayed.
# The fully qualified option name is required: the bare 'max_columns' pattern
# matches more than one option in recent pandas releases (e.g. also
# 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', None)

In [14]:
# Read the red-wine file (fields are separated by ';') and preview
# the first five rows.
df_wr = pd.read_csv('winequality-red.csv', delimiter=';')
df_wr.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [15]:
# Read the white-wine file (fields are separated by ';') and preview
# the first five rows.
df_ww = pd.read_csv('winequality-white.csv', delimiter=';')
df_ww.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [16]:
# Report (rows, columns) of each raw frame before they are combined.
for label, frame in (
    ("Dimensiones del dataset de vino rojo: ", df_wr),
    ("Dimensiones del dataset de vino blanco: ", df_ww),
):
    print(label, frame.shape)

Dimensiones del dataset de vino rojo:  (1599, 12)
Dimensiones del dataset de vino blanco:  (4898, 12)


| Variables | Descripción |
|-----------|--------------|
|fixed acidity| most acids involved with wine are fixed or nonvolatile (do not evaporate readily).|
|volatile acidity| the amount of acetic acid in wine, which at too high of levels can lead to an unpleasant, vinegar taste.|
|citric acid| found in small quantities, citric acid can add ‘freshness’ and flavor to wines.|
|residual sugar| the amount of sugar remaining after fermentation stops, it’s rare to find wines with less than 1 gram/liter and wines with greater than 45 grams/liter are considered sweet.|
|chlorides| the amount of salt in the wine.|
|free sulfur dioxide| the free form of SO2 exists in equilibrium between molecular SO2 (as a dissolved gas) and bisulfite ion; it prevents microbial growth and the oxidation of wine.|
|total sulfur dioxide| amount of free and bound forms of SO2; in low concentrations, SO2 is mostly undetectable in wine, but at free SO2 concentrations over 50 ppm, SO2 becomes evident in the nose and taste of wine.|
|density| the density of wine is close to that of water, depending on the percent alcohol and sugar content.|
|pH| describes how acidic or basic a wine is on a scale from 0 (very acidic) to 14 (very basic); most wines are between 3-4 on the pH scale.|
|sulphates| a wine additive which can contribute to sulfur dioxide gas (SO2) levels, which acts as an antimicrobial and antioxidant.|
|alcohol| the percent alcohol content of the wine.|

### Vamos a combinar ambos datasets, así que antes de hacerlo, creamos una nueva columna en cada uno que indique qué tipo de vino es: Rojo o Blanco

In [17]:
# Label every row of each source frame with its wine type so the origin
# of a row is still known once the two frames are combined.
for frame, wine_type in ((df_wr, 'Red'), (df_ww, 'White')):
    frame['type'] = wine_type

In [19]:
# First row of the white-wine frame, to confirm the new 'type' column.
df_ww.iloc[:1]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,White


In [21]:
# First row of the red-wine frame, to confirm the new 'type' column.
df_wr.iloc[:1]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,Red


In [29]:
# Stack the white-wine rows followed by the red-wine rows into one frame;
# ignore_index=True discards the per-file row labels and renumbers from 0.
df_wines = pd.concat(objs=[df_ww, df_wr], axis=0, ignore_index=True)

In [30]:
# First five rows of the combined frame (white wines come first).
df_wines.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,White
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,White
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,White
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,White
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,White


In [31]:
# Last five rows of the combined frame (red wines come last).
df_wines.tail(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
6492,6.2,0.6,0.08,2.0,0.09,32.0,44.0,0.9949,3.45,0.58,10.5,5,Red
6493,5.9,0.55,0.1,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,Red
6494,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,Red
6495,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,Red
6496,6.0,0.31,0.47,3.6,0.067,18.0,42.0,0.99549,3.39,0.66,11.0,6,Red


In [32]:
# (rows, columns) of the combined frame.
df_wines.shape

(6497, 13)

### Ahora contamos con un dataset de 6497 filas y 13 columnas; recordemos que se agregó una columna adicional para diferenciar el tipo de vino: Rojo o Blanco.

In [33]:
# Column dtypes, non-null counts and memory footprint of the combined frame.
df_wines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  type                  6497 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB


In [34]:
# Store the wine-type label as a pandas categorical instead of generic
# object strings (the column holds only the labels assigned earlier).
df_wines['type'] = df_wines['type'].astype('category')

In [35]:
# Number of missing values per column.
df_wines.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
type                    0
dtype: int64

#### No cuenta con datos nulos

In [37]:
# Rows that exactly repeat an earlier row across all columns; the first
# occurrence of each repeated row is not flagged.
is_repeat = df_wines.duplicated(keep='first')
df_wines.loc[is_repeat]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,White
5,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,White
7,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,White
8,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,White
20,6.2,0.660,0.48,1.2,0.029,29.0,75.0,0.98920,3.33,0.39,12.8,8,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6461,7.2,0.695,0.13,2.0,0.076,12.0,20.0,0.99546,3.29,0.54,10.1,5,Red
6462,7.2,0.695,0.13,2.0,0.076,12.0,20.0,0.99546,3.29,0.54,10.1,5,Red
6465,7.2,0.695,0.13,2.0,0.076,12.0,20.0,0.99546,3.29,0.54,10.1,5,Red
6479,6.2,0.560,0.09,1.7,0.053,24.0,32.0,0.99402,3.54,0.60,11.3,5,Red


#### Cuenta con 1177 datos duplicados. Procedemos a eliminarlos, manteniendo una de las filas.

In [41]:
# Keep only the first occurrence of each fully identical row.
# The surviving rows keep their original index labels (no reset here).
df_wines = df_wines[~df_wines.duplicated(keep='first')]

In [42]:
# (rows, columns) after removing the duplicated rows.
df_wines.shape

(5320, 13)

#### Se observa que se redujo la cantidad de filas, de 6497 a 5320. Los 1177 datos duplicados fueron eliminados.

## Al eliminar las filas duplicadas me surgió la siguiente pregunta: ¿Existirá un vino rojo y un vino blanco con las mismas características?

In [44]:
# Column labels of the frame, used to pick the columns compared below.
df_wines.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'type'],
      dtype='object')

In [50]:
# Flag rows whose values match an earlier row on every column except the
# 'type' label, i.e. wines of different type with identical measurements
# and quality (full-row duplicates were already removed above).
# The column list is derived from the frame rather than hand-copied, so it
# cannot drift out of sync with the actual columns.
feature_cols = [col for col in df_wines.columns if col != 'type']
df_wines[df_wines.duplicated(subset=feature_cols)]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
5734,6.7,0.28,0.28,2.4,0.012,36.0,100.0,0.99064,3.26,0.39,11.7,7,Red
6029,5.9,0.19,0.21,1.7,0.045,57.0,135.0,0.99341,3.32,0.44,9.5,5,Red


In [62]:
# Show every wine that shares all non-'type' values with row 5734, the first
# red wine flagged by the cross-type duplicate check.
# The comparison uses the values stored in that row instead of twelve
# hand-typed float literals: this avoids exact-equality misses when a typed
# literal and the parsed CSV value differ in their last bit, and removes the
# copy-pasted chain of conditions.
feature_cols = [col for col in df_wines.columns if col != 'type']
reference_row = df_wines.loc[5734, feature_cols]
matches_reference = (
    df_wines[feature_cols]
    .eq(reference_row, axis='columns')  # align reference values by column label
    .all(axis='columns')                # a row matches only if every column matches
)
df_wines[matches_reference]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
2631,6.7,0.28,0.28,2.4,0.012,36.0,100.0,0.99064,3.26,0.39,11.7,7,White
5734,6.7,0.28,0.28,2.4,0.012,36.0,100.0,0.99064,3.26,0.39,11.7,7,Red


In [63]:
# Show every wine that shares all non-'type' values with row 6029, the second
# red wine flagged by the cross-type duplicate check.
# The comparison uses the values stored in that row instead of twelve
# hand-typed float literals: this avoids exact-equality misses when a typed
# literal and the parsed CSV value differ in their last bit, and removes the
# copy-pasted chain of conditions.
feature_cols = [col for col in df_wines.columns if col != 'type']
reference_row = df_wines.loc[6029, feature_cols]
matches_reference = (
    df_wines[feature_cols]
    .eq(reference_row, axis='columns')  # align reference values by column label
    .all(axis='columns')                # a row matches only if every column matches
)
df_wines[matches_reference]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
3374,5.9,0.19,0.21,1.7,0.045,57.0,135.0,0.99341,3.32,0.44,9.5,5,White
6029,5.9,0.19,0.21,1.7,0.045,57.0,135.0,0.99341,3.32,0.44,9.5,5,Red


#### Demostramos que sí pueden existir tanto vinos blancos como vinos rojos con las mismas características; por consiguiente, podríamos pensar que, al clasificar un vino según su tipo, puede haber confusión por la ambigüedad.