In [24]:
import pandas as pd # para manipulação e limpeza dos dados
import numpy as np  # operações matemáticas e com vetores

from datetime import datetime # módulo para uso de ferramentas de data e hora

import matplotlib.pyplot as plt # gráficos
import seaborn as sns           # gráficos e ferramentas de estatísticas
sns.set_style('whitegrid')

# configurações padrão para os gráficos


plt.rc('figure', figsize=(10,8))
plt.style.use('ggplot')

# Plotly umas das bibliotecas mais incríveis para dataviz ! Irei usá-la neste caso devido a escala dos valores do dataset (na casa de centenas
# de milhões, essa biblioteca torna a leitura desses valores muito mais clara) 

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import cufflinks
cufflinks.go_offline()

# Setups
pd.set_option('display.float_format', lambda x: '%.3f' % x)
sns.set(style='white', context='notebook', palette='deep')
import warnings
warnings.filterwarnings('ignore')
sns.set_style('white')
%matplotlib inline


ModuleNotFoundError: No module named 'cufflinks'

In [4]:
# Read the two raw datasets; both files are parsed with ';' as the separator.
red_wine, white_wine = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in ("winequality-red.csv", "winequality-white.csv")
)


In [5]:
# Preview the first rows of the red-wine data (head() defaults to 5 rows).
red_wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [6]:
# Preview the first rows of the white-wine data (head() defaults to 5 rows).
white_wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [12]:
# Normalise the column names: every space becomes an underscore
# (e.g. "fixed acidity" -> "fixed_acidity"). regex=False keeps the
# replacement a plain literal substitution.

red_wine.columns = red_wine.columns.str.replace(" ", "_", regex=False)
white_wine.columns = white_wine.columns.str.replace(" ", "_", regex=False)

In [13]:
# Build the colour labels: one constant string per row of each frame.
# color_red is only created here; it is attached to red_wine in the next cell.

color_white = np.full(len(white_wine), "White")
color_red = np.full(len(red_wine), 'Red')

white_wine['color'] = color_white
white_wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,White
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,White
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.995,3.26,0.44,10.1,6,White
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.996,3.19,0.4,9.9,6,White
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.996,3.19,0.4,9.9,6,White


In [14]:
# Attach the 'Red' label array (color_red, built in the previous cell) as a new
# 'color' column, then preview the result.
red_wine['color'] = color_red
red_wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.998,3.51,0.56,9.4,5,Red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.997,3.2,0.68,9.8,5,Red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,Red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,Red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.998,3.51,0.56,9.4,5,Red


In [15]:
# Stack the red and white frames into a single dataset (red rows first).
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent. ignore_index=True discards the
# per-file indices and renumbers the rows 0..n-1.
df_wine = pd.concat([red_wine, white_wine], ignore_index=True)

df_wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.998,3.51,0.56,9.4,5,Red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.997,3.2,0.68,9.8,5,Red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,Red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,Red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.998,3.51,0.56,9.4,5,Red


In [16]:
# Show the last rows of the combined frame (tail() defaults to 5 rows).
df_wine.tail()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.991,3.27,0.5,11.2,6,White
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.995,3.15,0.46,9.6,5,White
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.993,2.99,0.46,9.4,6,White
6495,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.989,3.34,0.38,12.8,7,White
6496,6.0,0.21,0.38,0.8,0.02,22.0,98.0,0.989,3.26,0.32,11.8,6,White


In [23]:
# Save the new (combined) dataset to disk.
# NOTE(review): df_wine_complete is bound to the same object as df_wine; no
# copy is made, so later changes to one are visible through the other.
# NOTE(review): the target name has no ".csv" extension and to_csv runs with
# its defaults, which write the row index as the first column; confirm both
# are what downstream readers expect.

df_wine_complete = df_wine
df_wine_complete.to_csv("df_wine_complete")