# importando bibliotecas

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy

# carregando dados

In [3]:
#Lendo os dados com pandas 
dados = pd.read_csv("winequality-red.csv")

#exibindo os cinco primeiros do dataset
dados.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Adicionando nome ID na primeira coluna 
dados.rename_axis("ID", axis = 'columns', inplace = True)
dados.head()

ID,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


# Distribuições de frequência com variáveis quantitativas

In [6]:
#variável quantitativa do pH
dados.pH.min()

2.74

In [7]:
dados.pH.max()

4.01

In [8]:
classes = [0, 3.3, 3.6, 6.9, 7.1, 14]
labels = ['Ácido Azedo', 'Ácido Ideal', 'Ácido' , 'Neutro', 'Alcalino']

pd.cut(x = dados.pH,bins = classes,labels = labels, include_lowest = True )

0       Ácido Ideal
1       Ácido Azedo
2       Ácido Azedo
3       Ácido Azedo
4       Ácido Ideal
           ...     
1594    Ácido Ideal
1595    Ácido Ideal
1596    Ácido Ideal
1597    Ácido Ideal
1598    Ácido Ideal
Name: pH, Length: 1599, dtype: category
Categories (5, object): ['Ácido Azedo' < 'Ácido Ideal' < 'Ácido' < 'Neutro' < 'Alcalino']

In [9]:
frequencia = pd.value_counts(
    pd.cut(
        x = dados.pH,
        bins = classes,
        labels = labels,
        include_lowest = True
    )
)

percentual = pd.value_counts(
    pd.cut(
        x = dados.pH,
        bins = classes,
        labels = labels,
        include_lowest = True
    ), normalize = True
) * 100

dist_freq_pH = pd.DataFrame(
    {'Frequência': frequencia, 'Porcentagem (%)': percentual}
)

dist_freq_pH.rename_axis('pH', axis= 'columns', inplace = True)

dist_freq_pH.sort_index(ascending = True, inplace = True)

dist_freq_pH

pH,Frequência,Porcentagem (%)
Ácido Azedo,783,48.968105
Ácido Ideal,768,48.030019
Ácido,48,3.001876
Neutro,0,0.0
Alcalino,0,0.0


In [10]:
#número de registros e a quantidade de variáveis
n = dados.shape
n

(1599, 12)

In [11]:
#para pegar só o número de registros
n = dados.shape[0]
n

1599

In [12]:
#descobrir o número de classes de amplitude fixa por meio da regra de Sturges

k = 1 + (10 /3) * np.log10(n) 
k 

11.679494879154115

In [13]:
#arredondando  a variável k
k = k.round(0)
k

12.0

In [14]:
k = int(k.round(0))
k

12

In [15]:
#utilizando value_counts() para distribuição de frequencia
pd.value_counts(
    pd.cut(
        x = dados.density,
        bins = 12,
        include_lowest = True
    )
)

(0.996, 0.997]    419
(0.997, 0.998]    397
(0.995, 0.996]    272
(0.998, 0.999]    182
(0.999, 1.0]      108
(0.993, 0.995]    105
(0.992, 0.993]     45
(1.0, 1.001]       32
(0.991, 0.992]     13
(1.001, 1.003]      9
(1.003, 1.004]      9
(0.989, 0.991]      8
Name: density, dtype: int64

In [16]:
#para ordenar do menor ao maior registro e adicionar os atributos frequencia e percentual para gerar novo DataFrame
frequencia = pd.value_counts(
    pd.cut(
        x = dados.density,
        bins = 12,
        include_lowest = True
    ),
    sort = False
)

percentual = pd.value_counts(
    pd.cut(
        x = dados.density,
        bins = 12,
        include_lowest = True
    ),
    sort = False,
    normalize = True
)


dist_freq_quantitativas_amplitude_fixa = pd.DataFrame(
    {'Frequência': frequencia, 'Porcentagem (%)': percentual}
)
dist_freq_quantitativas_amplitude_fixa

Unnamed: 0,Frequência,Porcentagem (%)
"(0.989, 0.991]",8,0.005003
"(0.991, 0.992]",13,0.00813
"(0.992, 0.993]",45,0.028143
"(0.993, 0.995]",105,0.065666
"(0.995, 0.996]",272,0.170106
"(0.996, 0.997]",419,0.262039
"(0.997, 0.998]",397,0.24828
"(0.998, 0.999]",182,0.113821
"(0.999, 1.0]",108,0.067542
"(1.0, 1.001]",32,0.020013


In [17]:
dist_freq_quantitativas_amplitude_fixa.rename_axis('density', axis= 'columns', inplace = True)
dist_freq_quantitativas_amplitude_fixa

density,Frequência,Porcentagem (%)
"(0.989, 0.991]",8,0.005003
"(0.991, 0.992]",13,0.00813
"(0.992, 0.993]",45,0.028143
"(0.993, 0.995]",105,0.065666
"(0.995, 0.996]",272,0.170106
"(0.996, 0.997]",419,0.262039
"(0.997, 0.998]",397,0.24828
"(0.998, 0.999]",182,0.113821
"(0.999, 1.0]",108,0.067542
"(1.0, 1.001]",32,0.020013


# Medidas de tendencia central

In [19]:
#cálculo da média
dados.alcohol.mean()

10.422983114446502

In [20]:
dados.density.mean()

0.9967466791744831

In [21]:
dados['volatile acidity'].mean()

0.5278205128205131

In [22]:
dados['citric acid'].mean()

0.2709756097560964

In [23]:
#calculo mediana
dados.alcohol.median()

10.2

In [24]:
dados.density.median()

0.99675

In [25]:
dados['volatile acidity'].median()

0.52

In [26]:
dados['citric acid'].median()

0.26

In [27]:
n = dados.shape[1]
n

12

In [28]:
# cálculo da moda
dados.alcohol.mode()

0    9.5
Name: alcohol, dtype: float64

In [29]:
dados.density.mode()

0    0.9972
Name: density, dtype: float64