In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Load the wine-quality CSV into a DataFrame and preview its first ten rows.
dataset = pd.read_csv(filepath_or_buffer="WineQT.csv")
dataset.head(n=10)

In [None]:
# Row and column counts of the loaded frame, shown as a (rows, columns) tuple.
n_rows, n_cols = dataset.shape
(n_rows, n_cols)

In [None]:
# Print pandas' concise summary of the frame (DataFrame.info).
dataset.info()

In [None]:
# The "Id" column is not useful for the analysis, so preview the data without it.
# `dataset` itself is deliberately left unchanged: the modelling cell further
# down drops "Id" (together with "quality") on its own, and removing the column
# here would make that later drop fail.
# Only the first ten rows are shown instead of dumping the whole frame.
display(dataset.drop(columns="Id").head(10))

In [None]:
# Inspect the "quality" column:
#   1. absolute number of rows per quality score,
#   2. the same distribution normalised and formatted as percentages, which
#      makes the proportions easier to read,
#   3. the mean of every numeric column per quality score.
# The per-group means are keyed by "quality" (the column this cell is about);
# grouping by the continuous "alcohol" column would yield one row per distinct
# alcohol value instead of a per-quality summary.
display(
    dataset["quality"].value_counts(),
    dataset["quality"].value_counts(normalize=True).map("{:.1%}".format),
    dataset.groupby("quality").mean(numeric_only=True),
)

In [None]:
# Null values: count the missing entries in each column.
# A bare `isnull()` only returns a frame of booleans as large as the data,
# which cannot be read at a glance; summing it gives one number per column.
dataset.isnull().sum()

In [None]:
# Visualizations.
# Statistical measures: descriptive statistics via DataFrame.describe().
dataset.describe()

In [None]:
# Absolute count of rows per quality score.
quality_counts = dataset["quality"].value_counts()
display(quality_counts)

# Share of each quality score, formatted as a percentage string and shown
# as a one-column DataFrame.
quality_value = dataset["quality"].value_counts(normalize=True).map(
    lambda share: "{:.1%}".format(share)
)
quality_valueDF = quality_value.to_frame()
quality_valueDF

In [None]:
# Count plot: number of rows for each quality score.
sns.catplot(
    data=dataset,
    x="quality",
    kind="count",
    margin_titles=True,
)

In [None]:
# Bar chart of volatile acidity for each quality score.
plot = plt.figure(figsize=(9, 8))
sns.barplot(data=dataset, y="volatile acidity", x="quality")

# Author's reading of the chart: the higher the volatile acidity, the lower the quality.

In [None]:
# Bar chart of citric acid for each quality score.
plot = plt.figure(figsize=(9, 8))
sns.barplot(data=dataset, y="citric acid", x="quality")

# Author's reading of the chart: the higher the citric acid, the higher the quality.

In [None]:
# Bar chart of residual sugar for each quality score.
plot = plt.figure(figsize=(9, 8))
sns.barplot(data=dataset, y="residual sugar", x="quality")

In [None]:
# Bar chart of chlorides for each quality score.
plot = plt.figure(figsize=(9, 8))
sns.barplot(data=dataset, y="chlorides", x="quality")

# Author's reading of the chart: the higher the chlorides, the lower the quality.

In [None]:
# Bar chart of free sulfur dioxide for each quality score.
plot = plt.figure(figsize=(9, 8))
sns.barplot(data=dataset, y="free sulfur dioxide", x="quality")

In [None]:
# Bar chart of total sulfur dioxide for each quality score.
plot = plt.figure(figsize=(9, 8))
sns.barplot(data=dataset, y="total sulfur dioxide", x="quality")

In [None]:
# Bar chart of density for each quality score.
plot = plt.figure(figsize=(9, 8))
sns.barplot(data=dataset, y="density", x="quality")

In [None]:
# Bar chart of pH for each quality score.
plot = plt.figure(figsize=(9, 8))
sns.barplot(data=dataset, y="pH", x="quality")

In [None]:
# Bar chart of sulphates for each quality score.
plot = plt.figure(figsize=(9, 8))
sns.barplot(data=dataset, y="sulphates", x="quality")

# Author's reading of the chart: the higher the sulphates, the higher the quality.

In [None]:
# Bar chart of alcohol for each quality score.
plot = plt.figure(figsize=(9, 8))
sns.barplot(data=dataset, y="alcohol", x="quality")

# Author's reading of the chart: the higher the alcohol, the higher the quality.

In [None]:
# Pairwise correlation between the columns.
# "Id" is left out: the notebook treats it as a column that adds nothing to the
# analysis (it is also dropped from the model features), so correlating it with
# the measurements would only add a meaningless row/column to the matrix.
correlation = dataset.drop(columns="Id").corr()
correlation

In [None]:
# Annotated heatmap of the `correlation` matrix computed in the previous cell.
plt.figure(figsize=(10, 9))
sns.heatmap(
    correlation,
    annot=True,
    annot_kws={'size': 8},
    fmt=".2f",
    square=True,
    cbar=True,
)

In [None]:
# Target variable: the wine quality score.
y = dataset['quality']
# Features: every column except the target 'quality' and the 'Id' column.
X = dataset.drop(columns=['quality', 'Id'])

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Initialise and train the RandomForestClassifier on the training split.
# A random forest is stochastic (bootstrap samples, random feature subsets), so
# the seed is fixed to make the fitted model - and every metric derived from
# it - reproducible across runs. 42 matches the seed used for the data split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test features.
y_pred = model.predict(X_test)

# Accuracy: fraction of test labels that were predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Acurácia do modelo: {accuracy}')

In [None]:
# `train_test_split` and `accuracy_score` are already imported in the first
# cell; only the names that are new to this example are imported here.
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

# Side example: K-nearest-neighbours classification on the Iris dataset.
# Every variable in this cell carries its own iris_/knn_ name. Reusing the
# generic names (X, y, X_train, X_test, y_train, y_test, model, y_pred,
# accuracy) would overwrite the wine data and the wine model, and the
# cross-validation and metric cells below would then silently evaluate Iris
# instead of the wine dataset.

# Load the Iris dataset.
iris_data = load_iris()
iris_X = iris_data.data
iris_y = iris_data.target

# Split the Iris data into training and test sets (20% held out, fixed seed).
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
    iris_X, iris_y, test_size=0.2, random_state=42
)

# Initialise the KNeighborsClassifier; the number of neighbours is set to 3 in this example.
knn_model = KNeighborsClassifier(n_neighbors=3)

# Train the model on the training data.
knn_model.fit(iris_X_train, iris_y_train)

# Predict on the test data.
iris_y_pred = knn_model.predict(iris_X_test)

# Compute the accuracy of the KNN model.
knn_accuracy = accuracy_score(iris_y_test, iris_y_pred)
print(f'Acurácia do modelo KNN para classificação: {knn_accuracy}')

In [None]:
from sklearn.model_selection import cross_val_score

# Initialise the model. The seed is fixed because an unseeded random forest
# gives different fold scores on every run, which makes the reported mean and
# standard deviation impossible to reproduce.
model = RandomForestClassifier(random_state=42)

# Run 5-fold cross-validation on the current X / y and collect the accuracy of each fold.
accuracies = cross_val_score(model, X, y, cv=5)

# Mean and standard deviation of the per-fold accuracies.
mean_accuracy = accuracies.mean()
std_accuracy = accuracies.std()

print(f'Acurácia média da validação cruzada: {mean_accuracy}')
print(f'Desvio padrão da acurácia da validação cruzada: {std_accuracy}')

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score, precision_score, recall_score

# Out-of-fold predictions from 5-fold cross-validation with the current `model` on X / y.
y_pred_cv = cross_val_predict(model, X, y, cv=5)

# Weighted-average precision, recall and F1 of those predictions against y.
weighted = {"average": 'weighted'}
precision = precision_score(y, y_pred_cv, **weighted)
recall = recall_score(y, y_pred_cv, **weighted)
f1 = f1_score(y, y_pred_cv, **weighted)

print(f'Precisão da validação cruzada: {precision}')
print(f'Recall da validação cruzada: {recall}')
print(f'F1-score da validação cruzada: {f1}')

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the current `y_pred` against the current `y_test`.

# Confusion matrix of true vs. predicted labels.
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Accuracy of the predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Weighted-average precision, recall and F1-score.
precision = precision_score(y_true=y_test, y_pred=y_pred, average='weighted')
recall = recall_score(y_true=y_test, y_pred=y_pred, average='weighted')
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

print(f'Acurácia: {accuracy}')
print(f'Precisão: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')
print('Matriz de Confusão:')
print(confusion_mat)