<a href="https://colab.research.google.com/github/BifoldTide/-Core-Calidad-de-vinos/blob/main/(Core)_Calidad_del_vino.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Carga de datos

In [2]:
# Importar bibliotecas
import pandas as pd
import numpy as np

In [3]:
# Importar los datos
path = "/content/drive/MyDrive/BBDD SONDA/WineQT.csv"
df = pd.read_csv(path)

In [4]:
# Leer los datos a ver que estén bien cargados
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4


### Limpieza de datos

In [5]:
# Ver status de las columnas
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64  
 12  Id                    1143 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 116.2 KB


In [6]:
# Normalizar el nombre de la columna ID
df.rename(columns = {"Id" : "id"}, inplace = True)

In [7]:
# Buscar duplicados
df[df.duplicated()] # No hay duplicados

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,id


In [12]:
# Detectar nulos:
df[df.isnull().any(axis=1)] # No hay nulos

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,id


In [13]:
# Ver métricas
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,id
count,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0
mean,8.311111,0.531339,0.268364,2.532152,0.086933,15.615486,45.914698,0.99673,3.311015,0.657708,10.442111,5.657043,804.969379
std,1.747595,0.179633,0.196686,1.355917,0.047267,10.250486,32.78213,0.001925,0.156664,0.170399,1.082196,0.805824,463.997116
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0,0.0
25%,7.1,0.3925,0.09,1.9,0.07,7.0,21.0,0.99557,3.205,0.55,9.5,5.0,411.0
50%,7.9,0.52,0.25,2.2,0.079,13.0,37.0,0.99668,3.31,0.62,10.2,6.0,794.0
75%,9.1,0.64,0.42,2.6,0.09,21.0,61.0,0.997845,3.4,0.73,11.1,6.0,1209.5
max,15.9,1.58,1.0,15.5,0.611,68.0,289.0,1.00369,4.01,2.0,14.9,8.0,1597.0


In [17]:
# Checkar Outliers
df[df["total sulfur dioxide"] > 150]

# Pasarlos a un valor no tan lejano
df.loc[df["total sulfur dioxide"] > 150, "total sulfur dioxide"] = 150

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,id


In [21]:
df[df["free sulfur dioxide"] > 60]
df.loc[df["free sulfur dioxide"] > 60, "free sulfur dioxide"] = 40

In [23]:
df[df['residual sugar'] > 10]
df.loc[df['residual sugar'] > 10, "residual sugar"] = 10

### Orden de datos

In [70]:
# Importar bibliotecas de modelos
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error

In [24]:
# Quitar columna ID
df.drop(columns = "id", inplace =True)

In [25]:
# Cargar features y target
X = df.drop(columns = "quality")
y = df["quality"]

In [26]:
# Orden de columnas
col_num = ["fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides", "free sulfur dioxide",
           "total sulfur dioxide", "density", "pH", "sulphates", "alcohol"]

In [29]:
# 2. Dividir en train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=96)

In [55]:
# Transformar datos
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), col_num)
])

In [56]:
# Pipelines
pipeline_KNN = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", KNeighborsRegressor(n_neighbors = 5))
])
pipeline_ranforest = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", RandomForestRegressor(n_estimators=100, random_state=96))
])
pipeline_line = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", LinearRegression())
])

In [57]:
# Entrenar
pipeline_KNN.fit(X_train, y_train)
pipeline_ranforest.fit(X_train, y_train)
pipeline_line.fit(X_train, y_train)

In [68]:
# Predecir
y_pred_knn = pipeline_KNN.predict(X_test)
y_pred_ranforest = pipeline_ranforest.predict(X_test)
y_pred_line = pipeline_line.predict(X_test)

In [73]:
# Evaluar
print("Exactitud R2 (knn):", r2_score(y_test, y_pred_knn))
print("Exactitud R2 (random forest):", r2_score(y_test, y_pred_ranforest))
print("Exactitud R2 (lineregressor):", r2_score(y_test, y_pred_line))
print("\n")
print(f"Mse knn: {mean_squared_error(y_test, y_pred_knn)}")
print(f"Mse bosque aleatorio: {mean_squared_error(y_test, y_pred_ranforest)}")
print(f"Mse linea de regresión: {mean_squared_error(y_test, y_pred_line)}")
print("\n")
print(f"Mae knn: {median_absolute_error(y_test,y_pred_knn)}")
print(f"Mae bosque aleatorio: {median_absolute_error(y_test, y_pred_ranforest)}")
print(f"Mae linea de regresión: {median_absolute_error(y_test, y_pred_line)}")

Exactitud R2 (knn): 0.24458299695708863
Exactitud R2 (random forest): 0.43964469353536595
Exactitud R2 (lineregressor): 0.33706560091910975


Mse knn: 0.46393013100436686
Mse bosque aleatorio: 0.34413537117903936
Mse linea de regresión: 0.4071330687210226


Mae knn: 0.40000000000000036
Mae bosque aleatorio: 0.3099999999999996
Mae linea de regresión: 0.3919780124531309


En sintesis, el mejor modelo para este ejercicio es el bosque aleatorio.
Tiene tanto menos errores como mejor presición en el modelo.