In [5]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, confusion_matrix, ConfusionMatrixDisplay
from sklearn.compose import TransformedTargetRegressor


In [19]:
# Load the data.
# The CSV location can be overridden through the WINE_DATA_PATH environment
# variable so the notebook is not tied to one machine; the original local
# path is kept as the fallback so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "WINE_DATA_PATH",
    "/Users/chielarkink/Documents/GitHub/MADS-HAN/Dataset/wine_data_white.csv",
)
raw_df = pd.read_csv(DATA_PATH, sep=";")


def _to_numeric(column):
    """Convert one column to numbers.

    Every value is cast to a string, spaces are removed and decimal commas
    are replaced by decimal points before parsing. Values that still cannot
    be parsed become NaN (``errors="coerce"``).
    """
    as_text = column.astype(str)
    normalised = as_text.str.replace(" ", "", regex=False).str.replace(",", ".", regex=False)
    return pd.to_numeric(normalised, errors="coerce")


# Clean the data (snippet originally generated with GPT):
# make every column numeric (strip spaces, comma -> point) and drop rows containing NaN.
# The raw frame is kept in `raw_df`, so re-running this cell is idempotent.
df = raw_df.apply(_to_numeric).dropna()
df["quality"] = df["quality"].astype(int)  # target back to integer

# Features are all columns except the target column "quality"
X = df.drop("quality", axis=1)
y = df["quality"]

# Hold out 20% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
# Baseline: 1-nearest-neighbour classifier, fitted directly on X_train
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Mean absolute error between the true and the predicted labels of the test split
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("MAE:", mae)

MAE: 0.499486125385406


In [None]:
# Apply the scaler to the data: fit it on the training split only,
# then transform both splits with that same fitted scaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Set up the pipeline: standardisation followed by a 1-nearest-neighbour classifier,
# so the scaler is always fitted on the same data the classifier is trained on
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=1)),
])

# Fit on the training split and predict the test split in one chain
y_pred_pipe = pipe.fit(X_train, y_train).predict(X_test)


In [None]:
# Validation: out-of-fold predictions of the classifier pipeline from
# 5-fold cross-validation over the complete X / y
y_pred_cv = cross_val_predict(estimator=pipe, X=X, y=y, cv=5)

mae_cv = mean_absolute_error(y_true=y, y_pred=y_pred_cv)
print(mae_cv)

# Confusion matrix of true labels versus cross-validated predictions
# (computed and stored in `cm`; it is not displayed in this cell)
cm = confusion_matrix(y_true=y, y_pred=y_pred_cv)


In [None]:
# Regression variant of the pipeline: standardisation followed by a
# 1-nearest-neighbour regressor
pipe_reg = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=1)),
])

In [None]:
# Validation: out-of-fold predictions of the regression pipeline (5-fold CV)
y_pred_reg = cross_val_predict(estimator=pipe_reg, X=X, y=y, cv=5)

# Error metrics on the raw regression output.
# NOTE: `mae` is re-assigned here, replacing any value stored under that name by an earlier cell.
mae = mean_absolute_error(y_true=y, y_pred=y_pred_reg)
rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred_reg))

# Round the predictions to the nearest whole number so they can be
# compared with the integer targets in a confusion matrix
y_pred_round = y_pred_reg.round()
cm_reg = confusion_matrix(y_true=y, y_pred=y_pred_round)

In [18]:
def round_and_clip(y, lower=0, upper=10):
    """Round values to the nearest whole number and clip them to a closed range.

    Parameters
    ----------
    y : array-like or scalar
        Values to round and clip.
    lower, upper : number, optional
        Inclusive bounds applied after rounding. The defaults (0 and 10)
        reproduce the previously hard-coded range, so ``round_and_clip(y)``
        behaves exactly as before.

    Returns
    -------
    Same kind of object that ``np.clip`` returns for ``np.round(y)``.

    Notes
    -----
    ``np.round`` rounds exact halves to the nearest even value
    (2.5 -> 2.0, 3.5 -> 4.0).
    """
    rounded = np.round(y)
    return np.clip(rounded, lower, upper)

# 1-NN regressor wrapped so that `round_and_clip` is applied to the target
# before fitting (`func`) and to the predictions afterwards (`inverse_func`).
# NOTE(review): `reg_pipe` is only constructed here; the grid search below tunes
# `pipe_reg` (a different object), so this wrapper does not affect the printed
# result. Confirm which estimator was meant to be tuned.
reg_pipe = TransformedTargetRegressor(
    regressor=KNeighborsRegressor(n_neighbors=1),
    func=round_and_clip,
    inverse_func=round_and_clip,
)

# Candidate neighbour counts for the 'knn' step of `pipe_reg`
param_grid = {'knn__n_neighbors': [1, 5, 10, 15, 20]}

# 5-fold grid search; the scorer is the negated MAE, so higher is better
grid = GridSearchCV(
    estimator=pipe_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
)
grid.fit(X, y)

# Flip the sign of the best score to report it as a plain MAE
print(grid.best_params_, -grid.best_score_)

0.499486125385406
0.6776315789473685
{'knn__n_neighbors': 20} 0.5898315527472202
