In [98]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectFromModel, SelectKBest, chi2
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.utils import resample

# Fetch the red-wine quality table from the UCI repository (fields are ';'-separated)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(url, sep=";")

# 'quality' is the target; every remaining column is a predictor
y = df['quality']
X = df.drop(columns=['quality'])

# 80/20 hold-out split, seeded for reproducibility and stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [103]:

# Three-stage pipeline: standardise the predictors, keep a subset of them via
# recursive feature elimination (RFE) driven by a class-weighted random forest,
# then classify with a second class-weighted random forest.
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42, class_weight='balanced'))),
    ('classification', RandomForestClassifier(random_state=42, class_weight='balanced')),
])

# Search space: how many features RFE keeps, plus the final forest's size and
# split/leaf constraints. Keys follow the '<step name>__<parameter>' convention.
param_grid = {
    'feature_selection__n_features_to_select': [5, 6, 7, 8],
    'classification__n_estimators': [98, 99, 100, 101, 102],
    'classification__max_depth': [None],
    'classification__min_samples_split': [4, 5, 6],
    'classification__min_samples_leaf': [1, 2, 3, 4],
    'classification__bootstrap': [True],
}

# 5-fold cross-validated search scored by weighted F1, using all available cores
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Make predictions on the held-out split with the refitted best pipeline
y_pred = grid_search.predict(X_test)

# Print classification report.
# zero_division=0 reports 0.0 for metrics that are undefined because a class
# received no predictions, without raising an UndefinedMetricWarning per average.
print(classification_report(y_test, y_pred, zero_division=0))


Best parameters: {'classification__bootstrap': True, 'classification__max_depth': None, 'classification__min_samples_leaf': 1, 'classification__min_samples_split': 4, 'classification__n_estimators': 99, 'feature_selection__n_features_to_select': 7}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       1.00      0.09      0.17        11
           5       0.74      0.76      0.75       136
           6       0.65      0.73      0.69       128
           7       0.64      0.53      0.58        40
           8       1.00      0.33      0.50         3

    accuracy                           0.69       320
   macro avg       0.67      0.41      0.45       320
weighted avg       0.70      0.69      0.68       320



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [104]:
# Fit a standalone RFE selector (default number of features to keep) on the
# training split to inspect how it ranks the predictors.
feature_select = RFE(
    estimator=RandomForestClassifier(random_state=42, class_weight='balanced')
).fit(X_train, y_train)

# NOTE: `ranking_` holds RFE elimination ranks, not importance scores:
# rank 1 marks the features RFE kept; higher ranks were eliminated earlier.
importances = feature_select.ranking_
# Take the names from the frame the selector was fitted on so that they are
# guaranteed to line up position-by-position with `ranking_`.
feature_names = X_train.columns

# Create a DataFrame to display the ranks, largest (earliest eliminated) first
feature_importances = pd.DataFrame(
    {'Predictor': list(feature_names), 'Importance': importances}
).sort_values(by='Importance', ascending=False)
print(feature_importances)




               Predictor  Importance
3         residual sugar           7
5    free sulfur dioxide           6
0          fixed acidity           5
8                     pH           4
4              chlorides           3
2            citric acid           2
1       volatile acidity           1
6   total sulfur dioxide           1
7                density           1
9              sulphates           1
10               alcohol           1
