In [111]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
from sklearn.utils import resample
from scipy.stats import zscore

# Loading the dataset from UCI website
df = pd.read_csv('winequality-red.csv', header = 0, delimiter=';')

# Splitting data by class
df_3 = df[df.quality ==3]
df_4 = df[df.quality ==4]
df_5 = df[df.quality ==5]
df_6 = df[df.quality ==6]
df_7 = df[df.quality ==7]
df_8 = df[df.quality ==8]

def remove_outliers(df):
    # Calculates mean and standard deviation of the input df
    mean = df.mean()
    std = df.std()

    # creates a new dataframe for rows that are within their corresponding standard deviation
    new_df = df[(np.abs(df - mean) <= 3*std).all(axis=1)]

    # Return the dataframe with outliers removed
    return new_df

# Applying the function to each dataframe
df_3_new = remove_outliers(df_3)
df_4_new = remove_outliers(df_4)
df_5_new = remove_outliers(df_5)
df_6_new = remove_outliers(df_6)
df_7_new = remove_outliers(df_7)
df_8_new = remove_outliers(df_8)

# Concatenating all new dataframes by row
df_new = pd.concat([df_3_new,df_4_new,df_5_new,df_6_new,df_7_new,df_8_new],axis=0)

# Prepare the data
X = df_new.drop('quality', axis=1)
y = df_new['quality']

# Spltting the data set 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

In [112]:
df_new.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0
mean,8.304821,0.525475,0.26407,2.388464,0.081448,15.15427,44.151515,0.996711,3.316756,0.641839,10.414153,5.639807
std,1.641415,0.172812,0.190776,0.869068,0.020704,9.323774,30.1217,0.001735,0.141002,0.129084,1.016209,0.814505
min,4.9,0.12,0.0,1.2,0.012,1.0,6.0,0.99064,2.88,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07075,7.0,21.0,0.9956,3.22,0.55,9.5,5.0
50%,7.9,0.52,0.25,2.2,0.079,13.0,36.0,0.9967,3.31,0.62,10.2,6.0
75%,9.2,0.635,0.42,2.6,0.089,21.0,59.0,0.9978,3.4,0.72,11.075,6.0
max,13.7,1.58,0.76,6.7,0.267,48.0,155.0,1.0026,3.78,1.13,14.0,8.0


In [86]:
from sklearn.metrics import classification_report

pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42,class_weight='balanced'))),
    ('classification', RandomForestClassifier(random_state=42,class_weight='balanced'))
])

param_grid = {
    'feature_selection__n_features_to_select': [7,8,9,10],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    'classification__min_samples_split': [4,5,6],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
}

grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Make predictions
y_pred = grid_search.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))


Best parameters: {'classification__bootstrap': True, 'classification__max_depth': None, 'classification__min_samples_leaf': 1, 'classification__min_samples_split': 4, 'classification__n_estimators': 101, 'feature_selection__n_features_to_select': 8}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.50      0.10      0.17        10
           5       0.75      0.79      0.77       123
           6       0.71      0.76      0.73       116
           7       0.68      0.64      0.66        36
           8       0.00      0.00      0.00         4

    accuracy                           0.72       291
   macro avg       0.44      0.38      0.39       291
weighted avg       0.70      0.72      0.70       291



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [87]:
feature_select = RFE(estimator=RandomForestClassifier(random_state=42,bootstrap=False,class_weight='balanced')).fit(X_train,y_train)

importances = feature_select.ranking_
feature_names = df.drop(['quality'],axis=1).columns

# Create a DataFrame to display the feature importances
feature_importances = pd.DataFrame({'Predictor': list(feature_names), 'Importance': importances})
feature_importances.sort_values(by='Importance', ascending=True, inplace=True)
print(feature_importances)




               Predictor  Importance
1       volatile acidity           1
6   total sulfur dioxide           1
7                density           1
9              sulphates           1
10               alcohol           1
4              chlorides           2
2            citric acid           3
8                     pH           4
5    free sulfur dioxide           5
0          fixed acidity           6
3         residual sugar           7


In [92]:
from sklearn.metrics import classification_report

pipe = Pipeline([
    ('scaling', MinMaxScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42,class_weight='balanced'))),
    ('classification', RandomForestClassifier(random_state=42,class_weight='balanced'))
])

param_grid = {
    'feature_selection__n_features_to_select': [7,8,9,10],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    'classification__min_samples_split': [4,5,6],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
}

grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Make predictions
y_pred = grid_search.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))


Best parameters: {'classification__bootstrap': True, 'classification__max_depth': None, 'classification__min_samples_leaf': 1, 'classification__min_samples_split': 5, 'classification__n_estimators': 99, 'feature_selection__n_features_to_select': 7}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        10
           5       0.75      0.76      0.75       123
           6       0.69      0.76      0.72       116
           7       0.69      0.67      0.68        36
           8       0.00      0.00      0.00         4

    accuracy                           0.70       291
   macro avg       0.35      0.36      0.36       291
weighted avg       0.68      0.70      0.69       291



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [93]:
from sklearn.metrics import classification_report

pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42,class_weight='balanced'))),
    ('classification', RandomForestClassifier(random_state=42,class_weight='balanced'))
])

param_grid = {
    'feature_selection__n_features_to_select': [7,8,9,10],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    'classification__min_samples_split': [4,5,6],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
    'classification__max_samples': [None,0.5,0.7,0.9,1]
}

grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Make predictions
y_pred = grid_search.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))


Best parameters: {'classification__bootstrap': True, 'classification__max_depth': None, 'classification__max_samples': 0.9, 'classification__min_samples_leaf': 2, 'classification__min_samples_split': 4, 'classification__n_estimators': 102, 'feature_selection__n_features_to_select': 10}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        10
           5       0.73      0.77      0.75       123
           6       0.68      0.70      0.69       116
           7       0.68      0.69      0.68        36
           8       0.00      0.00      0.00         4

    accuracy                           0.69       291
   macro avg       0.35      0.36      0.35       291
weighted avg       0.66      0.69      0.68       291



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [94]:
from sklearn.metrics import classification_report

pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42,class_weight='balanced'))),
    ('classification', RandomForestClassifier(random_state=42,class_weight='balanced'))
])

param_grid = {
    'feature_selection__n_features_to_select': [7,8,9,10],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    'classification__min_samples_split': [4,5,6],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
    'classification__max_samples': [None,0.9,1]
}

grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Make predictions
y_pred = grid_search.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))


Best parameters: {'classification__bootstrap': True, 'classification__max_depth': None, 'classification__max_samples': 0.9, 'classification__min_samples_leaf': 2, 'classification__min_samples_split': 4, 'classification__n_estimators': 102, 'feature_selection__n_features_to_select': 10}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        10
           5       0.73      0.77      0.75       123
           6       0.68      0.70      0.69       116
           7       0.68      0.69      0.68        36
           8       0.00      0.00      0.00         4

    accuracy                           0.69       291
   macro avg       0.35      0.36      0.35       291
weighted avg       0.66      0.69      0.68       291



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [99]:
from sklearn.metrics import classification_report

pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42))),
    ('classification', RandomForestClassifier(random_state=42))
])

param_grid = {
    'feature_selection__estimator__class_weight': ['balanced'],
    'classification__class_weight': ['balanced'],
    'feature_selection__n_features_to_select': [7,8,9,10],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    'classification__min_samples_split': [4,5,6,7,8],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
}


grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Make predictions
y_pred = grid_search.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))



Best parameters: {'classification__bootstrap': True, 'classification__class_weight': 'balanced', 'classification__max_depth': None, 'classification__min_samples_leaf': 1, 'classification__min_samples_split': 4, 'classification__n_estimators': 101, 'feature_selection__estimator__class_weight': 'balanced', 'feature_selection__n_features_to_select': 8}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.50      0.10      0.17        10
           5       0.75      0.79      0.77       123
           6       0.71      0.76      0.73       116
           7       0.68      0.64      0.66        36
           8       0.00      0.00      0.00         4

    accuracy                           0.72       291
   macro avg       0.44      0.38      0.39       291
weighted avg       0.70      0.72      0.70       291



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [100]:
# Count the number of each class
class_counts = df_new['quality'].value_counts()

# Calculate weights
class_weights = {cls: len(df) / count for cls, count in class_counts.items()}

print(class_weights)


{5: 2.6, 6: 2.761658031088083, 7: 8.785714285714286, 4: 33.3125, 8: 88.83333333333333, 3: 159.9}


In [101]:
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42))),
    ('classification', RandomForestClassifier(random_state=42))
])

param_grid = {
    'feature_selection__estimator__class_weight': ['balanced',class_weights],
    'classification__class_weight': ['balanced',class_weights],
    'feature_selection__n_features_to_select': [7,8,9,10],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    'classification__min_samples_split': [4,5,6,7,8],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
}


grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Make predictions
y_pred = grid_search.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))


Best parameters: {'classification__bootstrap': True, 'classification__class_weight': {5: 2.6, 6: 2.761658031088083, 7: 8.785714285714286, 4: 33.3125, 8: 88.83333333333333, 3: 159.9}, 'classification__max_depth': None, 'classification__min_samples_leaf': 1, 'classification__min_samples_split': 4, 'classification__n_estimators': 98, 'feature_selection__estimator__class_weight': 'balanced', 'feature_selection__n_features_to_select': 8}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.50      0.10      0.17        10
           5       0.74      0.80      0.77       123
           6       0.70      0.73      0.71       116
           7       0.72      0.64      0.68        36
           8       0.00      0.00      0.00         4

    accuracy                           0.71       291
   macro avg       0.44      0.38      0.39       291
weighted avg       0.70      0.71      0.70       291



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [102]:
class_weights[3] *= 2
class_weights[8] *= 2

pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42))),
    ('classification', RandomForestClassifier(random_state=42))
])

param_grid = {
    'feature_selection__estimator__class_weight': ['balanced',class_weights],
    'classification__class_weight': ['balanced',class_weights],
    'feature_selection__n_features_to_select': [7,8,9,10],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    'classification__min_samples_split': [4,5,6,7,8],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
}


grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Make predictions
y_pred = grid_search.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))


Best parameters: {'classification__bootstrap': True, 'classification__class_weight': 'balanced', 'classification__max_depth': None, 'classification__min_samples_leaf': 1, 'classification__min_samples_split': 4, 'classification__n_estimators': 102, 'feature_selection__estimator__class_weight': {5: 2.6, 6: 2.761658031088083, 7: 8.785714285714286, 4: 33.3125, 8: 177.66666666666666, 3: 319.8}, 'feature_selection__n_features_to_select': 8}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.50      0.10      0.17        10
           5       0.75      0.79      0.77       123
           6       0.70      0.76      0.73       116
           7       0.72      0.64      0.68        36
           8       0.00      0.00      0.00         4

    accuracy                           0.72       291
   macro avg       0.44      0.38      0.39       291
weighted avg       0.70      0.72      0.70       291



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42))),
    ('classification', RandomForestClassifier(random_state=42))
])

param_grid = {
    'feature_selection__estimator__class_weight': ['balanced'],
    'classification__class_weight': ['balanced'],
    'feature_selection__n_features_to_select': [7,8,9,10],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    'classification__min_samples_split': [4,5,6,7,8],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
}


grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Make predictions
y_pred = grid_search.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))


Best parameters: {'classification__bootstrap': True, 'classification__class_weight': 'balanced', 'classification__max_depth': None, 'classification__min_samples_leaf': 2, 'classification__min_samples_split': 4, 'classification__n_estimators': 99, 'feature_selection__estimator__class_weight': 'balanced', 'feature_selection__n_features_to_select': 9}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.25      0.07      0.11        14
           5       0.73      0.79      0.76       185
           6       0.70      0.71      0.70       174
           7       0.72      0.71      0.72        55
           8       0.00      0.00      0.00         5

    accuracy                           0.71       436
   macro avg       0.40      0.38      0.38       436
weighted avg       0.69      0.71      0.70       436



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42))),
    ('classification', RandomForestClassifier(random_state=42))
])

param_grid = {
    'feature_selection__estimator__class_weight': ['balanced'],
    'classification__class_weight': ['balanced'],
    'feature_selection__n_features_to_select': [7,8,9,10],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    'classification__min_samples_split': [4,5,6,7,8],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
}


grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Make predictions
y_pred = grid_search.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))
