In [167]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
from sklearn.utils import resample
from scipy.stats import zscore

# Read the semicolon-separated red-wine table; the first row holds the column names
df = pd.read_csv('winequality-red.csv', sep=';', header=0)

# One sub-frame per quality grade (3 through 8) so each class can be filtered on its own
df_3, df_4, df_5, df_6, df_7, df_8 = (
    df[df['quality'] == grade] for grade in (3, 4, 5, 6, 7, 8)
)

# Z-score outlier filter: keeps rows whose feature values all lie within `threshold` standard deviations
def remove_outliers(df, threshold=3):
    """Return the rows of ``df`` whose features all have an absolute z-score below ``threshold``.

    The 'quality' column is excluded from the z-score computation but is kept
    in the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'quality' column plus numeric feature columns.
    threshold : float, default 3
        Exclusive upper bound on the absolute z-score of every feature.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index preserved) that passes the filter.
    """
    features = df.drop(['quality'], axis=1)
    z = np.abs(np.asarray(zscore(features), dtype=float))
    # A zero-variance column produces NaN z-scores, and `NaN < threshold` is False,
    # which used to discard every row of the frame. An undefined z-score gives no
    # evidence of an outlier, so it is allowed to pass.
    within_limit = (z < threshold) | np.isnan(z)
    df_out = df[within_limit.all(axis=1)]
    return df_out

# Run the outlier filter on every quality class separately
df_3_new, df_4_new, df_5_new, df_6_new, df_7_new, df_8_new = (
    remove_outliers(class_frame)
    for class_frame in (df_3, df_4, df_5, df_6, df_7, df_8)
)

# Stack the filtered classes back into a single frame (row-wise)
df_new = pd.concat(
    [df_3_new, df_4_new, df_5_new, df_6_new, df_7_new, df_8_new],
    axis=0,
)

# Features are every column except the label; the label is 'quality'
y = df_new['quality']
X = df_new.drop(columns='quality')

# Stratified 80/20 train/test split with a fixed seed
# NOTE(review): the filter above runs before this split, so the held-out rows
# have been outlier-filtered as well.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [168]:
# Summary statistics (count, mean, std, min, quartiles, max) of the outlier-filtered data set
df_new.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0
mean,8.30884,0.524997,0.264448,2.386809,0.081497,15.163674,44.161602,0.996717,3.316133,0.64174,10.408943,5.63605
std,1.638422,0.172313,0.190651,0.863379,0.020699,9.332461,30.125723,0.001732,0.140308,0.129001,1.012177,0.810777
min,4.9,0.12,0.0,1.2,0.012,1.0,6.0,0.99064,2.88,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.071,7.0,21.0,0.9956,3.22,0.55,9.5,5.0
50%,7.9,0.52,0.25,2.2,0.079,13.0,36.0,0.9967,3.31,0.62,10.2,6.0
75%,9.2,0.635,0.42,2.6,0.089,21.0,59.0,0.9978,3.4,0.72,11.0,6.0
max,13.7,1.58,0.76,6.7,0.267,48.0,155.0,1.0026,3.74,1.13,14.0,8.0


In [153]:
from sklearn.metrics import classification_report

# Pipeline: standard scaling, RFE feature selection driven by a random forest,
# then a random forest classifier. Both forests use balanced class weights.
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42,class_weight='balanced'))),
    ('classification', RandomForestClassifier(random_state=42,class_weight='balanced'))
])

# Parameter grid for the feature-selection and classification steps
param_grid = {
    'feature_selection__n_features_to_select': [5,6,7,8,9,10,11],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    # min_samples_split must be an int >= 2 (or a fraction); the former value 1
    # made every candidate using it fail to fit, so it is no longer searched.
    'classification__min_samples_split': [2,3,4,5,6],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
}

# 5-fold grid search over the pipeline, scored by weighted F1
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Obtaining best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Predict on the held-out split with the refitted best pipeline
y_pred = grid_search.predict(X_test)

# Classes that are never predicted have undefined precision; zero_division=0
# reports them as 0 explicitly instead of raising a warning per average.
print(classification_report(y_test, y_pred, zero_division=0))


Best parameters: {'classification__bootstrap': True, 'classification__max_depth': None, 'classification__min_samples_leaf': 1, 'classification__min_samples_split': 4, 'classification__n_estimators': 102, 'feature_selection__n_features_to_select': 6}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.50      0.10      0.17        10
           5       0.67      0.76      0.71       123
           6       0.63      0.64      0.63       116
           7       0.73      0.61      0.67        36
           8       0.00      0.00      0.00         3

    accuracy                           0.66       290
   macro avg       0.42      0.35      0.36       290
weighted avg       0.64      0.66      0.64       290



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [154]:
from sklearn.metrics import classification_report

# Pipeline: standard scaling, RFE feature selection driven by a random forest,
# then a random forest classifier. Both forests use balanced class weights.
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42,class_weight='balanced'))),
    ('classification', RandomForestClassifier(random_state=42,class_weight='balanced'))
])

# Narrower grid than the previous search: the option of keeping all 11 features
# is removed to see what happens when RFE has to discard at least one.
param_grid = {
    'feature_selection__n_features_to_select': [7,8,9,10],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    'classification__min_samples_split': [4,5,6],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
}

# 5-fold grid search over the pipeline, scored by weighted F1
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Predict on the held-out split with the refitted best pipeline
y_pred = grid_search.predict(X_test)

# Classes that are never predicted have undefined precision; zero_division=0
# reports them as 0 explicitly instead of raising a warning per average.
print(classification_report(y_test, y_pred, zero_division=0))


Best parameters: {'classification__bootstrap': True, 'classification__max_depth': None, 'classification__min_samples_leaf': 2, 'classification__min_samples_split': 4, 'classification__n_estimators': 99, 'feature_selection__n_features_to_select': 8}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.25      0.10      0.14        10
           5       0.68      0.80      0.74       123
           6       0.68      0.64      0.66       116
           7       0.69      0.61      0.65        36
           8       0.00      0.00      0.00         3

    accuracy                           0.68       290
   macro avg       0.38      0.36      0.36       290
weighted avg       0.66      0.68      0.66       290



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [170]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
from sklearn.utils import resample
from scipy.stats import zscore

# Read the semicolon-separated red-wine quality table; the first row holds the column names
df = pd.read_csv('winequality-red.csv', sep=';', header=0)

# One sub-frame per quality grade (3 through 8) so each class can be filtered on its own
df_3, df_4, df_5, df_6, df_7, df_8 = (
    df[df['quality'] == grade] for grade in (3, 4, 5, 6, 7, 8)
)

# IQR outlier filter: drops rows with any value beyond 1.5 * IQR below Q1 or above Q3
def remove_outliers(df):
    """Return the rows of ``df`` that lie inside the IQR fences in every column.

    For each column the fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``,
    where Q1/Q3 are the 25th/75th percentiles and ``IQR = Q3 - Q1``. A row is
    removed as soon as one of its values falls strictly outside its column's
    fences. All columns of ``df`` take part in the check.
    """
    lower_quartile = df.quantile(0.25)
    upper_quartile = df.quantile(0.75)
    spread = upper_quartile - lower_quartile

    below_fence = df < (lower_quartile - 1.5 * spread)
    above_fence = df > (upper_quartile + 1.5 * spread)

    # Flag a row when any of its columns is out of range, then keep the unflagged rows
    has_outlier = (below_fence | above_fence).any(axis=1)
    return df[~has_outlier]

# Run the outlier filter on every quality class separately
df_3_new, df_4_new, df_5_new, df_6_new, df_7_new, df_8_new = (
    remove_outliers(class_frame)
    for class_frame in (df_3, df_4, df_5, df_6, df_7, df_8)
)

# Stack the filtered classes back into a single frame (row-wise)
df_new = pd.concat(
    [df_3_new, df_4_new, df_5_new, df_6_new, df_7_new, df_8_new],
    axis=0,
)

# Features are every column except the label; the label is 'quality'
y = df_new['quality']
X = df_new.drop(columns='quality')

# Stratified 80/20 train/test split with a fixed seed
# NOTE(review): the filter above runs before this split, so the held-out rows
# have been outlier-filtered as well.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [171]:
# Summary statistics of the data set that remains after the per-class outlier filter
df_new.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1166.0,1166.0,1166.0,1166.0,1166.0,1166.0,1166.0,1166.0,1166.0,1166.0,1166.0,1166.0
mean,8.207633,0.522316,0.254657,2.195369,0.078858,15.247856,44.070326,0.996645,3.319605,0.631955,10.314994,5.626072
std,1.452245,0.166501,0.181233,0.441612,0.013857,8.828036,29.413323,0.001541,0.131017,0.115711,0.944769,0.766782
min,5.2,0.12,0.0,1.2,0.038,1.0,6.0,0.99191,2.94,0.33,8.7,3.0
25%,7.2,0.39,0.09,1.9,0.07,8.0,23.0,0.9956,3.23,0.55,9.5,5.0
50%,7.9,0.52,0.24,2.1,0.078,14.0,36.0,0.996675,3.32,0.61,10.033333,6.0
75%,9.0,0.63,0.4,2.5,0.087,21.0,58.0,0.9976,3.4,0.7,11.0,6.0
max,13.3,1.185,0.75,4.25,0.123,43.0,153.0,1.001,3.69,1.05,13.6,8.0


In [157]:
from sklearn.metrics import classification_report

# Pipeline: standard scaling, RFE feature selection driven by a random forest,
# then a random forest classifier. Both forests use balanced class weights.
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42,class_weight='balanced'))),
    ('classification', RandomForestClassifier(random_state=42,class_weight='balanced'))
])

# Parameter grid for the feature-selection and classification steps
param_grid = {
    'feature_selection__n_features_to_select': [5,6,7,8,9,10,11],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    # min_samples_split must be an int >= 2 (or a fraction); the former value 1
    # made every candidate using it fail to fit, so it is no longer searched.
    'classification__min_samples_split': [2,3,4,5,6],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
}

# 5-fold grid search over the pipeline, scored by weighted F1
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Obtaining best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Predict on the held-out split with the refitted best pipeline
y_pred = grid_search.predict(X_test)

# Classes that are never predicted have undefined precision; zero_division=0
# reports them as 0 explicitly instead of raising a warning per average.
print(classification_report(y_test, y_pred, zero_division=0))




Best parameters: {'classification__bootstrap': True, 'classification__max_depth': None, 'classification__min_samples_leaf': 1, 'classification__min_samples_split': 4, 'classification__n_estimators': 99, 'feature_selection__n_features_to_select': 11}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         7
           5       0.73      0.86      0.79       100
           6       0.72      0.72      0.72        98
           7       0.82      0.54      0.65        26
           8       1.00      0.50      0.67         2

    accuracy                           0.74       234
   macro avg       0.55      0.44      0.47       234
weighted avg       0.71      0.74      0.72       234



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [158]:
from sklearn.metrics import classification_report

# Pipeline: standard scaling, RFE feature selection driven by a random forest,
# then a random forest classifier. Both forests use balanced class weights.
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42,class_weight='balanced'))),
    ('classification', RandomForestClassifier(random_state=42,class_weight='balanced'))
])

# Narrower grid than the previous search: the option of keeping all 11 features
# is removed to see what happens when RFE has to discard at least one.
param_grid = {
    'feature_selection__n_features_to_select': [7,8,9,10],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    'classification__min_samples_split': [4,5,6],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
}

# 5-fold grid search over the pipeline, scored by weighted F1
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Predict on the held-out split with the refitted best pipeline
y_pred = grid_search.predict(X_test)

# Classes that are never predicted have undefined precision; zero_division=0
# reports them as 0 explicitly instead of raising a warning per average.
print(classification_report(y_test, y_pred, zero_division=0))





Best parameters: {'classification__bootstrap': True, 'classification__max_depth': None, 'classification__min_samples_leaf': 2, 'classification__min_samples_split': 6, 'classification__n_estimators': 100, 'feature_selection__n_features_to_select': 10}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.50      0.14      0.22         7
           5       0.70      0.86      0.77       100
           6       0.74      0.68      0.71        98
           7       0.83      0.58      0.68        26
           8       1.00      0.50      0.67         2

    accuracy                           0.73       234
   macro avg       0.63      0.46      0.51       234
weighted avg       0.73      0.73      0.72       234



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [184]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
from sklearn.utils import resample
from scipy.stats import zscore

# Read the semicolon-separated red-wine quality table; the first row holds the column names
df = pd.read_csv('winequality-red.csv', sep=';', header=0)

# One sub-frame per quality grade (3 through 8) so each class can be filtered on its own
df_3, df_4, df_5, df_6, df_7, df_8 = (
    df[df['quality'] == grade] for grade in (3, 4, 5, 6, 7, 8)
)

def remove_outliers(df, n_std=3):
    """Return the rows of ``df`` lying within ``n_std`` standard deviations of the mean in every column.

    The mean and standard deviation are computed per column over all columns
    of ``df`` with pandas' defaults. The comparison is inclusive, so a value
    exactly ``n_std`` deviations from the mean is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to filter.
    n_std : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index preserved) that passes the filter.
    """
    # Per-column centre and spread of the input frame
    mean = df.mean()
    # The sample standard deviation of a one-row frame is NaN, and any comparison
    # with NaN is False, which used to drop that only row. An undefined spread is
    # treated as zero so a row equal to the column mean still passes.
    std = df.std().fillna(0)

    # Keep rows whose every column is inside mean +/- n_std * std
    new_df = df[(np.abs(df - mean) <= n_std * std).all(axis=1)]

    # Return the dataframe with outliers removed
    return new_df

# Run the outlier filter on every quality class separately
df_3_new, df_4_new, df_5_new, df_6_new, df_7_new, df_8_new = (
    remove_outliers(class_frame)
    for class_frame in (df_3, df_4, df_5, df_6, df_7, df_8)
)

# Stack the filtered classes back into a single frame (row-wise)
df_new = pd.concat(
    [df_3_new, df_4_new, df_5_new, df_6_new, df_7_new, df_8_new],
    axis=0,
)

# Features are every column except the label; the label is 'quality'
y = df_new['quality']
X = df_new.drop(columns='quality')

# Stratified 80/20 train/test split with a fixed seed
# NOTE(review): the filter above runs before this split, so the held-out rows
# have been outlier-filtered as well.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [187]:
# Summary statistics of the data set that remains after the per-class outlier filter
df_new.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0,1452.0
mean,8.304821,0.525475,0.26407,2.388464,0.081448,15.15427,44.151515,0.996711,3.316756,0.641839,10.414153,5.639807
std,1.641415,0.172812,0.190776,0.869068,0.020704,9.323774,30.1217,0.001735,0.141002,0.129084,1.016209,0.814505
min,4.9,0.12,0.0,1.2,0.012,1.0,6.0,0.99064,2.88,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07075,7.0,21.0,0.9956,3.22,0.55,9.5,5.0
50%,7.9,0.52,0.25,2.2,0.079,13.0,36.0,0.9967,3.31,0.62,10.2,6.0
75%,9.2,0.635,0.42,2.6,0.089,21.0,59.0,0.9978,3.4,0.72,11.075,6.0
max,13.7,1.58,0.76,6.7,0.267,48.0,155.0,1.0026,3.78,1.13,14.0,8.0


In [188]:
from sklearn.metrics import classification_report

# Pipeline: standard scaling, RFE feature selection driven by a random forest,
# then a random forest classifier. Both forests use balanced class weights.
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42,class_weight='balanced'))),
    ('classification', RandomForestClassifier(random_state=42,class_weight='balanced'))
])

# Parameter grid for the feature-selection and classification steps
param_grid = {
    'feature_selection__n_features_to_select': [5,6,7,8,9,10,11],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    # min_samples_split must be an int >= 2 (or a fraction); the former value 1
    # made every candidate using it fail to fit, so it is no longer searched.
    'classification__min_samples_split': [2,3,4,5,6],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
}

# 5-fold grid search over the pipeline, scored by weighted F1
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Obtaining best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Predict on the held-out split with the refitted best pipeline
y_pred = grid_search.predict(X_test)

# Classes that are never predicted have undefined precision; zero_division=0
# reports them as 0 explicitly instead of raising a warning per average.
print(classification_report(y_test, y_pred, zero_division=0))

Best parameters: {'classification__bootstrap': True, 'classification__max_depth': None, 'classification__min_samples_leaf': 1, 'classification__min_samples_split': 3, 'classification__n_estimators': 99, 'feature_selection__n_features_to_select': 10}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        10
           5       0.72      0.80      0.76       123
           6       0.70      0.73      0.72       116
           7       0.74      0.64      0.69        36
           8       0.00      0.00      0.00         4

    accuracy                           0.71       291
   macro avg       0.36      0.36      0.36       291
weighted avg       0.68      0.71      0.69       291



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [189]:
from sklearn.metrics import classification_report

# Pipeline: standard scaling, RFE feature selection driven by a random forest,
# then a random forest classifier. Both forests use balanced class weights.
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(random_state=42,class_weight='balanced'))),
    ('classification', RandomForestClassifier(random_state=42,class_weight='balanced'))
])

# Narrower grid than the previous search: the option of keeping all 11 features
# is removed to see what happens when RFE has to discard at least one.
param_grid = {
    'feature_selection__n_features_to_select': [7,8,9,10],
    'classification__n_estimators': [98,99,100,101,102],
    'classification__max_depth': [None],
    'classification__min_samples_split': [4,5,6],
    'classification__min_samples_leaf': [1,2,3,4],
    'classification__bootstrap': [True],
}

# 5-fold grid search over the pipeline, scored by weighted F1
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Predict on the held-out split with the refitted best pipeline
y_pred = grid_search.predict(X_test)

# Classes that are never predicted have undefined precision; zero_division=0
# reports them as 0 explicitly instead of raising a warning per average.
print(classification_report(y_test, y_pred, zero_division=0))


Best parameters: {'classification__bootstrap': True, 'classification__max_depth': None, 'classification__min_samples_leaf': 1, 'classification__min_samples_split': 4, 'classification__n_estimators': 101, 'feature_selection__n_features_to_select': 8}
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.50      0.10      0.17        10
           5       0.75      0.79      0.77       123
           6       0.71      0.76      0.73       116
           7       0.68      0.64      0.66        36
           8       0.00      0.00      0.00         4

    accuracy                           0.72       291
   macro avg       0.44      0.38      0.39       291
weighted avg       0.70      0.72      0.70       291



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
