In [4]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
!pip install imblearn

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Read the CSV file
data = pd.read_csv('winequality-white.csv', sep=';')

# Plotting a histogram for each attribute
data.hist(figsize=(12, 8))
plt.tight_layout()
plt.suptitle('Histograms of Wine Dataset Attributes', y =1.05)
plt.show()

# Box plot for each attribute
data.plot(kind='box', figsize=(16, 8))
plt.tight_layout()
plt.title('Box Plots of Wine Dataset Attributes')
plt.show()

# Correlation matrix
correlation_matrix = data.corr()
plt.figure(figsize=(15, 8))
plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='nearest')
plt.colorbar()
plt.xticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=90)
plt.yticks(range(len(correlation_matrix.columns)), correlation_matrix.columns)
plt.title('Correlation Matrix of Wine Dataset Attributes')
plt.show()

# Scatter plots of attributes vs quality
for column in data.columns:
    if column != 'quality':
        plt.figure(figsize=(8, 6))
        plt.scatter(data[column], data['quality'])
        plt.xlabel(column)
        plt.ylabel('Quality')
        plt.title('Scatter Plot: Quality vs ' + column)
        plt.legend(['Wine Samples'])
        plt.show()


# Dimensionality reduction with PCA
pca = PCA(n_components=2)
pca_data = pca.fit_transform(data.drop('quality', axis=1))  # Exclude 'quality' from PCA

plt.figure(figsize=(8, 6))
scatter = plt.scatter(pca_data[:, 0], pca_data[:, 1], c=data['quality'])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA: Dimensionality Reduction')
plt.legend(*scatter.legend_elements(), title="Quality")
plt.show()



In [None]:
#Summary of statistics for each attribute
# Read the CSV file
data = pd.read_csv('winequality-white.csv', sep=';')

# Calculate summary statistics for the dataset
summary_stats = data.describe()

# Calculate IQR for each column and append to the summary
iqr = data.quantile(0.75) - data.quantile(0.25)
summary_stats.loc['IQR'] = iqr
summary_stats.to_csv('summary_stats.csv')
# Print the summary statistics
print(summary_stats)

In [3]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the data
data = pd.read_csv('winequality-white.csv', sep=';')
X = data.drop('quality', axis=1)
y = data['quality']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gbc_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('gbc', GradientBoostingClassifier())
])

gbc_param_grid = {
    'gbc__n_estimators': [100, 200, 300, 400, 500],
    'gbc__learning_rate': [0.01, 0.1, 0.2, 0.3],
    'gbc__max_depth': [3, 4, 5, 6, 7],
}

# Initialize RandomizedSearchCV
gbc_random_search = RandomizedSearchCV(gbc_pipeline, gbc_param_grid, cv=4, n_jobs=-1)

# Fit RandomizedSearchCV to the training data
gbc_random_search.fit(X_train, y_train)

# Predict and print classification report
y_pred = gbc_random_search.predict(X_test)
print(classification_report(y_test, y_pred))

print("Best parameters found:", gbc_random_search.best_params_)
print("Best cross-validated training data score found:", gbc_random_search.best_score_)


              precision    recall  f1-score   support

           3       0.20      0.20      0.20         5
           4       0.53      0.32      0.40        25
           5       0.69      0.71      0.70       291
           6       0.67      0.75      0.71       432
           7       0.73      0.58      0.65       192
           8       0.71      0.43      0.54        35
           9       0.00      0.00      0.00         0

    accuracy                           0.68       980
   macro avg       0.51      0.43      0.46       980
weighted avg       0.68      0.68      0.68       980

Best parameters found: {'gbc__n_estimators': 500, 'gbc__max_depth': 6, 'gbc__learning_rate': 0.1}
Best cross-validated training data score found: 0.6462456484125827


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import pandas as pd

# Load the data
data = pd.read_csv('winequality-white.csv', sep=';')

X = data.drop('quality', axis=1)
y = data['quality']

# Set the test sizes and random state
test_sizes = [0.2, 0.3]
random_state = 42

# Prepare the pipeline
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(class_weight='balanced'))
])

# Prepare the parameter grid
rf_param_grid = {
    'rf__n_estimators': [100, 200, 300, 400, 500],
    'rf__max_depth': [None, 10, 20, 30, 40, 50],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
}

# Create a stratified K-fold cross-validator
strat_k_fold = StratifiedKFold(n_splits=4)

# Loop over the test sizes to evaluate each
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    rf_grid_search = GridSearchCV(rf_pipeline, rf_param_grid, cv=strat_k_fold, n_jobs=-1)
    rf_grid_search.fit(X_train, y_train)

    best_score = rf_grid_search.best_score_
    best_params = rf_grid_search.best_params_

    best_model = rf_grid_search.best_estimator_
    test_score = best_model.score(X_test, y_test)

    # Make predictions on the test set
    y_pred = best_model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print("Test size:", test_size)
    print("Random state:", random_state)
    print("Best parameters found:", best_params)
    print("Best cross-validated training data score found:", best_score)
    print("Test dataset score:", test_score)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.60      0.24      0.34        25
           5       0.75      0.70      0.72       291
           6       0.66      0.82      0.73       432
           7       0.79      0.59      0.67       192
           8       0.94      0.46      0.62        35

    accuracy                           0.71       980
   macro avg       0.62      0.47      0.52       980
weighted avg       0.72      0.71      0.70       980

Test size: 0.2
Random state: 42
Best parameters found: {'rf__max_depth': 50, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 400}
Best cross-validated training data score found: 0.6653882032894874
Test dataset score: 0.7091836734693877
              precision    recall  f1-score   support

           3       0.33      0.14      0.20         7
           4       0.50      0.20      0.29        40
           5       0.68      0.

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Load the data
data = pd.read_csv('winequality-white.csv', sep=';')
X = data.drop('quality', axis=1)
y = data['quality']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

mlp_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPClassifier(max_iter=1000))
])

mlp_param_grid = {
    'mlp__hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'mlp__activation': ['tanh', 'relu'],
    'mlp__solver': ['sgd', 'adam'],
    'mlp__alpha': [0.0001, 0.05],
    'mlp__learning_rate': ['constant','adaptive'],
}

# Initialize StratifiedKFold
stratified_kfold = StratifiedKFold(n_splits=4)

mlp_grid_search = GridSearchCV(mlp_pipeline, mlp_param_grid, cv=stratified_kfold, n_jobs=-1)
mlp_grid_search.fit(X_train, y_train)

mlp_best_score = mlp_grid_search.best_score_
mlp_best_params = mlp_grid_search.best_params_

mlp_best_model = mlp_grid_search.best_estimator_
mlp_test_score = mlp_best_model.score(X_test, y_test)

# Predict and print classification report
y_pred = mlp_best_model.predict(X_test)
print(classification_report(y_test, y_pred))

print("Best parameters found:", mlp_best_params)
print("Best cross-validated training data score found:", mlp_best_score)
print("Test dataset score:", mlp_test_score)




              precision    recall  f1-score   support

           3       0.50      0.14      0.22         7
           4       0.36      0.25      0.29        40
           5       0.63      0.68      0.65       426
           6       0.68      0.64      0.66       668
           7       0.59      0.62      0.60       280
           8       0.40      0.43      0.42        49
           9       0.00      0.00      0.00         0

    accuracy                           0.63      1470
   macro avg       0.45      0.39      0.41      1470
weighted avg       0.63      0.63      0.63      1470

Best parameters found: {'mlp__activation': 'tanh', 'mlp__alpha': 0.05, 'mlp__hidden_layer_sizes': (50, 50, 50), 'mlp__learning_rate': 'adaptive', 'mlp__solver': 'adam'}
Best cross-validated training data score found: 0.5892648774795799
Test dataset score: 0.6278911564625851


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import pandas as pd

# Load the data
data = pd.read_csv('winequality-white.csv', sep=';')
X = data.drop('quality', axis=1)
y = data['quality']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gbc_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('gbc', GradientBoostingClassifier())
])

gbc_param_grid = {
    'gbc__n_estimators': [100, 200, 300, 400, 500],
    'gbc__learning_rate': [0.01, 0.1, 0.2, 0.3],
    'gbc__max_depth': [3, 4, 5, 6, 7],
}

# Initialize StratifiedKFold
stratified_kfold = StratifiedKFold(n_splits=4)

# Initialize RandomizedSearchCV with StratifiedKFold
gbc_random_search = RandomizedSearchCV(gbc_pipeline, gbc_param_grid, cv=stratified_kfold, n_jobs=-1)

# Fit RandomizedSearchCV to the training data
gbc_random_search.fit(X_train, y_train)

# Predict and print classification report
y_pred = gbc_random_search.predict(X_test)
print(classification_report(y_test, y_pred))

print("Best parameters found:", gbc_random_search.best_params_)
print("Best cross-validated training data score found:", gbc_random_search.best_score_)


              precision    recall  f1-score   support

           3       0.25      0.20      0.22         5
           4       0.60      0.36      0.45        25
           5       0.70      0.68      0.69       291
           6       0.65      0.75      0.70       432
           7       0.72      0.58      0.65       192
           8       0.77      0.49      0.60        35
           9       0.00      0.00      0.00         0

    accuracy                           0.68       980
   macro avg       0.53      0.44      0.47       980
weighted avg       0.68      0.68      0.67       980

Best parameters found: {'gbc__n_estimators': 100, 'gbc__max_depth': 6, 'gbc__learning_rate': 0.2}
Best cross-validated training data score found: 0.6426692689333139


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import pandas as pd

# Load the data
data = pd.read_csv('winequality-white.csv', sep=';')

X = data.drop('quality', axis=1)
y = data['quality']

# Set the test sizes and random state
test_sizes = [0.2, 0.3]
random_state = 42

# Prepare the pipeline
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(class_weight='balanced'))
])

# Prepare the parameter grid
rf_param_grid = {
    'rf__n_estimators': [350, 400, 450],
    'rf__max_depth': [25, 30, 35, 40, 45],
    'rf__min_samples_split': [2, 3],
    'rf__min_samples_leaf': [1, 2],
}


# Create a stratified K-fold cross-validator
strat_k_fold = StratifiedKFold(n_splits=4)

# Loop over the test sizes to evaluate each
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    rf_grid_search = GridSearchCV(rf_pipeline, rf_param_grid, cv=strat_k_fold, n_jobs=-1)
    rf_grid_search.fit(X_train, y_train)

    best_score = rf_grid_search.best_score_
    best_params = rf_grid_search.best_params_

    best_model = rf_grid_search.best_estimator_
    test_score = best_model.score(X_test, y_test)

    # Make predictions on the test set
    y_pred = best_model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print("Test size:", test_size)
    print("Random state:", random_state)
    print("Best parameters found:", best_params)
    print("Best cross-validated training data score found:", best_score)
    print("Test dataset score:", test_score)


              precision    recall  f1-score   support

           3       0.50      0.20      0.29         5
           4       0.67      0.24      0.35        25
           5       0.74      0.70      0.72       291
           6       0.67      0.81      0.73       432
           7       0.77      0.61      0.68       192
           8       0.94      0.46      0.62        35

    accuracy                           0.71       980
   macro avg       0.71      0.50      0.56       980
weighted avg       0.72      0.71      0.70       980

Test size: 0.2
Random state: 42
Best parameters found: {'rf__max_depth': 45, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 3, 'rf__n_estimators': 350}
Best cross-validated training data score found: 0.6641134748076963
Test dataset score: 0.7071428571428572
              precision    recall  f1-score   support

           3       0.33      0.14      0.20         7
           4       0.47      0.20      0.28        40
           5       0.68      0.

In [11]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

# The rest of your imports remain the same

# Load the data
data = pd.read_csv('winequality-white.csv', sep=';')

X = data.drop('quality', axis=1)
y = data['quality']

# Set the test sizes and random state
test_sizes = [0.2, 0.3]
random_state = 42

rf_pipeline = imbPipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(k_neighbors=2, random_state=random_state)),  # reduce the number of neighbors
    ('rf', RandomForestClassifier(class_weight='balanced'))
])


# Prepare the parameter grid
rf_param_grid = {
    'rf__n_estimators': [100, 200, 300, 400, 500],
    'rf__max_depth': [None, 10, 20, 30, 40, 50],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
}

# Create a stratified K-fold cross-validator
strat_k_fold = StratifiedKFold(n_splits=4)

# Loop over the test sizes to evaluate each
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    rf_grid_search = GridSearchCV(rf_pipeline, rf_param_grid, cv=strat_k_fold, n_jobs=-1)
    rf_grid_search.fit(X_train, y_train)

    best_score = rf_grid_search.best_score_
    best_params = rf_grid_search.best_params_

    best_model = rf_grid_search.best_estimator_
    test_score = best_model.score(X_test, y_test)

    # Make predictions on the test set
    y_pred = best_model.predict(X_test)

    # Print classification report
    print(classification_report(y_test, y_pred))

    # Print confusion matrix
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Binarize the output for multiclass ROC-AUC
    lb = LabelBinarizer()
    y_test_bin = lb.fit_transform(y_test)
    y_pred_bin = lb.transform(y_pred)

    # Compute and print ROC-AUC
    print("ROC-AUC Score: ", roc_auc_score(y_test_bin, y_pred_bin, average='weighted'))

    print("Test size:", test_size)
    print("Random state:", random_state)
    print("Best parameters found:", best_params)
    print("Best cross-validated training data score found:", best_score)
    print("Test dataset score:", test_score)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           3       0.33      0.20      0.25         5
           4       0.33      0.40      0.36        25
           5       0.71      0.70      0.71       291
           6       0.69      0.67      0.68       432
           7       0.64      0.66      0.65       192
           8       0.41      0.49      0.45        35
           9       0.00      0.00      0.00         0

    accuracy                           0.66       980
   macro avg       0.45      0.45      0.44       980
weighted avg       0.67      0.66      0.66       980

Confusion Matrix:
 [[  1   0   1   3   0   0   0]
 [  1  10  10   4   0   0   0]
 [  0  11 205  68   6   1   0]
 [  1   8  68 289  55  10   1]
 [  0   1   4  47 127  13   0]
 [  0   0   1   5  12  17   0]
 [  0   0   0   0   0   0   0]]
ROC-AUC Score:  0.7521409186282444
Test size: 0.2
Random state: 42
Best parameters found: {'rf__max_depth': 50, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2,

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

# Load the data
data = pd.read_csv('winequality-white.csv', sep=';')
X = data.drop('quality', axis=1).values
y = data['quality'].values

# One-hot encode the target column
y = to_categorical(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))  # Input layer
model.add(Dense(32, activation='relu'))  # Hidden layer
model.add(Dense(y.shape[1], activation='softmax'))  # Output layer

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Fit the model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100,
                    batch_size=32, callbacks=[early_stopping])

# Evaluate the model
_, train_acc = model.evaluate(X_train, y_train, verbose=0)
_, test_acc = model.evaluate(X_test, y_test, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Train: 0.617, Test: 0.567


In [15]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load your data
df = pd.read_csv('winequality-white.csv', sep=';')

# Perform Pearson correlation and select top 10 features
correlation = df.corr()['quality'].sort_values(ascending=False)
top_features = correlation.index[1:11]  # Excluding 'quality' itself

# Drop 'residual sugar' and 'free sulfur dioxide' (if they are in top_features)
if 'residual sugar' in top_features:
    top_features = top_features.drop('residual sugar')
if 'free sulfur dioxide' in top_features:
    top_features = top_features.drop('free sulfur dioxide')

X = df[top_features]
y = df['quality']

# Get the count of the least represented class
min_class_count = y.value_counts().min()

# Use SMOTE to oversample the minority classes
smote = SMOTE(k_neighbors=min_class_count-1 if min_class_count > 1 else 1, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


# List of test sizes
test_sizes = [0.2, 0.3]

# Parameters for grid search
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf', 'poly', 'sigmoid']}

for test_size in test_sizes:
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=test_size, random_state=42)

    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize an SVM model
    svm = SVC()

    # GridSearchCV
    grid = GridSearchCV(svm, param_grid, refit=True, verbose=3, n_jobs=-1)

    # Fit the grid
    grid.fit(X_train, y_train)

    # Print the best parameters
    print(f"Best parameters for test size {test_size}: {grid.best_params_}")

    # Make predictions with the best parameters and evaluate the model
    y_pred = grid.predict(X_test)
    print(f"Classification Report for test size {test_size}:")
    print(classification_report(y_test, y_pred))
    print("\n")


Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.652 total time=   9.2s
[CV 4/5] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.518 total time=   2.4s
[CV 4/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.486 total time=   3.4s
[CV 4/5] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.470 total time=   2.8s
[CV 4/5] END ...C=0.1, gamma=0.001, kernel=poly;, score=0.145 total time=   2.9s
[CV 4/5] END C=0.1, gamma=0.001, kernel=sigmoid;, score=0.222 total time=   3.3s
[CV 2/5] END ......C=1, gamma=1, kernel=sigmoid;, score=0.168 total time=   4.1s
[CV 3/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.672 total time=   2.2s
[CV 4/5] END .......C=1, gamma=0.1, kernel=poly;, score=0.607 total time=   1.7s
[CV 4/5] END ....C=1, gamma=0.1, kernel=sigmoid;, score=0.316 total time=   2.8s
[CV 3/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.540 total time=   2.7s
[CV 3/5] END ......C=1, gamma=0.01, kernel=poly

KeyboardInterrupt: 

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the data
data = pd.read_csv('winequality-white.csv', sep=';')

# Correlation Analysis
correlation = data.corr()['quality'].drop('quality')
relevant_features = correlation[correlation.abs() > 0.05]

X = data[relevant_features.index]
y = data['quality']

# Define test sizes
test_sizes = [0.2, 0.3]

# Initialize pipeline
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier())
])

# Define parameter grid
rf_param_grid = {
    'rf__n_estimators': [10, 50, 100, 200],
    'rf__max_depth': [None, 10, 20, 30, 40, 50],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
}

for test_size in test_sizes:
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # Initialize RandomizedSearchCV
    rf_random_search = RandomizedSearchCV(rf_pipeline, rf_param_grid, cv=3, n_jobs=-1)

    # Fit RandomizedSearchCV to the training data
    rf_random_search.fit(X_train, y_train)

    # Predict and print classification report
    y_pred = rf_random_search.predict(X_test)
    print(f"Classification Report for test size {test_size}:")
    print(classification_report(y_test, y_pred))

    print("Best parameters found:", rf_random_search.best_params_)
    print("Best cross-validated training data score found:", rf_random_search.best_score_)
    print("\n")


Classification Report for test size 0.2:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.43      0.12      0.19        25
           5       0.68      0.68      0.68       291
           6       0.64      0.76      0.70       432
           7       0.74      0.57      0.64       192
           8       0.78      0.40      0.53        35

    accuracy                           0.67       980
   macro avg       0.54      0.42      0.46       980
weighted avg       0.67      0.67      0.66       980

Best parameters found: {'rf__n_estimators': 50, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1, 'rf__max_depth': 40}
Best cross-validated training data score found: 0.6332312404287902




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for test size 0.3:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         7
           4       0.33      0.05      0.09        40
           5       0.68      0.69      0.69       426
           6       0.64      0.77      0.70       668
           7       0.69      0.52      0.59       280
           8       0.74      0.29      0.41        49

    accuracy                           0.66      1470
   macro avg       0.51      0.39      0.41      1470
weighted avg       0.65      0.66      0.64      1470

Best parameters found: {'rf__n_estimators': 50, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2, 'rf__max_depth': 20}
Best cross-validated training data score found: 0.618433021324757




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Load the data
data = pd.read_csv('winequality-white.csv', sep=';')

# Correlation Analysis
correlation = data.corr()['quality'].drop('quality')
relevant_features = correlation[correlation.abs() > 0.05]

X = data[relevant_features.index]
y = data['quality']

# Define test sizes
test_sizes = [0.2, 0.3]

# Initialize pipeline
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

# Define parameter grid
svm_param_grid = {
    'svm__C': np.logspace(-3, 2, num=6),
    'svm__kernel': ['linear', 'rbf'],
    'svm__gamma': np.logspace(-3, 2, num=6)
}

for test_size in test_sizes:
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # Initialize RandomizedSearchCV
    svm_random_search = RandomizedSearchCV(svm_pipeline, svm_param_grid, cv=3, n_jobs=-1)

    # Fit RandomizedSearchCV to the training data
    svm_random_search.fit(X_train, y_train)

    # Predict and print classification report
    y_pred = svm_random_search.predict(X_test)
    print(f"Classification Report for test size {test_size}:")
    print(classification_report(y_test, y_pred))

    print("Best parameters found:", svm_random_search.best_params_)
    print("Best cross-validated training data score found:", svm_random_search.best_score_)
    print("\n")


Classification Report for test size 0.2:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.75      0.12      0.21        25
           5       0.93      0.39      0.55       291
           6       0.56      0.98      0.71       432
           7       0.99      0.42      0.59       192
           8       1.00      0.40      0.57        35

    accuracy                           0.65       980
   macro avg       0.71      0.39      0.44       980
weighted avg       0.77      0.65      0.62       980

Best parameters found: {'svm__kernel': 'rbf', 'svm__gamma': 10.0, 'svm__C': 10.0}
Best cross-validated training data score found: 0.587034201123022




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for test size 0.3:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         7
           4       1.00      0.05      0.10        40
           5       1.00      0.29      0.45       426
           6       0.54      1.00      0.70       668
           7       1.00      0.31      0.47       280
           8       1.00      0.35      0.52        49

    accuracy                           0.61      1470
   macro avg       0.76      0.33      0.37      1470
weighted avg       0.79      0.61      0.56      1470

Best parameters found: {'svm__kernel': 'rbf', 'svm__gamma': 100.0, 'svm__C': 1.0}
Best cross-validated training data score found: 0.5504678596436392




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load your data
df = pd.read_csv('winequality-white.csv', sep=';')

# Perform Pearson correlation and select top 10 features
correlation = df.corr()['quality'].sort_values(ascending=False)
top_features = correlation.index[1:11]  # Excluding 'quality' itself

# Drop 'residual sugar' and 'free sulfur dioxide' (if they are in top_features)
if 'residual sugar' in top_features:
    top_features = top_features.drop('residual sugar')
if 'free sulfur dioxide' in top_features:
    top_features = top_features.drop('free sulfur dioxide')

X = df[top_features]
y = df['quality']

# Get the count of the least represented class
min_class_count = y.value_counts().min()

# Use SMOTE to oversample the minority classes
smote = SMOTE(k_neighbors=min_class_count-1 if min_class_count > 1 else 1, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# List of test sizes
test_sizes = [0.2, 0.3]

# Parameters for grid search
param_grid = {'n_neighbors': list(range(1,31)),
              'weights': ['uniform', 'distance'],
              'metric': ['euclidean', 'manhattan', 'minkowski']}

for test_size in test_sizes:
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=test_size, random_state=42)

    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize a KNN model
    knn = KNeighborsClassifier()

    # GridSearchCV
    grid = GridSearchCV(knn, param_grid, refit=True, verbose=3, n_jobs=-1)

    # Fit the grid
    grid.fit(X_train, y_train)

    # Print the best parameters
    print(f"Best parameters for test size {test_size}: {grid.best_params_}")

    # Make predictions with the best parameters and evaluate the model
    y_pred = grid.predict(X_test)
    print(f"Classification Report for test size {test_size}:")
    print(classification_report(y_test, y_pred))
    print("\n")


ModuleNotFoundError: No module named 'imblearn'