In [None]:
# Binary target derived from 'quality':
# ratings up to and including 5 become 0, anything else becomes 1.
is_low_quality = data['quality'].le(5)
data['opinion'] = is_low_quality.map({True: 0, False: 1})

In [None]:
# Describe the variables present in the dataset.
# Maps each column name to the kind of measurement it holds.
# NOTE(review): a later cell filters on data['type'], which is not listed
# here -- confirm whether it should be described as well.
variables = {
    'fixed acidity': 'continuous',
    'volatile acidity': 'continuous',
    'citric acid': 'continuous',
    'residual sugar': 'continuous',
    'chlorides': 'continuous',
    'free sulfur dioxide': 'continuous',
    'total sulfur dioxide': 'continuous',
    'density': 'continuous',
    'pH': 'continuous',
    'sulphates': 'continuous',
    'alcohol': 'continuous',
    'quality': 'discrete',
    'opinion': 'categorical'
}

# Per-column mean and standard deviation.
# numeric_only=True skips non-numeric columns (a later cell compares
# data['type'] to the string 'red'); without it pandas >= 2.0 raises a
# TypeError as soon as such a column is present. For an all-numeric frame
# the result is unchanged.
means = data.mean(numeric_only=True)
stds = data.std(numeric_only=True)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Select features and target variable.
# 'quality' is removed because 'opinion' is derived directly from it.
# 'type' is removed when present: a later cell compares it to the string
# 'red', and a string column cannot be fed to the estimators below.
# errors='ignore' makes this a no-op when the column does not exist.
X = data.drop(['quality', 'opinion'], axis=1).drop(columns=['type'], errors='ignore')
y = data['opinion']

# Initialize logistic regression model (scikit-learn defaults).
# NOTE(review): the features are not scaled here, while a later cell feeds
# scaled inputs to this same estimator -- confirm which is intended.
logreg = LogisticRegression()

# Perform stratified cross-validation: 10 shuffled folds with a fixed seed,
# reused by the other model cells so every model sees the same splits.
cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# Calculate metrics using cross_validate.
# cross_validate fits clones of the estimator, so `logreg` itself is still
# unfitted after this call.
scoring = ['accuracy', 'precision', 'recall', 'f1']
results = cross_validate(logreg, X, y, cv=cv, scoring=scoring)

# Calculate means and standard deviations of the obtained metrics.
# Each results['test_<metric>'] entry holds one score per fold.
mean_accuracy = results['test_accuracy'].mean()
std_accuracy = results['test_accuracy'].std()

mean_precision = results['test_precision'].mean()
std_precision = results['test_precision'].std()

mean_recall = results['test_recall'].mean()
std_recall = results['test_recall'].std()

mean_f1 = results['test_f1'].mean()
std_f1 = results['test_f1'].std()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize decision tree model.
# random_state is fixed (same seed as the CV splitter) because tree
# construction is randomised when candidate splits tie; without a seed the
# reported metrics can change from one run of the notebook to the next.
dtree = DecisionTreeClassifier(random_state=42)

# Perform stratified cross-validation on the same folds and metrics as the
# logistic regression cell (X, y, cv and scoring come from that cell).
results = cross_validate(dtree, X, y, cv=cv, scoring=scoring)

# Calculate means and standard deviations of the obtained metrics.
# Each results['test_<metric>'] entry holds one score per fold.
mean_accuracy_dtree = results['test_accuracy'].mean()
std_accuracy_dtree = results['test_accuracy'].std()

mean_precision_dtree = results['test_precision'].mean()
std_precision_dtree = results['test_precision'].std()

mean_recall_dtree = results['test_recall'].mean()
std_recall_dtree = results['test_recall'].std()

mean_f1_dtree = results['test_f1'].mean()
std_f1_dtree = results['test_f1'].std()


In [None]:
from sklearn.svm import SVC

# Support-vector classifier with scikit-learn's default settings
svm = SVC()

# Score it on the shared folds and metric list defined in an earlier cell
results = cross_validate(svm, X, y, scoring=scoring, cv=cv)

# One array of per-fold scores for each metric
svm_accuracy = results['test_accuracy']
svm_precision = results['test_precision']
svm_recall = results['test_recall']
svm_f1 = results['test_f1']

# Summarise every metric as (mean, standard deviation) across the folds
mean_accuracy_svm, std_accuracy_svm = svm_accuracy.mean(), svm_accuracy.std()
mean_precision_svm, std_precision_svm = svm_precision.mean(), svm_precision.std()
mean_recall_svm, std_recall_svm = svm_recall.mean(), svm_recall.std()
mean_f1_svm, std_f1_svm = svm_f1.mean(), svm_f1.std()


In [None]:
# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay is its
# replacement. pyplot is imported here so the cell does not depend on an
# import made elsewhere in the notebook.
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import cross_val_predict


def _out_of_fold_scores(estimator, method):
    """Return positive-class scores for every row of X from held-out folds.

    cross_val_predict fits clones of `estimator` on the training part of each
    fold in `cv` and scores the held-out part, so every row is scored by a
    model that did not see it. The estimator passed in is left unfitted.

    Parameters
    ----------
    estimator : scikit-learn classifier
    method : str
        Name of the scoring method to call ('predict_proba' or
        'decision_function').

    Returns
    -------
    1-D array with one score per row of X.
    """
    scores = cross_val_predict(estimator, X, y, cv=cv, method=method)
    # predict_proba returns one column per class; column 1 belongs to the
    # larger label, i.e. opinion == 1.
    return scores[:, 1] if scores.ndim == 2 else scores


# Plot ROC curves for each model on a single shared axes.
# The estimators were only ever passed to cross_validate, which fits clones,
# so they cannot be scored directly; out-of-fold scores are used instead.
fig, ax = plt.subplots(figsize=(10, 6))

# Logistic Regression
logreg_disp = RocCurveDisplay.from_predictions(
    y, _out_of_fold_scores(logreg, 'predict_proba'),
    name='Logistic Regression', ax=ax,
)

# Decision Tree
dtree_disp = RocCurveDisplay.from_predictions(
    y, _out_of_fold_scores(dtree, 'predict_proba'),
    name='Decision Tree', ax=ax,
)

# SVM -- SVC() is created without probability=True, so it exposes
# decision_function rather than predict_proba.
svm_disp = RocCurveDisplay.from_predictions(
    y, _out_of_fold_scores(svm, 'decision_function'),
    name='SVM', ax=ax,
)

# Add labels and title
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Average ROC Curve Comparison')

# Show the plot
plt.show()


In [None]:
# Filter red wine data
red_wine = data[data['type'] == 'red'].copy()

# Split red wine data into features and target.
# 'type' is dropped together with the label columns: in this subset it is
# the constant string 'red', so it carries no information and is not a
# numeric feature.
X_red = red_wine.drop(['quality', 'opinion', 'type'], axis=1)
y_red = red_wine['opinion']

# Training set: every row that is not red wine, with the same feature columns.
# NOTE(review): presumably these are the white wines the scaler was fitted
# on -- confirm against the cell that creates `scaler`.
non_red_wine = data[data['type'] != 'red']
X_non_red = non_red_wine.drop(['quality', 'opinion', 'type'], axis=1)
y_non_red = non_red_wine['opinion']

# `logreg` has only been handed to cross_validate so far, which fits clones
# and leaves the original unfitted; fit it here so predict() can run.
# It is trained on scaled inputs because it is evaluated on scaled inputs.
# NOTE(review): assumes `scaler` (defined outside this cell) was fitted on
# these same feature columns -- confirm.
logreg.fit(scaler.transform(X_non_red), y_non_red)

# Scale red wine data with that same, already-fitted scaler
X_red_scaled = scaler.transform(X_red)

# Make predictions using the logistic regression model
y_pred_red = logreg.predict(X_red_scaled)

# Calculate metrics for red wine data
accuracy_red = accuracy_score(y_red, y_pred_red)
precision_red = precision_score(y_red, y_pred_red)
recall_red = recall_score(y_red, y_pred_red)
f1_score_red = f1_score(y_red, y_pred_red)
