In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = "Downloads/winequality-red.csv"
try:
    # Adjusting the delimiter if necessary
    data = pd.read_csv(file_path, sep=',')  # Change delimiter as per dataset
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("File not found! Please check the file path and try again.")
    exit()

# Display the first few rows of the dataset
print("\nFirst 5 rows of the dataset:")
print(data.head())

# Ensure column names are correct
expected_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
    'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality'
]
if len(data.columns) == len(expected_columns):
    data.columns = expected_columns
    print("\nColumn names assigned successfully!")
else:
    print("\nError: Column mismatch. Expected", len(expected_columns), "columns but got", len(data.columns))
    print("Current columns:", data.columns)
    exit()

# Strip any leading/trailing spaces from column names
data.columns = data.columns.str.strip()

# Display dataset info
print("\nDataset Description:")
print(data.describe())
print("\nColumns in the dataset:")
print(data.columns)

# Define features and target variable
X = data.drop('quality', axis=1)
y = data['quality']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Naive Bayes": GaussianNB()
}

# Train and evaluate each model
print("\nModel Evaluation:")
for model_name, model in models.items():
    try:
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Evaluate the model
        print(f"\n{model_name} Results:")
        print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
        print("Classification Report:")
        print(classification_report(y_test, y_pred))
    except Exception as e:
        print(f"\nError occurred while training {model_name}: {e}")

Dataset loaded successfully!

First 5 rows of the dataset:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
