In [34]:
#Importing Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier , DescisionTreeClassifier
from sklearn.metrics import accuracy_score

#Loading dataset to Pandas DataFrame
wine_data = pd.read_csv('winequalityN.csv')


#Drop the 'type' column if it exists
if 'type' in wine_data.columns:
    wine_data = wine_data.drop('type', axis=1)

#Number of rows and columns in the Dataset
wine_data.shape

#Check missing values in the dataset
wine_data.isnull().sum()

#Handle missing values by imputing with the mean
columns_to_impute = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'pH', 'sulphates']
for column in columns_to_impute:
    wine_data[column].fillna(wine_data[column].mean(), inplace=True)

#Statistical measures of the dataset
wine_data.describe()

#Volatile acidity vs Quality
plt.figure(figsize=(4, 4))
sns.barplot(x='quality', y='volatile acidity', data=wine_data, palette="inferno")
plt.title('Volatile Acidity vs Quality')
plt.show()

#Citric acid vs Quality
plot = plt.figure(figsize=(4,4))
sns.barplot(x='quality' , y='citric acid' , data=wine_data, palette="magma")
plt.title('Citric Acid vs Quality')
plt.show()

#Chlorides vs Quality
plot = plt.figure(figsize=(4,4))
sns.barplot(x='quality' , y='chlorides' , data=wine_data, palette="dark")
plt.title('Chlorides vs Quality')
plt.show()

#Alcohol vs Quality
plot = plt.figure(figsize=(4,4))
sns.barplot(x='quality' , y='alcohol' , data=wine_data, palette="viridis")
plt.title('Alchol vs Quality')
plt.show()

#pH vs Quality
plot = plt.figure(figsize=(4,4))
sns.barplot(x='quality' , y='pH' , data=wine_data, palette="bright")
plt.title('pH vs Quality')
plt.show()

#Label Binarization/Encoding
Y = wine_data['quality'].apply(lambda y_value: 1 if y_value>=7 else 0)
print(Y)

X= wine_data.drop('quality',axis=1)
print(X)
#Train and Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

#Model Training:

#Random Forest Classifier
model = RandomForestClassifier()
model.fit(X_train, Y_train)

#Model Evualuation:

#Accuracy Score
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy : ', test_data_accuracy)

input_data = (6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5)

#changing the input data into numpy array
input_data_as_numpy_array = np.asarray(input_data)

#reshape the data as we are predicting the label for only one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]==1):
  print('Good Quality Wine')
else:
  print('Bad Quality Wine')


ImportError: cannot import name 'DescisionTreeClassifier' from 'sklearn.ensemble' (/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/__init__.py)

In [36]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Create and train the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=2)
dt_model.fit(X_train, Y_train)

# Make predictions on the test set
dt_test_prediction = dt_model.predict(X_test)

# Calculate accuracy
dt_test_data_accuracy = accuracy_score(dt_test_prediction, Y_test)
print('Decision Tree Accuracy:', dt_test_data_accuracy)

# Compare with Random Forest accuracy
print('Random Forest Accuracy:', test_data_accuracy)

#Random Forest has more accuracy than Decision Tree so it is the best option

Decision Tree Accuracy: 0.8423076923076923
Random Forest Accuracy: 0.8884615384615384
