### 1. Data Loading
### 2. Exploratory Data Analysis
### 3. Feature Selection
### 4. Model Training
### 5. Model Evaluation
### 6. Conclusion


Importing the Dependencies


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


Data Collection


In [None]:
# Read the red-wine quality CSV into a pandas DataFrame
wine_dataset = pd.read_csv('winequality-red.csv')


In [None]:
# number of rows and columns in the dataset, shown as a (rows, columns) tuple
wine_dataset.shape

In [None]:
# preview the first 5 rows of the dataset
wine_dataset.head()

In [None]:
# checking for missing values: number of null entries in each column
wine_dataset.isnull().sum()

Data Analysis and Visualisation


In [None]:
# statistical measures of the dataset (per-column summary produced by describe())
wine_dataset.describe()

In [None]:
# Count plot: how many wines fall under each quality score
sns.catplot(data=wine_dataset, x='quality', kind='count')

In [None]:
# Bar plot of volatile acidity for each quality score, on a 5x5 figure
plot = plt.figure(figsize=(5, 5))
sns.barplot(data=wine_dataset, x='quality', y='volatile acidity')

In [None]:
# Bar plot of citric acid for each quality score, on a 5x5 figure
plot = plt.figure(figsize=(5, 5))
sns.barplot(data=wine_dataset, x='quality', y='citric acid')

In [None]:
# Bar plot of fixed acidity for each quality score, on a 5x5 figure
plot = plt.figure(figsize=(5, 5))
sns.barplot(data=wine_dataset, x='quality', y='fixed acidity')

Correlation

1. Positive Correlation
2. Negative Correlation

In [None]:
# pairwise correlation between the columns of the dataset; plotted as a heatmap in the next cell
correlation= wine_dataset.corr()

In [None]:
# Draw an annotated heatmap of the correlation matrix to see how the columns relate
plt.figure(figsize=(11, 11))
sns.heatmap(
    correlation,
    cmap='Purples',
    annot=True,
    annot_kws={'size': 8},
    fmt='.1f',
    square=True,
    cbar=True,
)

Data Preprocessing

In [None]:
# separate the data and label
# Feature matrix: every column of the dataset except the 'quality' label
X = wine_dataset.drop(columns='quality')

Label Binarisation

In [None]:
# Binarise the target: a quality score of 7 or higher becomes 1, anything lower becomes 0.
# A vectorised comparison replaces the per-row Python lambda; the explicit int64 cast
# keeps the labels as 0/1 integers, the same values the lambda produced.
Y = (wine_dataset['quality'] >= 7).astype('int64')

Train and Test Data

In [None]:
# Hold out 20% of the rows as the test split; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Shapes of the label vectors: full set, training split, test split
print(*(labels.shape for labels in (Y, Y_train, Y_test)))

Model Training:


Random Forest Classifier


In [None]:
# Random forest with the library's default hyperparameters.
# The estimator is seeded so that training is repeatable: without a random_state,
# every Restart & Run All can build a different forest and report a different accuracy.
# The seed value matches the one already used for train_test_split.
model = RandomForestClassifier(random_state=3)

In [None]:
# train the classifier on the training split (features X_train, binary labels Y_train)
model.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [None]:
# accuracy on test data
X_test_prediction=model.predict(X_test)
test_data_accuracy=accuracy_score(X_test_prediction, Y_test)

In [None]:
# report the accuracy measured on the held-out test split
print("Accuracy: ", test_data_accuracy)

Building a Predictive System

In [None]:
# A single wine sample: 11 feature values, given in the same order as the columns of X
input_data = (7.3, 0.65, 0.0, 1.2, 0.065, 15.0, 21.0, 0.9946, 3.39, 0.47, 10.0)

# Convert the tuple to a NumPy array
input_data_as_numpy_array = np.asarray(input_data)

# Only one instance is being predicted, so present it as a single row (1, n_features)
input_data_reshaped = np.reshape(input_data_as_numpy_array, (1, -1))

prediction = model.predict(input_data_reshaped)
print(prediction)

# Label 1 is reported as good quality; any other label as bad quality
print("Good Quality Wine" if prediction[0] == 1 else "Bad Quality Wine")

In [None]:
# Predict from a DataFrame rather than a bare array, so the input carries feature
# names and the warning is avoided.
# The column labels come from X, the frame that X_train was split from.
feature_names = X.columns

# Wrap the single sample as a one-row DataFrame labelled with those feature names
input_df = pd.DataFrame(data=[input_data], columns=feature_names)

prediction = model.predict(input_df)
print(prediction)

# Label 1 is reported as good quality; any other label as bad quality
if prediction[0] != 1:
    print("Bad Quality Wine")
else:
    print("Good Quality Wine")

### Conclusion
Alcohol and volatile acidity were found to have the strongest relationship with wine quality.
The trained Random Forest model achieved reasonable accuracy on the held-out test set; it could be
improved further with hyperparameter tuning or more advanced models.
