# WINE QUALITY PREDICTION

1. Here, I'm developing a Machine Learning Model (Specially using Linear Regression) to predict the quality of the wine.

2. To Train and Test the model, I use dataset called winequality-red from "kaggle.com".

2. I'll use some basic Machine Learnig concept like Data Explotory Analyisis, PCA, Model selection and python libraries to build predictive model. So, I'll mention each concept along with code.

Let's Start this Model Building Journey!!!

### Step 1: Importing Python Libraries

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

### Step 2: Dataset Loading

In [37]:
winedata = pd.read_csv(r"C:\Users\HI\Downloads\winequality-red.csv")
winedata.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [38]:
winedata.shape

(1599, 12)

### Data Visualization and Analysis

In [39]:
#Labled data
winedata.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [40]:
sns.pairplot(winedata)

<seaborn.axisgrid.PairGrid at 0x1c951bb4c70>

### Step 3: Data Pre-processing and Feature Selection

In [41]:
correlations =winedata.corr()['quality'].drop('quality')
print(correlations)

fixed acidity           0.124052
volatile acidity       -0.390558
citric acid             0.226373
residual sugar          0.013732
chlorides              -0.128907
free sulfur dioxide    -0.050656
total sulfur dioxide   -0.185100
density                -0.174919
pH                     -0.057731
sulphates               0.251397
alcohol                 0.476166
Name: quality, dtype: float64


In [42]:
correlation = winedata.corr()
fig = plt.subplots(figsize = (10,10))
sns.heatmap(correlation, vmax=1, square=True, annot=True, cmap ='Oranges')

<AxesSubplot:>

In [43]:
#count of the target variable
sns.countplot(x='quality', data=winedata)

<AxesSubplot:xlabel='quality', ylabel='count'>

In [44]:
def get_features(correlation_threshold):
    abs_corrs = correlations.abs()
    high_correlations = abs_corrs[abs_corrs > correlation_threshold].index.values.tolist()
    return high_correlations

In [45]:
#taking features with correlation more than 0.05 as input x and quality as target y
features = get_features(0.05)
print(features)
x= winedata['quality']
y= winedata[features]

['fixed acidity', 'volatile acidity', 'citric acid', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']


In [46]:
for feature in features:
    plt.scatter(winedata[feature], y, label=feature)

plt.xlabel('Quality')
plt.ylabel('Features')
plt.legend()
plt.show()

ValueError: x and y must be the same size

#### Review Feature for Quality of Wine

In [None]:
#next we shall create a new column called Review. This column will contain the values of 1,2, and 3. 
#1 - Bad
#2 - Average
#3 - Excellent
#This will be split in the following way. 
#1,2,3 --> Bad
#4,5,6,7 --> Average
#8,9,10 --> Excellent
#Create an empty list called Reviews
reviews = []
for i in winedata['quality']:
    if i >= 1 and i <= 3:
        reviews.append('1')
    elif i >= 4 and i <= 7:
        reviews.append('2')
    elif i >= 8 and i <= 10:
        reviews.append('3')
winedata['Reviews'] = reviews

In [None]:
winedata.columns

In [None]:
winedata['Reviews'].unique()

In [None]:
from collections import Counter
Counter(winedata['Reviews'])

#### Split the x and y variable

In [None]:
x = winedata.iloc[:,:11]
y = winedata['Reviews']

In [None]:
#To plot graph for x and y
for feature in x.columns:
    plt.scatter(x[feature], y, label=feature)

plt.xlabel('Feature Values')
plt.ylabel('Reviews')
plt.legend()
plt.show()

#### Now scale the data using StandardScalar for PCA

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)

In [None]:
#view the scaled features
print(x)

#### Proceed to perform PCA

In [None]:
from sklearn.decomposition import PCA
pca = PCA()
x_pca = pca.fit_transform(x)

In [None]:
#plot the graph to find the principal components
plt.figure(figsize=(10,10))
plt.plot(np.cumsum(pca.explained_variance_ratio_), 'ro-')
plt.grid()

In [None]:
#AS per the graph, we can see that 8 principal components attribute for 90% of variation in the data. 
#we shall pick the first 8 components for our prediction.
pca_new = PCA(n_components=8)
x_new = pca_new.fit_transform(x)

In [None]:
print(x_new)

### Step 4: Train-Test Split

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x_new,y,random_state=4)

In [None]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

### Step 5: Model Selection
Proceed with Modelling by using Linear Regression

In [None]:
lr = LinearRegression()
lr.fit(x_train,y_train)

In [None]:
lr.coef_

In [None]:
lr_pred = lr.predict(x_train)
lr_pred = lr.predict(x_test)

### Step 6: Evaluation of Model

In [None]:
from sklearn import metrics

# Calculate and print Mean Absolute Error (MAE)
mae = metrics.mean_absolute_error(y_test, lr_pred)
formatted_mae = "{:.3f}".format(mae * 100)
print('Mean Absolute Error: {}%'.format(formatted_mae))

# Calculate and print Mean Squared Error (MSE)
mse = metrics.mean_squared_error(y_test, lr_pred)
formatted_mse = "{:.3f}".format(mse * 100)
print('Mean Squared Error: {}%'.format(formatted_mse))

# Calculate and print Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
formatted_rmse = "{:.3f}".format(rmse * 100)
print('Root Mean Squared Error: {}%'.format(formatted_rmse))

In [None]:
from sklearn.metrics import r2_score

# Assuming you have y_test and lr_pred defined
r2_value = r2_score(y_test, lr_pred)

# Format the R-squared value to display with 3 decimal places using an f-string
formatted_r2_value = f"{r2_value:.3f}"

print(f"r2_Score is: {formatted_r2_value}%")

In [None]:
# Perform PCA
pca = PCA(n_components=8)
x_new = pca.fit_transform(x)

# Get the coefficients for the principal components
pc_attributes = [f'PC{i+1}' for i in range(8)]
coefficients = pd.DataFrame(lr.coef_, pc_attributes, columns=['Coefficient'])

In [None]:
coefficients