In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Wine Quality Prediction

Attributes:\
1 - fixed acidity\
2 - volatile acidity\
3 - citric acid\
4 - residual sugar\
5 - chlorides\
6 - free sulfur dioxide\
7 - total sulfur dioxide\
8 - density\
9 - pH\
10 - sulphates\
11 - alcohol\
12 - quality (score between 0 and 10)

# Import the libraries and dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

In [None]:
# Read the WineQT CSV from the Kaggle input directory into the working DataFrame.
WINE_CSV_PATH = '/kaggle/input/wine-quality-dataset/WineQT.csv'
df = pd.read_csv(WINE_CSV_PATH)

# Exploratory data analysis (EDA) 

In [None]:
# Display the loaded DataFrame as this cell's output.
df

In [None]:
# Show the dtype of every column.
df.dtypes

The dataset does not contain categorical values and only consists of numerical values.

In [None]:
# Show the (rows, columns) shape of the data as loaded.
df.shape

There are 1143 data entries encompassing 13 attributes. The **target** variable is **quality**, and the column labeled Id is unnecessary, resulting in 11 relevant input features.

Remove the `Id` column from the dataset.

In [None]:
# Discard the Id column.
df = df.drop(columns='Id')

**Check for duplicate entries**

In [None]:
# Collect every row that repeats an earlier row across all columns.
is_duplicate = df.duplicated()
duplicate_rows = df.loc[is_duplicate]
duplicate_rows

The output indicates that there are duplicate entries in the dataset. For example, the rows with index 1116 and 1119 are identical across all columns.

Remove the duplicate rows.

In [None]:
# Keep the first occurrence of each distinct row and drop the repeats.
df = df.drop_duplicates(keep='first')

**Check for missing values**

In [None]:
# Tally the null entries in each column.
missing_values = df.isna().sum()

# Print the per-column tally.
print("Missing values in each column:")
print(missing_values)

In [None]:
# Derive the message from the data instead of asserting the outcome
# unconditionally, so the cell stays truthful if the input file changes.
total_missing = int(df.isnull().sum().sum())
if total_missing == 0:
    print("\nThere are no missing values in each column.")
else:
    print(f"\nThere are {total_missing} missing values in the dataset.")

In [None]:
# Show the shape again now that the Id column and duplicate rows have been dropped.
df.shape

After eliminating duplicate entries and excluding the `Id` column, our dataset now consists of 1018 observations and 12 columns.

**Summarize the data**

In [None]:
# Descriptive statistics, transposed so that each DataFrame column becomes one row.
summary_statistics = df.describe().T
summary_statistics

**Check for outliers**

In [None]:
# Draw one box plot per input feature on a fixed-width grid so that outliers
# can be inspected visually.
N_PLOT_COLS = 2

# Select the inputs by excluding the target by name rather than by position,
# matching how the modelling cell builds X.
input_columns = df.columns.drop('quality').tolist()

num_columns = len(input_columns)
num_rows = (num_columns + N_PLOT_COLS - 1) // N_PLOT_COLS  # ceiling division

# squeeze=False always yields a 2-D array of axes, so flatten() is safe even
# when the grid has a single row.
fig, axes = plt.subplots(
    nrows=num_rows, ncols=N_PLOT_COLS, figsize=(15, 5 * num_rows), squeeze=False
)
axes = axes.flatten()

for ax, column in zip(axes, input_columns):
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(column)
    # Hide the x-axis ticks. Clearing the ticks also removes their labels, so
    # no separate label-clearing or rotation call is needed.
    ax.set_xticks([])

# Remove the trailing axes that are left unused when the features do not
# fill the grid completely.
for unused_ax in axes[num_columns:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

Use Tukey's method (values more than 1.5 × IQR below Q1 or above Q3) to identify outliers, and replace them with the column mean.

In [None]:
import pandas as pd
import numpy as np

# Treat every column of df except the final one as an input feature.
input_columns = list(df.columns[:-1])

def replace_outliers_with_mean_tukey(data, column, k=1.5):
    """Replace Tukey outliers in ``data[column]`` with the column mean, in place.

    A value counts as an outlier when it lies outside the fences
    ``[Q1 - k * IQR, Q3 + k * IQR]``, where Q1 and Q3 are the 25th and 75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; ``data[column]`` is overwritten.
    column : str
        Name of a numeric column in ``data``.
    k : float, default 1.5
        Fence multiplier applied to the IQR. The default reproduces the
        conventional Tukey fences.

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    The replacement value is the mean of the whole column as it stood before
    replacement, i.e. the outliers themselves contribute to that mean.
    NaN entries never satisfy either comparison and are therefore left as-is.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr

    is_outlier = (values < lower_bound) | (values > upper_bound)
    # np.where returns a plain array; assigning it back overwrites the column.
    data[column] = np.where(is_outlier, values.mean(), values)

# Run the Tukey mean-replacement over every input column whose dtype is numeric.
numeric_input_columns = [
    name for name in input_columns if np.issubdtype(df[name].dtype, np.number)
]
for column in numeric_input_columns:
    replace_outliers_with_mean_tukey(df, column)

Now, df contains the data with outliers replaced by the mean using Tukey's Method


**Check for correlation between variables in the dataset**

In [None]:
# Pairwise correlations between all columns of df.
correlation_matrix = df.corr()

# Boolean mask covering the upper triangle (diagonal included), so each pair
# of variables is drawn only once.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Annotated heatmap on a fixed [-1, 1] colour scale; masked cells stay blank.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    vmin=-1,
    vmax=1,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

There are strong correlations between variables in the dataset. For example, there is a positive correlation between 'fixed acidity' and 'citric acid', and a negative correlation between 'fixed acidity' and 'pH'.

I plan to employ tree-based ensemble methods to address the challenges associated with multicollinearity.

**Understand the target variable, quality**

In [None]:
# Histogram of the target column with a kernel density estimate overlaid.
y = df['quality']
quality_ax = sns.histplot(y, kde=True)
quality_ax.set_title('Distribution of Quality')
quality_ax.set_xlabel('Quality')
quality_ax.set_ylabel('Frequency')
plt.show()

In the dataset, there are six quality levels ranging from 3 to 8.

The highest quality is represented by the rating 8, while the lowest quality corresponds to the rating 3.

# Defining the goal after EDA

Given that the target variable has distinct levels (quality ratings), this is a multi-class classification problem.

Each quality rating (3 to 8) represents a category, and the goal is to predict the quality level for each instance in the dataset.

Distribution pattern of target

In [None]:
# Bar chart of how many rows fall into each quality level.
count_ax = sns.countplot(data=df, x='quality')
count_ax.set(title='Distribution of Quality Levels', xlabel='Quality', ylabel='Count')
plt.show()

# The same tallies as numbers.
quality_counts = df['quality'].value_counts()
print("Count of each unique value in 'quality':")
print(quality_counts)

SMOTE will be used to address class imbalance.

# Build the Tree-based Models

In [None]:
# Separate the target column (y) from the input features (X).
y = df['quality']
X = df.drop(columns='quality')

# Hold out 15% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=999
)

# Standardise the features: the scaler is fitted on the training rows only and
# then reused unchanged on the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample with SMOTE on the training data only; the test set is left untouched.
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Tree-based classifiers to compare, keyed by the name used in the printed report.
classifiers = {
    'CatBoost Classifier': CatBoostClassifier(verbose=False, random_state=999),
    'RF Classifier': RandomForestClassifier(n_estimators=100, random_state=999),
    'LightGBM Classifier': LGBMClassifier(n_estimators=100, random_state=999),
}

# Fit each model on the resampled training data and print its test-set accuracy.
for name, clf in classifiers.items():
    y_pred = clf.fit(X_train_resampled, y_train_resampled).predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    rounded_accuracy = round(accuracy, 3)
    print(f'{name} Accuracy: {rounded_accuracy}')



# Summary

* Among the three tree-based models evaluated, the CatBoost Classifier exhibits the highest accuracy.