In [10]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [11]:
df = pd.read_csv('winequality-red-selected-missing.csv')

In [12]:
print(df.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76          NaN             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [13]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1388 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1405 non-null   float64
 8   pH                    1389 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
None


In [14]:
# Check for missing values
print(df.isnull().sum())

fixed acidity             0
volatile acidity          0
citric acid             211
residual sugar            0
chlorides                 0
free sulfur dioxide       0
total sulfur dioxide      0
density                 194
pH                      210
sulphates                 0
alcohol                   0
quality                   0
dtype: int64


In [15]:
# Handle missing values with mean
df_filled = df.fillna(df.mean(numeric_only=True))

In [16]:
# Check again if the missing values have been fixed
print(df_filled.isnull().sum())

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [17]:
print(df_filled.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70     0.000000             1.9      0.076   
1            7.8              0.88     0.000000             2.6      0.098   
2            7.8              0.76     0.274791             2.3      0.092   
3           11.2              0.28     0.560000             1.9      0.075   
4            7.4              0.70     0.000000             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [18]:
# Create the binary target variable 'is_good_quality'
df['is_good_quality'] = (df['quality'] >= 7).astype(int)

In [19]:
# Separate features (X) and target (y)
X = df.drop(['quality', 'is_good_quality'], axis=1)
y = df['is_good_quality']

In [20]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [21]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Shape of training data:", X_train_scaled.shape)
print("Shape of testing data:", X_test_scaled.shape)

Shape of training data: (1279, 11)
Shape of testing data: (320, 11)


In [22]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [23]:
# Initialize and train the XGBoost classifier
# Use 'objective="binary:logistic"' for binary classification
# Use 'eval_metric="logloss"' to evaluate performance on a log loss metric
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=42
)

In [24]:
# Fit the model to the training data
xgb_model.fit(X_train_scaled, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [25]:
# Make predictions on the test set
y_pred = xgb_model.predict(X_test_scaled)
y_pred_proba = xgb_model.predict_proba(X_test_scaled)[:, 1]

In [26]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")


Model Accuracy: 0.9437


In [27]:
# Display a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Not Good', 'Good']))


Classification Report:
              precision    recall  f1-score   support

    Not Good       0.95      0.98      0.97       277
        Good       0.86      0.70      0.77        43

    accuracy                           0.94       320
   macro avg       0.91      0.84      0.87       320
weighted avg       0.94      0.94      0.94       320



In [28]:
# Display the confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[272   5]
 [ 13  30]]


In [29]:
import joblib

# Assuming xgb_model and scaler are your trained model and scaler
joblib.dump(xgb_model, 'xgb_model.joblib')
joblib.dump(scaler, 'scaler.joblib')

print("Model and scaler saved as xgb_model.joblib and scaler.joblib")

Model and scaler saved as xgb_model.joblib and scaler.joblib
