In [1]:
import pickle
from typing import Any, Dict, Optional, Sequence, Tuple

import mlflow
import numpy as np
import pandas as pd
from mlflow.tracking import MlflowClient
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
def load_data(path: str) -> pd.DataFrame:
    """
    Read the wine quality CSV located at ``path`` into a DataFrame.

    The shape of the loaded table is printed as a quick sanity check
    before the frame is handed back to the caller.
    """
    wine_frame = pd.read_csv(path)
    n_rows_cols = wine_frame.shape
    print(f"Dataset shape: {n_rows_cols}")
    return wine_frame

# Load the dataset
df = load_data('winequalityN.csv')
print("\nFirst few rows of the dataset:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would add a stray "None" line).
df.info()

Dataset shape: (6497, 13)

First few rows of the dataset:
    type  fixed acidity  volatile acidity  citric acid  residual sugar  \
0  white            7.0              0.27         0.36            20.7   
1  white            6.3              0.30         0.34             1.6   
2  white            8.1              0.28         0.40             6.9   
3  white            7.2              0.23         0.32             8.5   
4  white            7.2              0.23         0.32             8.5   

   chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  \
0      0.045                 45.0                 170.0   1.0010  3.00   
1      0.049                 14.0                 132.0   0.9940  3.30   
2      0.050                 30.0                  97.0   0.9951  3.26   
3      0.058                 47.0                 186.0   0.9956  3.19   
4      0.058                 47.0                 186.0   0.9956  3.19   

   sulphates  alcohol  quality  
0       0.45      8

In [3]:
def prepare_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned version of the wine dataset.

    Rows containing missing values are discarded, exact duplicate rows are
    removed, and the textual 'type' column is replaced by an integer flag
    (1 where the value equals 'red', 0 for anything else). The frame passed
    in is left unmodified; the original row labels are kept.
    """
    return (
        df
        .dropna()            # discard incomplete rows
        .drop_duplicates()   # keep the first occurrence of repeated rows
        .assign(type=lambda frame: (frame['type'] == 'red').astype(int))
    )

# Prepare the data: prepare_data returns a new frame, so the raw `df` loaded
# earlier stays untouched and this cell can be re-run safely.
df_cleaned = prepare_data(df)
# Report the shape after cleaning to show how many rows were removed
print("Dataset shape after cleaning:", df_cleaned.shape)


Dataset shape after cleaning: (5295, 13)


In [4]:
def prepare_features_and_target(
    df: pd.DataFrame,
    target_column: str = 'quality'
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Split a frame into its predictor columns and its target column.

    Returns a ``(features, target)`` pair: ``features`` holds every column
    except ``target_column`` and ``target`` is that column as a Series.
    No other transformation is applied.
    """
    features = df.drop(target_column, axis=1)
    target = df[target_column]
    return features, target

def preprocess_data(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray, StandardScaler]:
    """
    Standardize the features with a StandardScaler fitted on the training split.

    The scaler is fitted on ``X_train`` only and then applied unchanged to
    ``X_test``, so the test split does not influence the scaling statistics.

    Returns ``(X_train_scaled, X_test_scaled, scaler)``. With scikit-learn's
    default output configuration the two scaled results are NumPy arrays, not
    DataFrames, so column labels are NOT carried over; callers that need the
    feature names must take them from the unscaled frames.
    """
    scaler = StandardScaler()
    # Fit on the training rows only, then reuse the fitted statistics
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, scaler

# Separate the predictor columns from the quality target
X, y = prepare_features_and_target(df_cleaned)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Standardize both splits with a scaler fitted on the training rows
X_train_scaled, X_test_scaled, scaler = preprocess_data(X_train, X_test)

print("Training set shape:", X_train_scaled.shape)
print("Testing set shape:", X_test_scaled.shape)

Training set shape: (4236, 12)
Testing set shape: (1059, 12)


In [5]:
def train_and_log_model(
    X_train: np.ndarray,
    X_test: np.ndarray,
    y_train: np.ndarray,
    y_test: np.ndarray,
    model_params: Optional[Dict[str, Any]] = None,
    feature_names: Optional[Sequence[str]] = None
) -> Tuple[LinearRegression, Dict[str, float]]:
    """
    Train a LinearRegression model and record the run with MLflow.

    Parameters:
        X_train, X_test: feature matrices for the training and test splits.
        y_train, y_test: matching target values.
        model_params: keyword arguments forwarded to ``LinearRegression``;
            ``None`` means "use the estimator defaults".
        feature_names: optional column names to store with the run. When
            omitted, they are taken from ``X_train.columns`` if ``X_train``
            has that attribute. If no names are available, no "features"
            param is logged (instead of logging the literal value ``None``).

    Returns:
        ``(model, metrics)`` where ``metrics`` maps 'train_rmse', 'test_rmse',
        'train_r2' and 'test_r2' to plain floats.

    Side effects: selects the "wine-quality-prediction" MLflow experiment,
    starts a run, logs params, metrics and the fitted model under the
    artifact path "model", and prints the run id and metrics.
    """
    # Copy so the caller's dict is never aliased by this function
    params = dict(model_params) if model_params is not None else {}

    # Scaled inputs are usually plain arrays without column labels, so an
    # explicit `feature_names` takes precedence over sniffing X_train
    if feature_names is None and hasattr(X_train, 'columns'):
        feature_names = X_train.columns

    # Set the experiment name
    mlflow.set_experiment("wine-quality-prediction")

    # Start MLflow run
    with mlflow.start_run() as run:
        # Log the parameters
        mlflow.log_params(params)

        # Log feature names only when they are actually known
        if feature_names is not None:
            mlflow.log_param("features", [str(name) for name in feature_names])

        # Train the model
        model = LinearRegression(**params)
        model.fit(X_train, y_train)

        # Make predictions
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        # Calculate metrics (cast to built-in float to match the declared return type)
        metrics = {
            'train_rmse': float(np.sqrt(mean_squared_error(y_train, y_pred_train))),
            'test_rmse': float(np.sqrt(mean_squared_error(y_test, y_pred_test))),
            'train_r2': float(r2_score(y_train, y_pred_train)),
            'test_r2': float(r2_score(y_test, y_pred_test))
        }

        # Log metrics
        mlflow.log_metrics(metrics)

        # Log model
        mlflow.sklearn.log_model(model, "model")

        print(f"Run ID: {run.info.run_id}")
        print("\nMetrics:")
        for metric_name, metric_value in metrics.items():
            print(f"{metric_name}: {metric_value:.4f}")

        return model, metrics

# Train the model with MLflow tracking
model_params = {
    'fit_intercept': True,
    'n_jobs': -1
}

# The scaled matrices carry no column labels, so the names are taken from the
# unscaled training frame to have them recorded with the run
model, metrics = train_and_log_model(
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
    model_params,
    feature_names=list(X_train.columns)
)

2025/01/06 15:56:55 INFO mlflow.tracking.fluent: Experiment with name 'wine-quality-prediction' does not exist. Creating a new experiment.


Run ID: b5581012364f4c0bb554d1662be1f771

Metrics:
train_rmse: 0.7153
test_rmse: 0.7863
train_r2: 0.3165
test_r2: 0.2941


In [6]:
def register_production_model(
    model_name: str = "wine_quality_predictor",
    experiment_name: str = "wine-quality-prediction",
    metric: str = "test_rmse"
):
    """
    Register the best finished run's model and promote it to "Production".

    Parameters:
        model_name: name under which the model is registered.
        experiment_name: MLflow experiment whose runs are searched.
        metric: logged metric used to rank runs; the run with the LOWEST
            value is treated as the best (appropriate for error metrics
            such as RMSE).

    Only runs with status FINISHED are considered, so a crashed run is never
    registered. Prints "No runs found" and returns without registering
    anything when no such run exists. Returns None in every case.

    NOTE(review): recent MLflow releases deprecate registry stages in favour
    of model version aliases; confirm against the installed MLflow version
    before upgrading.
    """
    # Ask the tracking server for the single best finished run
    runs = mlflow.search_runs(
        experiment_names=[experiment_name],
        filter_string="attributes.status = 'FINISHED'",
        order_by=[f"metrics.{metric} ASC"],
        max_results=1
    )
    if runs.empty:
        print("No runs found")
        return

    best_run = runs.iloc[0]
    run_id = best_run.run_id

    # Register the model logged under the run's "model" artifact path
    model_uri = f"runs:/{run_id}/model"
    registered_model = mlflow.register_model(model_uri, model_name)

    # Transition the model to production stage
    client = MlflowClient()
    client.transition_model_version_stage(
        name=model_name,
        version=registered_model.version,
        stage="Production"
    )

    print(f"Model {model_name} version {registered_model.version} is now in production")

# Register the model in the MLflow Model Registry and promote it to the
# "Production" stage (uses the default registry name "wine_quality_predictor")
register_production_model()


Successfully registered model 'wine_quality_predictor'.
2025/01/06 15:57:15 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: wine_quality_predictor, version 1


Model wine_quality_predictor version 1 is now in production


Created version '1' of model 'wine_quality_predictor'.


In [7]:
def load_production_model(model_name: str = "wine_quality_predictor"):
    """
    Fetch the scikit-learn model registered under ``model_name`` that is
    addressed by the "Production" stage URI of the MLflow registry.
    """
    production_uri = f"models:/{model_name}/Production"
    return mlflow.sklearn.load_model(production_uri)

def predict_wine_quality(model, X: np.ndarray) -> np.ndarray:
    """
    Run ``model.predict`` on the feature matrix ``X`` and return its output.
    """
    predicted_quality = model.predict(X)
    return predicted_quality

# Fetch the model addressed by the registry's Production stage
production_model = load_production_model()

# Score the held-out, already standardized test features
predictions = predict_wine_quality(production_model, X_test_scaled)
print("\nSample predictions (first 5):", predictions[:5], sep="\n")



Sample predictions (first 5):
[5.38939485 6.56960792 6.28820982 6.01342594 5.46243156]


In [8]:
def save_model_artifacts(model, scaler, path_prefix: str = ""):
    """
    Pickle the model and the scaler to disk.

    ``path_prefix`` is prepended verbatim to the file names
    ("wine_model.pkl" and "wine_scaler.pkl"); no path separator is inserted,
    so a directory prefix must already end with one.
    """
    artifacts = (
        (f"{path_prefix}wine_model.pkl", model),
        (f"{path_prefix}wine_scaler.pkl", scaler),
    )
    # Model first, then scaler; each file is closed before the next is opened
    for target_path, artifact in artifacts:
        with open(target_path, "wb") as handle:
            pickle.dump(artifact, handle)

    print("Model artifacts saved successfully")

# Persist the fitted model and its scaler as pickle files in the working directory
save_model_artifacts(model=model, scaler=scaler)


Model artifacts saved successfully


In [9]:
def interpret_model(model, feature_names):
    """
    Print the model's coefficients, largest magnitude first.

    Builds a one-column table from ``model.coef_`` labelled with
    ``feature_names``, orders it by absolute coefficient value (descending)
    and prints the signed coefficients rounded to four decimals.
    """
    ranked = (
        pd.DataFrame(model.coef_, index=feature_names, columns=['Coefficient'])
        # helper column used only for ordering; it is not printed
        .assign(Abs_Coefficient=lambda table: table['Coefficient'].abs())
        .sort_values('Abs_Coefficient', ascending=False)
    )

    print("Feature Importance:")
    print(ranked[['Coefficient']].round(4))

# Use the real column labels when the training frame has them; otherwise
# fall back to positional placeholder names
if hasattr(X_train, 'columns'):
    feature_names = X_train.columns
else:
    feature_names = [f"feature_{i}" for i in range(X_train.shape[1])]
interpret_model(model, feature_names)

Feature Importance:
                      Coefficient
alcohol                    0.3010
density                   -0.2389
volatile acidity          -0.2299
residual sugar             0.2133
type                       0.1264
sulphates                  0.1098
free sulfur dioxide        0.0953
total sulfur dioxide      -0.0820
fixed acidity              0.0765
pH                         0.0762
chlorides                 -0.0339
citric acid                0.0062
