In [None]:

import ast
import os

import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix, issparse, lil_matrix, save_npz, load_npz

# Preprocessing
from category_encoders import TargetEncoder, HashingEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:
def parse_list_col(s):
    """Parse a stringified Python list from a CSV cell into a real list.

    Example: "['a', 'b', 'c']" -> ['a', 'b', 'c'].

    Parameters
    ----------
    s : str
        Raw cell text containing a Python list literal.

    Returns
    -------
    The object produced by ``ast.literal_eval``; an empty list when the cell
    is blank (empty or whitespace-only), since a blank cell is not a valid
    literal and would otherwise raise ``SyntaxError``.
    """
    if isinstance(s, str) and not s.strip():
        return []
    return ast.literal_eval(s)

def parse_vintage(s):
    """Turn a raw Vintage cell into an int so the column is numeric.

    The 'N.V.' marker is mapped to 0; any other value goes through ``int``.
    """
    if s == 'N.V.':
        return 0
    return int(s)


# Raw string: the backslash separators are kept literally, without depending on
# Python leaving unrecognised escapes such as "\." or "\d" untouched (which
# triggers an invalid-escape warning on recent interpreters).
base_path = r'..\..\..\..\data\main'

In [None]:

# Load the wine metadata and the train/test rating splits

wines = pd.read_csv(
    f'{base_path}\\XWines_Full_100K_wines.csv',
    usecols=['WineID', 'Type', 'Elaborate', 'ABV', 'Body', 'Acidity', 'RegionName', 'WineryName', 'Grapes','Harmonize','Country'],
    converters={
        'Grapes':    parse_list_col,
        'Harmonize': parse_list_col
    }
)


def _read_ratings(file_name, with_rating_id=True):
    """Read one ratings CSV located in ``base_path``.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``base_path``.
    with_rating_id : bool, default True
        Whether to also load the 'RatingID' column (the train split is loaded
        without it).

    Returns
    -------
    pandas.DataFrame
        The selected columns, with 'Date' handed to ``parse_dates`` and
        'Vintage' converted by ``parse_vintage``.
    """
    columns = ['UserID', 'WineID', 'Rating', 'Date', 'Vintage']
    if with_rating_id:
        columns = ['RatingID'] + columns
    # No `date_format` here: that argument takes a strftime string (or a dict of
    # them), not a callable, so `parse_dates` is left to infer the format.
    return pd.read_csv(
        f'{base_path}\\{file_name}',
        usecols=columns,
        parse_dates=['Date'],
        converters={'Vintage': parse_vintage}
    )


train = _read_ratings('trainset.csv', with_rating_id=False)
test_uwarm_iwarm = _read_ratings('testset_warm_user_warm_item.csv')
test_uwarm_icold = _read_ratings('testset_warm_user_cold_item.csv')
test_ucold_iwarm = _read_ratings('testset_cold_user_warm_item.csv')
test_ucold_icold = _read_ratings('testset_cold_user_cold_item.csv')

In [None]:
# Left-join the wine metadata onto every ratings split, keyed on 'WineID'
train = pd.merge(train, wines, how='left', on='WineID')
test_uwarm_iwarm = pd.merge(test_uwarm_iwarm, wines, how='left', on='WineID')
test_uwarm_icold = pd.merge(test_uwarm_icold, wines, how='left', on='WineID')
test_ucold_iwarm = pd.merge(test_ucold_iwarm, wines, how='left', on='WineID')
test_ucold_icold = pd.merge(test_ucold_icold, wines, how='left', on='WineID')

# Report the shape of each merged split
print(f"Train shape: {train.shape}")
print(f"Test warm user warm item shape: {test_uwarm_iwarm.shape}")
print(f"Test warm user cold item shape: {test_uwarm_icold.shape}")
print(f"Test cold user warm item shape: {test_ucold_iwarm.shape}")
print(f"Test cold user cold item shape: {test_ucold_icold.shape}")



## Preprocessing
### Preprocessing methods for different features:
* **Standard Scaler** - is used for numerical type columns
    * **ABV**
    * **Vintage** (formatted to be numerical)
    * **DaysAgo(Date)** - see below
* **One-hot-encoding** - is used for categorical features, but is limited by the number of categories within a feature:
    * **Type**
    * **Body**
    * **Acidity**
    * **Elaborate**
* **Multi-Label** - is used for categorical features with too many categories, where also multiple active categories could be possible:
    * **Grapes** (774 classes)
    * **Harmonize** (~64 classes)
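* **Target Encoding** - used for the **Country** column. A KFold (5 folds) is intended to prevent data leakage, i.e. an encoded value should never be computed from its own target value
* **Frequency Encoding** - used for IDs and high-cardinality features (**WineID**, **UserID**, **WineryName**, **RegionName**): each category is replaced by its relative frequency in the training data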
* **Date encoding** - a custom transformer converts the datetime column to DaysAgo, i.e. the number of days between each record and the most recent record in the training data. This keeps the time-related information while reducing the feature to a plain number. **Standard Scaler** is applied afterwards.


In [None]:

# Feature groups, one list per encoding strategy

# Scaled numerical columns
# TODO: consider a binary Is_NonVintage column derived from Vintage, and maybe filling NaN values with 0
numerical_features = [
    'ABV',
    'Vintage',
]
# Low-cardinality categoricals -> one-hot encoding
categorical_features = [
    'Type',
    'Elaborate',
    'Body',
    'Acidity',
]
# List-valued columns -> multilabel binarization
multilabel_features = [
    'Grapes',
    'Harmonize',
]
# Country -> target encoding
targetencoder_features = [
    'Country',
]
# IDs and high-cardinality columns -> frequency encoding
frequency_features = [
    'WineID',
    'UserID',
    'WineryName',
    'RegionName',
]
# Time-based columns -> conversion to DaysAgo
date_features = [
    'Date',
]



* **Create Preprocessing pipeline**:
    ##### **Important**: Since encoding the Grapes column creates 774 classes, plus around 100 more classes from the other encoders, a pandas DataFrame would require too much RAM (I received a 69GB memory allocation error for the Grapes column alone), and the same happens with a dense numpy array. I therefore used csr_matrix from scipy.sparse, together with some additional optimizations for MultiLabelBinarizer in particular, to be able to store all the preprocessed features. More info in the code below.

    * **Created custom object for Date column preprocessing**
    * **Created wrappers for the other preprocessors so that they always return sparse csr matrices**. OneHotEncoder already supports sparse output. The StandardScaler in date_pipeline does not need a wrapper, since its input is already a csr matrix.
    * **Created pipelines for each preprocessor and a general pipeline to combine everything together, using ColumnTransformer** 



In [None]:

# Date preprocessor
class DateTransformer(BaseEstimator, TransformerMixin):
    """Encode one datetime column as whole days before a reference date.

    The reference date is the latest date seen by ``fit``. ``transform``
    returns an (n_rows, 1) CSR matrix of day counts.
    """

    def fit(self, X, y=None):
        """Remember the source column name and the latest date it contains."""
        # Only the first column of the incoming DataFrame is used
        source_column = X.columns[0]
        self.col = source_column
        self.column = f"{source_column}_days_ago"
        parsed_dates = pd.to_datetime(X[source_column])
        self.reference_date = parsed_dates.max()
        return self

    def transform(self, X):
        """Return the day difference to the reference date as a sparse column."""
        elapsed = self.reference_date - pd.to_datetime(X[self.col])
        day_counts = elapsed.dt.days.values
        return csr_matrix(day_counts.reshape(-1, 1))

    def get_feature_names_out(self, input_features=None):
        """Return the single output feature name, '<column>_days_ago'."""
        return np.array([self.column])
    

class MultiLabelWrapper(BaseEstimator, TransformerMixin):
    """Multi-hot encode list-valued columns into a single sparse CSR matrix.

    One ``MultiLabelBinarizer`` is fitted per input column to learn its
    classes. ``transform`` emits, for every column, one uint8 indicator column
    per learned class and stacks the per-column blocks horizontally. Cells that
    are not lists are treated as empty lists; labels not seen during ``fit``
    are ignored.
    """

    def __init__(self):
        self.encoders = {}
        self.feature_names = []

    def fit(self, X, y=None):
        """Learn the label vocabulary of every column of the DataFrame ``X``."""
        # Start from a clean state so a refit never keeps encoders from a previous fit
        self.encoders = {}
        self.feature_names = []
        for col in X.columns:
            mlb = MultiLabelBinarizer()
            # Fill missing with empty list for consistent fitting
            safe_col = X[col].apply(lambda x: x if isinstance(x, list) else [])
            mlb.fit(safe_col)
            self.encoders[col] = mlb
            self.feature_names.extend(f"{col}__{cls}" for cls in mlb.classes_)
        return self

    def transform(self, X):
        """Return the stacked indicator matrix (CSR, dtype uint8) for ``X``."""
        blocks = []
        n_rows = len(X)
        for col in X.columns:
            classes = self.encoders[col].classes_
            class_index = {cls: j for j, cls in enumerate(classes)}

            # Collect the coordinates of all non-zero cells first and build the
            # sparse block once, instead of assigning cells one by one.
            row_ids = []
            col_ids = []
            for i, labels in enumerate(X[col]):
                # Handle missing or malformed entries
                if not isinstance(labels, list):
                    continue
                # A set keeps repeated labels in one row from being counted twice
                hits = {class_index[label] for label in labels if label in class_index}
                row_ids.extend([i] * len(hits))
                col_ids.extend(hits)

            block = csr_matrix(
                (
                    np.ones(len(row_ids), dtype=np.uint8),
                    (np.asarray(row_ids, dtype=np.int64), np.asarray(col_ids, dtype=np.int64)),
                ),
                shape=(n_rows, len(classes)),
                dtype=np.uint8,
            )
            block.sort_indices()
            blocks.append(block)
        return hstack(blocks, format='csr')

    def get_feature_names_out(self, input_features=None):
        """Return the '<column>__<class>' names collected during ``fit``."""
        return np.array(self.feature_names)


class TargetEncoderWrapper(BaseEstimator, TransformerMixin):
    """Target-encode each column separately and return a sparse CSR matrix.

    ``fit`` trains one ``TargetEncoder`` per column on all rows it is given;
    ``transform`` applies those encoders and is meant for unseen data.
    ``fit_transform`` additionally encodes the rows it is fitted on
    out-of-fold, so that a row's encoded value is produced by an encoder that
    was not fitted on that row's target.

    Parameters
    ----------
    n_splits : int, default 5
        Number of folds used for the out-of-fold encoding in ``fit_transform``.
    random_state : int, default 42
        Seed of the shuffled ``KFold``.
    """

    def __init__(self, n_splits=5, random_state=42):
        self.n_splits = n_splits
        self.random_state = random_state
        self.encoders = {}
        self.feature_names = []

    def fit(self, X, y):
        """Fit one encoder per column of ``X`` against the target ``y``."""
        # Reset state so a refit never keeps encoders from a previous fit
        self.encoders = {}
        self.feature_names = []
        self.global_means = {}
        # The fallback value is the same for every column, so compute it once
        target_mean = y.mean()

        for col in X.columns:
            te = TargetEncoder(cols=[col])
            # NOTE(review): no `cv=` argument is passed here; TargetEncoder.fit
            # appears to swallow unknown keyword arguments without using them -
            # confirm against the installed category_encoders version.
            te.fit(X[[col]], y)
            self.encoders[col] = te
            self.global_means[col] = target_mean
            self.feature_names.append(f'{col}_target_encoded')
        return self

    def transform(self, X):
        """Encode ``X`` with the fitted encoders; NaNs become the global target mean."""
        matrices = []
        for col in X.columns:
            te = self.encoders[col]
            df_encoded = te.transform(X[[col]])
            arr = df_encoded[col].fillna(self.global_means[col]).values.reshape(-1, 1)
            matrices.append(csr_matrix(arr))
        return hstack(matrices, format='csr')

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on all rows, then return out-of-fold encodings of those same rows.

        Falls back to the plain ``transform`` output when there are fewer rows
        than folds.
        """
        self.fit(X, y)
        if len(X) < self.n_splits:
            return self.transform(X)

        # Positional indexing is used below, so give y the same index as X
        y_series = y if isinstance(y, pd.Series) else pd.Series(np.asarray(y), index=X.index)
        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)

        matrices = []
        for col in X.columns:
            fallback = self.global_means[col]
            out_of_fold = np.full(len(X), fallback, dtype=float)
            for fit_idx, encode_idx in kf.split(X):
                fold_encoder = TargetEncoder(cols=[col])
                fold_encoder.fit(X.iloc[fit_idx][[col]], y_series.iloc[fit_idx])
                encoded = fold_encoder.transform(X.iloc[encode_idx][[col]])[col]
                out_of_fold[encode_idx] = encoded.fillna(fallback).to_numpy()
            matrices.append(csr_matrix(out_of_fold.reshape(-1, 1)))
        return hstack(matrices, format='csr')

    def get_feature_names_out(self, input_features=None):
        """Return the '<column>_target_encoded' names collected during ``fit``."""
        return np.array(self.feature_names)
    
    
class FrequencyEncoderWrapper(BaseEstimator, TransformerMixin):
    """Replace every category by its relative frequency observed during ``fit``.

    The output is a CSR matrix with one column per input column; values that
    have no learned frequency are encoded as 0.
    """

    def __init__(self):
        self.freq_maps = {}
        self.feature_names = []

    def fit(self, X, y=None):
        """Store a value -> relative frequency mapping for each column of ``X``."""
        self.freq_maps = {
            col: X[col].value_counts(normalize=True).to_dict()
            for col in X.columns
        }
        self.feature_names = [f"{col}_freq" for col in X.columns]
        return self

    def transform(self, X):
        """Look up the learned frequencies and stack them into one sparse matrix."""
        encoded_columns = []
        for col in X.columns:
            shares = X[col].map(self.freq_maps[col]).fillna(0)
            encoded_columns.append(csr_matrix(shares.values.reshape(-1, 1)))
        return hstack(encoded_columns, format='csr')

    def get_feature_names_out(self, input_features=None):
        """Return the '<column>_freq' names built during ``fit``."""
        return np.array(self.feature_names)


class StandardScalerWrapper(BaseEstimator, TransformerMixin):
    """``StandardScaler`` without centering whose output is always sparse.

    The wrapped scaler is created with ``with_mean=False``; a dense result is
    converted to a CSR matrix before being returned.
    """

    def __init__(self):
        self.scaler = StandardScaler(with_mean=False)
        self.feature_names = []

    def fit(self, X, y=None):
        """Record the column names of the DataFrame ``X`` and fit the scaler."""
        self.feature_names = X.columns.tolist()
        self.scaler.fit(X)
        return self

    def transform(self, X):
        """Scale ``X`` and return the result as a sparse matrix."""
        scaled = self.scaler.transform(X)
        # Sparse results pass through untouched; dense ones are wrapped in CSR
        return scaled if issparse(scaled) else csr_matrix(scaled)

    def get_feature_names_out(self, input_features=None):
        """Return the input column names recorded during ``fit``."""
        return np.array(self.feature_names)


In [None]:

## Preprocessing pipeline

# Numerical columns: scaling through the sparse-output wrapper
numerical_pipeline = Pipeline(steps=[('scaler', StandardScalerWrapper())])

# Low-cardinality categorical columns: one-hot encoding with sparse output
categorical_pipeline = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))]
)

# List-valued categorical columns: multilabel binarization
multilabel_pipeline = Pipeline(steps=[('multilabel', MultiLabelWrapper())])

# Country: target encoding
target_pipeline = Pipeline(steps=[('target', TargetEncoderWrapper())])

# High-cardinality categorical columns: frequency encoding
# (i.e. WineryName, RegionName, UserID, WineID)
frequency_pipeline = Pipeline(steps=[('frequency', FrequencyEncoderWrapper())])

# Datetime column: custom date transformer followed by scaling
date_pipeline = Pipeline(
    steps=[
        ('date', DateTransformer()),
        ('scaler', StandardScaler(with_mean=False)),
    ]
)

# Preprocessor
# The remainder contains the RatingID column, which is needed neither for
# training nor for testing, so it is dropped
column_transformers = [
    ('numerical', numerical_pipeline, numerical_features),
    ('categorical', categorical_pipeline, categorical_features),
    ('multilabel', multilabel_pipeline, multilabel_features),
    ('target', target_pipeline, targetencoder_features),
    ('frequency', frequency_pipeline, frequency_features),
    ('date', date_pipeline, date_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers, remainder='drop')

preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])


In [None]:

# Separate the features from the target ('Rating') for every dataset

# The unsplit training data gets its own names, so the train/val split below
# does not overwrite its own inputs
X_train_full = train.drop(columns=['Rating'])
y_train_full = train['Rating']

X_test_uwarm_iwarm = test_uwarm_iwarm.drop(columns=['Rating'])
y_test_uwarm_iwarm = test_uwarm_iwarm['Rating']

X_test_uwarm_icold = test_uwarm_icold.drop(columns=['Rating'])
y_test_uwarm_icold = test_uwarm_icold['Rating']

X_test_ucold_iwarm = test_ucold_iwarm.drop(columns=['Rating'])
y_test_ucold_iwarm = test_ucold_iwarm['Rating']

X_test_ucold_icold = test_ucold_icold.drop(columns=['Rating'])
y_test_ucold_icold = test_ucold_icold['Rating']

# Train/val split for hyperparameter tuning (20% of the training rows held out)
# `train_test_split` is imported with the other imports at the top of the notebook
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42
)




* **Fit preprocessing pipeline on training data. We pass target variable there for the Target Encoder**
* **Transform train and test sets on a fitted preprocessor**


In [None]:

# Fit the preprocessing pipeline on the training split and encode it in the same
# call, which avoids a second full pass over the training rows
X_train_transformed = preprocessing_pipeline.fit_transform(X_train, y_train)

# Encode the remaining splits with the already fitted pipeline
X_val_transformed = preprocessing_pipeline.transform(X_val)
X_test_uwarm_iwarm_transformed = preprocessing_pipeline.transform(X_test_uwarm_iwarm)
X_test_uwarm_icold_transformed = preprocessing_pipeline.transform(X_test_uwarm_icold)
X_test_ucold_iwarm_transformed = preprocessing_pipeline.transform(X_test_ucold_iwarm)
X_test_ucold_icold_transformed = preprocessing_pipeline.transform(X_test_ucold_icold)

# Save feature names
feature_names = preprocessing_pipeline.get_feature_names_out()
# Every output column must have exactly one name
assert len(feature_names) == X_train_transformed.shape[1], "feature names do not match transformed columns"
# Check the size of feature names and transformed data features
print(f"Feature names size: {len(feature_names)}")
print(f"Transformed train data size: {X_train_transformed.shape[1]}")


In [None]:

# Make sure the output folder exists; writing into a missing folder would fail
preprocessed_dir = f'{base_path}\\preprocessed'
os.makedirs(preprocessed_dir, exist_ok=True)

# Save transformed data to npz
sparse_outputs = {
    'X_train_transformed': X_train_transformed,
    'X_val_transformed': X_val_transformed,
    'X_test_uwarm_iwarm_transformed': X_test_uwarm_iwarm_transformed,
    'X_test_uwarm_icold_transformed': X_test_uwarm_icold_transformed,
    'X_test_ucold_iwarm_transformed': X_test_ucold_iwarm_transformed,
    'X_test_ucold_icold_transformed': X_test_ucold_icold_transformed,
}
for name, matrix in sparse_outputs.items():
    save_npz(f'{preprocessed_dir}\\{name}.npz', matrix)

# Save target values to csv
target_outputs = {
    'y_train': y_train,
    'y_val': y_val,
    'y_test_uwarm_iwarm': y_test_uwarm_iwarm,
    'y_test_uwarm_icold': y_test_uwarm_icold,
    'y_test_ucold_iwarm': y_test_ucold_iwarm,
    'y_test_ucold_icold': y_test_ucold_icold,
}
for name, target in target_outputs.items():
    target.to_csv(f'{preprocessed_dir}\\{name}.csv', index=False)

