In [None]:
from __future__ import annotations

from IPython.display import Markdown as md
from IPython.display import display, HTML

from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer, MinMaxScaler
from sklearn.linear_model import LogisticRegressionCV, LinearRegression
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import f1_score, accuracy_score, r2_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

from statsmodels.tsa.seasonal import seasonal_decompose, STL

from scipy.stats import chi2_contingency

from typing import Literal, Union, List, Optional, Sequence, Tuple, Callable, Any, Dict

from google.colab import drive

import numpy as np
import pandas as pd
import seaborn as sns
import pyarrow.parquet as pq
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go

import holidays

import sys
import gc
import os

In [None]:
drive.mount("/content/drive/")
%cd "drive/MyDrive/DABI 2"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/DABI 2


# Utility functions

In [1]:
#@title Function for printing logs, if needed
def _log_print(log: bool, content: str, content_type: Literal["header", "bullet", "text"], indent_layer: int = 0, newline: Union[None, Literal["start", "end"]] = None) -> None:
    if log:
        print(f"{chr(10) if newline == 'start' else ''}{''.join([chr(9)]) * indent_layer}{'- ' if content_type == 'bullet' else '--- ' if content_type == 'header' else ''}{content}{' ---' if content_type == 'header' else ''}{chr(10) if newline == 'end' else ''}")

NameError: name 'Literal' is not defined

In [None]:
#@title Function for chunking denormalized data (helps processing said data without frying Google Colab's RAM)
def process_data_by_chunks(  #Aufteilung der Daten, um Arbeitsspeicher zu schonen
        orders_path: str,
        orders_products_path: str,
        output_chunk_dir: str,
        percentage_per_chunk: float
) -> None:
    os.makedirs(output_chunk_dir, exist_ok=True) #Zielordenr anlegen

    orders_df_for_ids = pd.read_parquet("orders.parquet", columns=['user_id'])
    unique_user_ids = sorted(orders_df_for_ids['user_id'].unique())

    del orders_df_for_ids
    gc.collect()

    print(f"Found {len(unique_user_ids)} unique `user_ids` in total.")

    num_users = len(unique_user_ids) #Nutzer IDs in gleich große Chunks aufteilen
    users_per_chunk = max(1, int(num_users * percentage_per_chunk))
    user_id_chunks_sets = []

    for i in range(0, num_users, users_per_chunk):
        user_id_chunks_sets.append(set(unique_user_ids[i:i + users_per_chunk]))#Jede Teilmenge gleich als Set ablegen

    print(f"Define {len(user_id_chunks_sets)} user chunks with approx. {percentage_per_chunk * 100}% of users per chunk.")

    orders_products_read_chunk_size = 500000 #Chunk größe

    parquet_orders_reader = pq.ParquetFile(orders_path)

    for i, target_user_ids_set in enumerate(user_id_chunks_sets):
        current_chunk_number = i + 1
        print(f"\n--- Computing user chunk {current_chunk_number}/{len(user_id_chunks_sets)} ---")

        chunk_orders_data = []
        num_orders_row_groups = parquet_orders_reader.num_row_groups

        for rg_idx in range(num_orders_row_groups):
            rg_table = parquet_orders_reader.read_row_group(rg_idx, columns=["user_id", "order_id", "order_date"])
            rg_df = rg_table.to_pandas()

            filtered_orders_chunk = rg_df[rg_df.user_id.isin(target_user_ids_set)]
            if not filtered_orders_chunk.empty:
                chunk_orders_data.append(filtered_orders_chunk)

            del rg_df, rg_table
            gc.collect()

        if not chunk_orders_data:
            print(f"Could not find `orders` data for user chunk {current_chunk_number}. Skipping.")
            continue

        current_orders_df = pd.concat(chunk_orders_data, ignore_index=True)

        del chunk_orders_data
        gc.collect()

        print(f"\t- No. loaded `orders` for this chunk: {len(current_orders_df)} rows.")

        target_order_ids_for_chunk = set(current_orders_df.order_id.unique())

        chunk_orders_products_data = []
        for op_chunk_pd in pd.read_csv(orders_products_path, chunksize=orders_products_read_chunk_size):
            filtered_op_chunk = op_chunk_pd[op_chunk_pd.order_id.isin(target_order_ids_for_chunk)]

            if not filtered_op_chunk.empty:
                chunk_orders_products_data.append(filtered_op_chunk)

            del op_chunk_pd
            gc.collect()

        final_merged_chunk_df = None

        if chunk_orders_products_data:
            current_orders_products_df = pd.concat(chunk_orders_products_data, ignore_index=True)

            del chunk_orders_products_data
            gc.collect()

            print(f"  - No. loaded `orders_products` for this chunk: {len(current_orders_products_df)} rows.")

            print(f"  - Merging `orders` ({len(current_orders_df)} rows) with `orders_products` ({len(current_orders_products_df)} rows)...")
            final_merged_chunk_df = current_orders_df.merge(current_orders_products_df, on="order_id")

            del current_orders_products_df
            gc.collect()
        else:
            print(f"  - No 'orders_products' data for user chunk {current_chunk_number} has been found. Procceding with `orders` data.")
            final_merged_chunk_df = current_orders_df

        output_filepath = os.path.join(output_chunk_dir, f"{current_chunk_number}_orders_products_user_chunk.parquet")
        final_merged_chunk_df.to_parquet(output_filepath, index=False)
        print(f"  - Chunk {current_chunk_number} stored in {output_filepath}")

        del final_merged_chunk_df
        del current_orders_df
        gc.collect()

        print(f"  - Shared location for chunk {current_chunk_number}.")

    print("\nDone! All user chunks have been computed and stored.")

# Initial data load and chunking

In [None]:
process_data_by_chunks("orders.parquet", "order_products_denormalized.csv", "chunks", 0.1)

In [None]:
orders = pd.read_parquet("orders.parquet")
tip_temp = pd.read_csv("tip_testdaten_template_V2.csv", usecols=["order_id", "tip"])
tips = pd.read_csv("tips_public.csv", usecols=["order_id", "tip"])

In [None]:
tips["tip"] = tips.tip.astype(int)
orders_tips = orders.merge(tips).sort_values(["user_id", "order_date"])

# Train an autoregressive classifier

**Current objectives:**
- Create lag features
- Identify an appropriate splitting method (training/testing)
- Identify a suitable classifier

---

Creating the lag features (the **auto** part of autoregressive) is done as easily as said. We must, however, keep one very important thing in mind - not only are we provided with one timeseries, but rather one timeseries for every single user within the dataset. Meaning, creating the lag features requires one extra step. We must shift the tip data for each user by the number of lags we require, which is done by using a simple `groupby` for each lag.

Finding an appropriate way to split our data would not give us a hard time either. Randomly *train-test-*splitting our data will most likely lead to some unwanted behavior from our model. Thus, we chose the `TimeSeriesSplit`, which keeps the training data temporally earlier than the test data.

In general, probably most classifiers would work here, as we are only dealing with a binary classification. For the sake of simplicity, we have settled for `LogisticRegressionCV`.

In [None]:
#@title First function to train an AR(n) model
def train_ar(X: pd.DataFrame, lags: int, log_print: bool = True) -> float:
    """
    Trains and evaluates an Autoregressive (AR) model to predict users' tip probability
    based on historical tipping behavior.

    This function prepares the input data by creating lagged tip features,
    encodes user IDs, and performs time-series cross-validation to
    robustly evaluate model performance.

    Args:
        X (pd.DataFrame): Input DataFrame containing 'order_date', 'user_id', and 'tip' data.
                          'tip' is expected to be a binary (0 or 1) target variable.
        lags (int): The number of historical orders (lags) to include as features
                    in the autoregressive model (e.g., 1 for AR(1), 2 for AR(2)).

    Returns:
        float: The mean accuracy across all cross-validation folds,
               indicating the model's performance in predicting tip probability.
    """
    _log_print(log_print, f"Prepare training for AR({lags})", "header")
    # Create a copy of the DataFrame and select only the necessary columns
    # to avoid modifying the original DataFrame and reduce memory footprint.
    X_processed = X.copy()[["order_date", "user_id", "tip"]]

    # Sort data by user and then by order date to ensure correct lagging.
    X_processed.sort_values(["order_date", "user_id"], inplace=True)
    # Set 'order_date' as the index for time-series operations,
    # though TimeSeriesSplit primarily relies on positional indexing.
    X_processed.set_index("order_date", inplace=True)

    _log_print(log_print, "Begin preprocessing", "header", 1)
    # Generate shifted tip features. For each lag, create a new column
    # showing the tip status from a previous order for the same user.
    for lag in range(1, lags + 1):
        X_processed[f"tip_t-{lag}"] = X_processed.groupby("user_id").tip.shift(lag)

    # Drop rows that contain NaN values introduced by the shifting operation.
    X_processed.dropna(inplace=True)

    # Separate the target variable 'tip' from the features.
    y = X_processed.pop("tip")

    # Define the preprocessing steps for the features.
    preprocessor = ColumnTransformer(
        transformers=[
            ("user_encoder", OneHotEncoder(handle_unknown="ignore"), ["user_id"])
        ],
        remainder="passthrough"
    )

    # Initialize TimeSeriesSplit for robust cross-validation on time-series data.
    # This ensures that validation data always comes *after* training data chronologically,
    # preventing data leakage from the future.
    tscv = TimeSeriesSplit(n_splits=5)
    fold_scores = [] # List to store accuracy for each fold.

    # Iterate through each fold generated by TimeSeriesSplit.
    for i, (train_idx, test_idx) in enumerate(tscv.split(X_processed)):
        _log_print(log_print, f"Fold {i + 1}", "header", 1)

        # Split data into training and test sets based on TimeSeriesSplit indices.
        X_train, X_test = X_processed.iloc[train_idx], X_processed.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Fit the preprocessor on the training data and transform both train/test sets.
        preprocessor.fit(X_train)
        X_train_transformed = preprocessor.transform(X_train)
        X_test_transformed = preprocessor.transform(X_test)

        _log_print(log_print, "Training", "bullet", 2)

        # Initialize and train the Logistic Regression model with built-in cross-validation.
        # 'cv=5' for internal CV, 'n_jobs=-1' to use all available CPU cores,
        # 'max_iter=1000' for convergence.
        model = LogisticRegressionCV(cv=5, n_jobs=-1, max_iter=1000)
        model.fit(X_train_transformed, y_train)

        # Make predictions on the transformed test set.
        y_pred = model.predict(X_test_transformed)

        _log_print(log_print, "Testing", "bullet", 2)
        # Calculate accuracy for the current fold and append to the list.
        accuracy = accuracy_score(y_test, y_pred)
        fold_scores.append(accuracy)
        _log_print(log_print, f"Accuracy for fold {i + 1}: {accuracy:.4f}", "bullet", 2, newline="end")

    # Return the average accuracy across all cross-validation folds.
    return np.mean(fold_scores)

In [None]:
acc_ar_1 = train_ar(orders_tips, 1)
acc_ar_2 = train_ar(orders_tips, 2)
df_performance = pd.DataFrame({"Accuracy": [acc_ar_1, acc_ar_2]}, index=["AR(1)", "AR(2)"])

--- Prepare training for AR(1) ---
	--- Begin preprocessing ---
	--- Fold 1 ---
		- Training
		- Testing
		- Accuracy for fold 1: 0.7342

	--- Fold 2 ---
		- Training
		- Testing
		- Accuracy for fold 2: 0.7388

	--- Fold 3 ---
		- Training
		- Testing
		- Accuracy for fold 3: 0.7314

	--- Fold 4 ---
		- Training
		- Testing
		- Accuracy for fold 4: 0.7311

	--- Fold 5 ---
		- Training
		- Testing
		- Accuracy for fold 5: 0.7314

--- Prepare training for AR(2) ---
	--- Begin preprocessing ---
	--- Fold 1 ---
		- Training
		- Testing
		- Accuracy for fold 1: 0.7353

	--- Fold 2 ---
		- Training
		- Testing
		- Accuracy for fold 2: 0.7414

	--- Fold 3 ---
		- Training
		- Testing
		- Accuracy for fold 3: 0.7486

	--- Fold 4 ---
		- Training
		- Testing
		- Accuracy for fold 4: 0.7343

	--- Fold 5 ---
		- Training
		- Testing
		- Accuracy for fold 5: 0.7502



In [None]:
df_performance

Unnamed: 0,Accuracy
AR(1),0.733377
AR(2),0.741966


In [None]:
acc_ar_1 = 0.25489
acc_ar_2 = 1.36384

**Interpretation**

As it is apparent, the current AR model will correctly classify the user's tip behavior for their next order in about $ 73.34\% $ of all
cases, given the information about whether or not they tipped on their previous order. If this model, however, were to be given information regarding
the users' tip behaviors for their most recent two orders, it will classify about $ 74.2\% $ of all users' tip behavior for their next order correctly.
Now begs the question, how far we can go back in our customers' order history, in order to make our predictions more accurate. The answer lies within the behavioral
patterns of our customers' tipping behaviors. In order to understand these, we will analyze and interpret the (partial) autocorrelations of our data at hand.

In [None]:
#@title Function for compiling the ACF
def compute_acf_improved(df: pd.DataFrame, lags: int) -> pd.Series:
    """
    Computes the autocorrelation function (ACF) for the tips of each user.

    Args:
        df (pd.DataFrame): DataFrame containing at least the colums 'tip' and
                            'user_id'.
        lags (int): The number of historical orders (lags) to include as features
                    in the autoregressive model.

    Returns:
        pd.Series: Series containing the autocorrelation coefficients
    """
    df_copy = df.copy()

    # Generate shifted features
    for lag_val in range(1, lags + 1):
        df_copy[f"tip_t-{lag_val}"] = df_copy.groupby("user_id").tip.shift(lag_val)

    df_copy.dropna(inplace=True)

    # Only select tip column and the shifted features for computation
    cols_to_correlate = ["tip"] + [f"tip_t-{lag_val}" for lag_val in range(1, lags + 1)]

    final_df = df_copy[df_copy.columns.intersection(cols_to_correlate)]

    # Compute correlation
    acf_results = final_df.corr()["tip"]

    # Drop row with index "tip" as it is redundant
    # Rename indexes to address data points via their corresponding lag
    return acf_results.drop("tip").rename(index=lambda x: int(x.split('-')[1]))

In [None]:
#@title Function for computing the PACF
def compute_pacf_improved(df: pd.DataFrame, lags: int) -> dict:
    """
    Compute the partial autocorrelation function (PACF) per user.
    Lags are created on users' base to avoid computation across
    "user limits".

    Args:
        df (pd.DataFrame): DataFrame containing at least the colums 'tip' and
                            'user_id'.
        lags (int): The number of historical orders (lags) to include as features
                    in the autoregressive model.

    Returns:
        dict: A dictionary containing the PACF coefficients
    """
    df_copy = df.copy()
    df_copy.sort_values(["user_id", "order_date"], inplace=True)

    # Generate shifted features
    for l in range(1, lags + 1):
        df_copy[f"tip_t-{l}"] = df_copy.groupby("user_id").tip.shift(l)

    df_copy.dropna(inplace=True)

    pacf_vals = {}

    for lag in range(1, lags + 1):
        if lag == 1:
            # PACF(1) = ACF(1)
            corr_val = df_copy["tip"].corr(df_copy['tip_t-1'])
            if not np.isnan(corr_val):
                pacf_vals[lag] = corr_val
        else:
            # For PACF(k) regress Y_t on Y_{t-1}, ..., Y_{t-k+1}
            # and Y_{t-k} on Y_{t-1}, ..., Y_{t-k+1}
            # Then, correlate residuals
            X_prime_cols = [f"tip_t-{l}" for l in range(1, lag)]

            required_cols = ["tip", f"tip_t-{lag}"] + X_prime_cols

            # Select relevant columns only
            subset_for_lag = df_copy[df_copy.columns.intersection(required_cols)]

            # Sicherstellen, dass subset_for_lag nicht leer ist nach intersection
            if subset_for_lag.empty or len(subset_for_lag) < 2:  # Mindestens 2 Punkte für Korrelation
                pacf_vals[lag] = np.nan  # Keine Berechnung möglich
                continue

            # Regression von Y_t auf Y_{t-1}, ..., Y_{t-k+1}
            model_y = LinearRegression()
            model_y.fit(subset_for_lag[X_prime_cols], subset_for_lag["tip"])
            residuals_y = subset_for_lag["tip"] - model_y.predict(subset_for_lag[X_prime_cols])

            # Regression von Y_{t-k} auf Y_{t-1}, ..., Y_{t-k+1}
            model_xk = LinearRegression()
            model_xk.fit(subset_for_lag[X_prime_cols], subset_for_lag[f"tip_t-{lag}"])
            residuals_xk = subset_for_lag[f"tip_t-{lag}"] - model_xk.predict(subset_for_lag[X_prime_cols])

            # Korrelation der Residuen
            # Muss nan-Werte handhaben, falls sie durch die Regression entstehen
            # np.corrcoef handhabt NaN, gibt NaN zurück, wenn alle nan sind
            correlation = np.corrcoef(residuals_y, residuals_xk)[0, 1]
            if not np.isnan(correlation):
                pacf_vals[lag] = correlation
            else:
                pacf_vals[lag] = np.nan  # Falls Korrelation NaN ist

    return pacf_vals

As previously mentioned, we'll make use of the autocorrelation function (ACF) and the partial autocorrelation function (PACF), in order to understand
behavioral patterns in tipping across our customers. These metrics help us analyze how much past events (previous tips) influence current tipping behavior.

---
### What the functions do
1. `compute_acf`: This function calculates the raw correlations between the current tip value (`tip_t`) and its lagged versions (e.g., `tip_t-1`, `tip_t-2`,
$\dots$). It does so separately for each user and then combines the results to compute the overall correlation matrix.
- The ACF answers:
>"How much does tipping today relate to tipping $1$, $2$, $\dots$, $k$ steps ago?"
2. `compute_pacf`: This function calculates the direct effect of a previous tip
(e.g., `tip_t-k`) on the current tip (`tip_t`) while controlling for the intermediate lags (`tip_t-1` to `tip_t-k+1`).
- For lag $1$, it simply computes the direct correlation between `tip` and `tip_t-1` (i.e. the ACF).
- For higher lags, it uses a linear regression model to isolate the residual part of the current tip and then calculates how much that residual still correlates with `tip_t-k`.
- PACF answers:
>"Does tipping two (or more) steps ago have a direct influence on current tipping - beyond what can be explained by more recent behavior?"

---
### Why this matters
- ACF and PACF give us insight into behavioral dependencies over time:
    - A high lag $1$ PACF would suggest habitual tipping behavior (e.g., users who tipped last time tend to tip again).
    - A low or negative lag $2+$ PACF might suggest reacionary behavior or that tipping decisions are not strongly dependent on older history.
- These insights help us:
    - Better model user behavior in prediction tasks
    - Understand the memory effect in customer actions
    - Identify opportunities for intervention (e.g., personalized nudges for users who previously tipped)

In [None]:
acf = compute_acf_improved(orders_tips, 10)
pacf = compute_pacf_improved(orders_tips, 10)

In [None]:
pacf

{1: np.float64(0.4589361534314325),
 2: np.float64(0.2336097673439864),
 3: np.float64(0.17548776559700544),
 4: np.float64(0.1440740969825668),
 5: np.float64(0.1191018616551626),
 6: np.float64(0.10383451070207256),
 7: np.float64(0.08997403679081724),
 8: np.float64(0.08224974773295454),
 9: np.float64(0.07258165082121364),
 10: np.float64(0.06694395205541036)}

In [None]:
df_acf = pd.DataFrame({"Lag": acf.index, "ACF": acf.values}).iloc[1:]

In [None]:
#@title Function for plotting the (P)ACF
def plot_corr(df: pd.DataFrame, func: Literal["ACF", "PACF"], title: str) -> None:
    fig = go.Figure()

    # 1. Vertikale Linien
    for i, row in df.iterrows():
        fig.add_trace(go.Scatter(
            x=[row["Lag"], row["Lag"]],
            y=[0, row[func]],
            mode="lines",
            line=dict(color='blue', width=1),
            showlegend=False
        ))

    # 2. Punkte
    fig.add_trace(go.Scatter(
        x=df["Lag"],
        y=df[func],
        mode="markers",
        marker=dict(size=8, color='blue'),
        name="Coefficient"
    ))

    # Layout
    fig.update_layout(
        title=title,
        xaxis_title="Lag",
        xaxis_type="category",
        yaxis_title=func,
        template="plotly_dark"
    )

    fig.show()
    fig.write_html(f"{func.lower()}.html")

In [None]:
plot_corr(df_acf, "ACF", "Autocorrelation Function for 10 lags")

In [None]:
plot_corr(pd.DataFrame({"Lag": pacf.keys(), "PACF": pacf.values()}), "PACF", "Partial Autocorrelation Function for 10 lags")

In [None]:
acc_ar_3 = train_ar(orders_tips, 3)
acc_ar_4 = train_ar(orders_tips, 4)

df_performance.loc["AR(3)"] = acc_ar_3
df_performance.loc["AR(4)"] = acc_ar_4

In [None]:
df_performance

Unnamed: 0,Accuracy
AR(1),0.733377
AR(2),0.741966
AR(3),0.738908
AR(4),0.742509


In [None]:
display(df_to_table(df_performance, round_floats=4))

md(
    f"**Interpretation**\
    \nConsidering that our current main objective is to improve an AR model, our results are rather untypical for an AR-process.\
    \nThe ACF of an AR process usually shows a cutoff, which we don't seem to have in this case. Typically, we'd expect an exponential\
    or a geometrical decrease of the correlation coefficients. Instead, we are presented a persistant, high correlation across all lags.\
    This might indicate that we are could be dealing with trend-based data.\
    \nThe PACF plot is what helps us determine which order might be the best fit for our AR model. The most notable cutoff is occurring\
    for a lag of $2$, however, the lags $3$ and $4$ might be a good fit for our model, too. Any other lag beyond $4$ doesn't seem to\
    be providing plenty of additional information.\
    \n\n---\
    \nAccuracy of $AR(3)\colon{(acc_ar_3 * 100):.2f}\%$; Accuracy of $AR(4)\colon{(acc_ar_4 * 100):.2f}\%$\
    \nThus, an AR({ {acc_ar_3: 3, acc_ar_4: 4}[max(acc_ar_3, acc_ar_4)]}) model seems to be the best choice, at least for now."
)

Unnamed: 0,Accuracy
AR(1),0.7334
AR(2),0.742
AR(3),0.7389
AR(4),0.7425


**Interpretation**    
Considering that our current main objective is to improve an AR model, our results are rather untypical for an AR-process.    
The ACF of an AR process usually shows a cutoff, which we don't seem to have in this case. Typically, we'd expect an exponential    or a geometrical decrease of the correlation coefficients. Instead, we are presented a persistant, high correlation across all lags.    This might indicate that we are could be dealing with trend-based data.    
The PACF plot is what helps us determine which order might be the best fit for our AR model. The most notable cutoff is occurring    for a lag of $2$, however, the lags $3$ and $4$ might be a good fit for our model, too. Any other lag beyond $4$ doesn't seem to    be providing plenty of additional information.    

---    
Accuracy of $AR(3)\colon73.89\%$; Accuracy of $AR(4)\colon74.25\%$    
Thus, an AR(4) model seems to be the best choice, at least for now.

## Trend and seasonalities

First and foremost, we need to find out whether the data contains trend and seasonalities.

In [None]:
otc = orders_tips.copy()
otc.sort_values("order_date", inplace=True)

In [None]:
otc["date"] = pd.to_datetime(otc.order_date.dt.date)  # Extraction of date from order_date

otc["day_name"] = otc.order_date.dt.day_name()        # Extraction of Dayname(Day of the week)
otc.day_name = pd.Categorical(otc.day_name, categories=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], ordered=True)

otc["calendar_week"] = otc.order_date.dt.year.astype(str) + "-W" + otc.order_date.dt.isocalendar().week.astype(str).str.zfill(2)  # Extraction of week of the year in combination with year (YYYY-Www)
otc["hour"] = otc.order_date.dt.hour  # Extraction of hour of day

Bar plot with average tip rate per weekday

In [None]:
px.bar(
    otc.groupby("day_name", observed=False).tip.mean(),
    y="tip",
    labels={"day_name": "Weekday", "tip": "Tip rate"},
    title="Tip rate per weekday",
    template="plotly_dark"
)

Bar plot with average tip rate per hour of day

In [None]:
px.bar(
    otc.groupby("hour").tip.mean(),
    y="tip",
    labels={"hour": "Hour", "tip": "Tip rate"},
    title="Tip rate per hour",
    template="plotly_dark"
)

Time Series plot with average tip rate trend over time

In [None]:
px.line(
    #Group by date, calculate the average tip per day and assign the corresponding weekday to each date
    otc.groupby("date", as_index=False).tip.mean().assign(day_name=pd.to_datetime(otc.date.drop_duplicates()).dt.day_name().values),
    x="date",
    y="tip",
    labels={"date": "Date", "tip": "Tip rate"},
    hover_data={"day_name": True},
    template="plotly_dark",
    title="Tip rate over time"
)

Time Series Line Plot by Calendar Week with average tip rate per week over time

In [None]:
px.line(
    otc.groupby("calendar_week", as_index=False).tip.mean(),
    x="calendar_week",
    y="tip",
    labels={"calendar_week": "Calendar week", "tip": "Tip rate"},
    title="Tip rate over time",
    template="plotly_dark"
)

Now, we have figured out a few things. For starters, the data definitely contains a weekly seasonality. Considering a potential horly seasonality does not seem like a bad idea either. Finally, there is also a trend, which becomes much more apparent when looking at the weekly tip rate over time, instead of the overall tip rate over time. To make sure we identified the trend as well as the seasonalities correctly, we are now going to find statistical proof for their existence within our data.

In [None]:
# Weekly seasonality
print("Weekly seasonality")
contingency_table_weekly = pd.crosstab(otc.day_name, otc.tip)               # test for correlation between tip behavior and weekday using chi² test
chi2_weekly, p_weekly, _, _ = chi2_contingency(contingency_table_weekly)

df_daily = otc.set_index("order_date").resample("D").tip.mean()             # test for correlation between tip behavior and weekday using autocorrelation with lag 7 for weekly patterns
print(f"- p-value: {p_weekly}\nAutocorrelation (lag=7): {df_daily.autocorr(lag=7):.2f}")

# Hourly seasonality
print("\nHourly seasonality")                                               # test for correlation between tip behavior and hour of day using chi² test
contingency_table_hourly = pd.crosstab(otc.hour, otc.tip)
chi2_hourly, p_hourly, _, _ = chi2_contingency(contingency_table_hourly)

df_hourly = otc.set_index("order_date").resample("h").tip.mean()            # test for correlation between tip behavior and hour of day using autocorrelation with lag 24 for daily patterns
print(f"- p-value: {p_hourly}\nAutocorrelation (lag=24): {df_hourly.autocorr(lag=24):.2f}")

Weekly seasonality
- p-value: 0.0
Autocorrelation (lag=7): 0.93

Hourly seasonality
- p-value: 0.0
Autocorrelation (lag=24): 0.23


**Interpretation**

In both cases, the $\chi^2$ test returns a *p*-value of roughly $0.0$, indicating a dependency which is very unlikely to be occurring by chance.

For the weekly seasonality, the AC coefficient is about equal $0.93$, which tells us, that the tip rate strongly depends on the tip rate from last week, which we can assume may be true for any point of time within the dataset, except for the earliest timestamps.

Looking at the hourly tip rate on the other hand, we receive an AC coefficient of roughly $0.23$, basically indicating the slight opposite of what an AC coefficient of, let's say $0.8$ or up, would indicate. While this does certainly not mean that there is no information to be gained at all through this, it actually means that the amount of information gained is not impressively high, in fact, an AC coefficient of $0.23$ means just about $ 5.3\% $ of variance explained.

Let's find out whether looking at the _"weekly-hourly"_ tip rate would make a difference

In [None]:
otc['weekday_hour'] = otc['order_date'].dt.strftime('%A_%H')     # combining weekday with hour of day as a new feature

contingency_table_weekly_hourly = pd.crosstab(otc.weekday_hour, otc.tip)
chi2_weekly_hourly, p_weekly_hourly, _, _ = chi2_contingency(contingency_table_weekly_hourly)

weekly_autocorrelation = df_hourly.autocorr(lag=168)            # Calculate autocorrelation with a lag of 168 hours (hours in 1 week)
print(f"- p-value: {p_weekly_hourly}\nAutocorrelation (lag=168): {(weekly_autocorrelation):.2f}")

- p-value: 0.0
Autocorrelation (lag=168): 0.39


While an AC coefficient of $ 0.39 $ does not seem that much better, it is still evident that the correlation is stronger and results in a variance explained of $ 15.21\% $. Certainly not an impressive percentage, but an improvement for sure.

resample of the df otc per hour and calculate the average tip per hour

In [None]:
otc.set_index("order_date").resample("h").tip.mean()

Unnamed: 0_level_0,tip
order_date,Unnamed: 1_level_1
2024-01-17 00:00:00,0.5
2024-01-17 01:00:00,
2024-01-17 02:00:00,
2024-01-17 03:00:00,
2024-01-17 04:00:00,
...,...
2025-04-18 15:00:00,1.0
2025-04-18 16:00:00,
2025-04-18 17:00:00,
2025-04-18 18:00:00,0.0


In [None]:
df = otc.groupby("date", as_index=False).tip.mean().sort_values("date").set_index("date").sort_index()

delete dataframe and free up memory

In [None]:
del otc
gc.collect()

9879

## Expanding the model

Now, it would be about time to actually make some use of the intel we just gained. To sum things up, we found out that the timeseries we are working with contains a trend and seasonalities. The issue with this is that this timeseries is not stationary, which can cause any to misunderstand the training data, as it will, most likely, learn incorrect patterns, since properties like **variance** or **mean** do not remain constant over time.

While there are a number of ways to deal with trends and seasonalities, we have settled for the following techniques:

- Trend: Encode the temporal progression directly as a numerical variable.
- Seasonalities: Encode the cyclical patterns using sin/cos transformation.

In [None]:
class SinCosTransformer(BaseEstimator, TransformerMixin):
    """
    A scikit-learn compatible transformer that converts numerical features
    into their sine and cosine components. This is particularly useful for
    capturing cyclical patterns (e.g., time of day, day of week) in data
    where the absolute value is less important than its position within a cycle.

    For each input feature, it generates two new features:
    - `original_feature_name_sin`: The sine transformation of the feature.
    - `original_feature_name_cos`: The cosine transformation of the feature.

    The transformation is calculated as:
    sin_component = sin((2 * pi * value) / period)
    cos_component = cos((2 * pi * value) / period)

    Attributes:
        period (int): The length of the cycle for the data. For example, 24 for
                      hours in a day, 7 for days in a week.
    """
    def __init__(self, period: int) -> None:
        """
        Initializes the SinCosTransformer.

        Args:
            period (int): The cycle length used for the sine and cosine transformations.
        """
        self.period = period

    def fit(self, X: pd.DataFrame, y: Optional[Union[pd.Series, pd.DataFrame]] = None) -> "SinCosTransformer":
        """
        This method is a placeholder and does not perform any fitting.
        It simply returns the transformer instance itself, as the transformation
        is stateless and only depends on the 'period' attribute.

        Args:
            X (pd.DataFrame): The input data. Not used in fit, but required for
                              scikit-learn API compatibility.
            y (Optional[Union[pd.Series, pd.DataFrame]]): Target variable.
                                                            Not used.

        Returns:
            SinCosTransformer: The instance of the transformer.
        """
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transforms the input features by calculating their sine and cosine components.
        For each numerical column in the input DataFrame, two new columns are created:
        one for the sine transformation and one for the cosine transformation.

        Args:
            X (pd.DataFrame | pd.Series | np.ndarray): The input data containing
                                                         numerical features to be transformed.
                                                         If a Series or NumPy array, it will
                                                         be converted to a DataFrame.

        Returns:
            pd.DataFrame: A new DataFrame containing only the newly generated
                          sine (`_sin`) and cosine (`_cos`) features.
                          Original features are dropped.

        Raises:
            TypeError: If the input `X` is not a pandas DataFrame, Series, or NumPy array.
        """
        # Ensure X is a DataFrame for consistent processing
        if isinstance(X, (pd.Series, np.ndarray)):
            # Convert Series or NumPy array to DataFrame, preserving column names if possible
            if isinstance(X, pd.Series):
                X_transformed = pd.DataFrame(X.copy())
            else: # np.ndarray
                # If it's a 1D array, treat it as a single column
                if X.ndim == 1:
                    X_transformed = pd.DataFrame(X, columns=['feature'])
                else:
                    X_transformed = pd.DataFrame(X)
        elif isinstance(X, pd.DataFrame):
            X_transformed = X.copy()
        else:
            raise TypeError("Input X must be a pandas DataFrame, Series, or NumPy array.")

        # Iterate over each column in the (potentially copied) DataFrame
        for col in X_transformed.columns:
            # Calculate the sine component for the current column
            # The formula (2 * pi * value) / period scales the values to
            # a range suitable for sine/cosine functions (0 to 2*pi)
            X_transformed[f"{col}_sin"] = np.sin((2 * np.pi * X_transformed[col]) / self.period)
            # Calculate the cosine component for the current column
            X_transformed[f"{col}_cos"] = np.cos((2 * np.pi * X_transformed[col]) / self.period)

        # Return only the newly created sine and cosine columns.
        # `filter(regex="_sin|_cos")` efficiently selects columns ending with these suffixes.
        return X_transformed.filter(regex="_sin|_cos")

In [None]:
def feature_preprocessing(df: pd.DataFrame, lags: int, min_date_global: pd.Timestamp = None, log_print: bool = True) -> Tuple[pd.DataFrame, ColumnTransformer]:
    df_copy = df.copy()

    _log_print(log_print, "Begin Feature Preprocessing", "header")

    if min_date_global is None:
        min_date_global = df_copy["order_date"].min()

    _log_print(log_print, "Generate temporal feature", "bullet", 1)

    df_copy["days_since_start"] = (df_copy["order_date"] - min_date_global).dt.days
    df_copy["weekday"] = df_copy["order_date"].dt.dayofweek

    df_copy.sort_values(by=["user_id", "order_date"], inplace=True)

    df_copy["is_target_nan"] = df_copy.tip.isna()

    _log_print(log_print, "Preparing Data for Shifting", "bullet", 1)
    if df_copy.tip.isna().any():
        df_copy["tip_for_shifting"] = df_copy.groupby("user_id").tip.ffill()
        df_copy["tip_for_shifting"] = df_copy.groupby("user_id").tip_for_shifting.bfill()
    else:
        df_copy["tip_for_shifting"] = df_copy["tip"]

    shifted_tip_features = []
    shifted_nan_indicators = []

    _log_print(log_print, "Generating shifted Features", "bullet", 1)
    for lag in range(1, lags + 1):
        lag_feature_name = f"tip_t-{lag}"
        nan_indicator_name = f"tip_t-{lag}_is_nan"

        df_copy[lag_feature_name] = df_copy.groupby("user_id").tip_for_shifting.shift(lag)
        df_copy[nan_indicator_name] = df_copy[lag_feature_name].isna().astype(int)

        df_copy[lag_feature_name] = df_copy[lag_feature_name].fillna(-1)

        shifted_tip_features.append(lag_feature_name)
        shifted_nan_indicators.append(nan_indicator_name)

    df_copy.drop(columns=["user_id"], inplace=True)

    numerical_features_passthrough = shifted_nan_indicators + shifted_tip_features + ["days_since_start"]

    _log_print(log_print, "Initializing preprocessor", "bullet", 1, "start")
    preprocessor = ColumnTransformer(
        transformers=[
            ("weekly", SinCosTransformer(period=7), ["weekday"]),
            ("nan_indicators", "passthrough", numerical_features_passthrough)
        ],
        remainder="drop"
    )

    _log_print(log_print, "Feature Preprocessing finished successfully!", "bullet", 1)

    return df_copy, preprocessor

In [None]:
def train_ar_extended(df_input: pd.DataFrame, lags: int, log_print: bool = True) -> Tuple[LogisticRegressionCV, ColumnTransformer, pd.Timestamp, float]:
    df_input = df_input.copy()

    min_date_global_training = df_input.order_date.min()

    df_with_all_features, _ = feature_preprocessing(df_input, lags, min_date_global=min_date_global_training)

    del _
    gc.collect()

    df_cleaned_for_training = df_with_all_features[~df_with_all_features["is_target_nan"]].copy()

    min_date_for_pred = df_with_all_features.order_date.min()

    del df_with_all_features
    gc.collect()

    df_cleaned_for_training.set_index("order_date", inplace=True)

    y = df_cleaned_for_training.pop("tip")

    numerical_features_passthrough = [f"tip_t-{lag}_is_nan" for lag in range(1, lags + 1)] + [f"tip_t-{lag}" for lag in range(1, lags + 1)] + ["days_since_start"]
    temporal_features = ["weekday"]

    X_processed = df_cleaned_for_training[numerical_features_passthrough + temporal_features]

    del df_cleaned_for_training
    gc.collect()

    fold_acc_scores = []
    tscv = TimeSeriesSplit(n_splits=5)
    for i, (train_idx, test_idx) in enumerate(tscv.split(X_processed)):
        _log_print(log_print, f"Current fold: {i + 1}", "header", newline="start")
        X_train, X_test = X_processed.iloc[train_idx], X_processed.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        del train_idx, test_idx
        gc.collect()

        _log_print(log_print, "Preprocessing", "bullet", 1)
        current_preprocessor = ColumnTransformer(
            transformers=[
                ("weekly", SinCosTransformer(period=7), ["weekday"]),
                ("nan_indicators", "passthrough", numerical_features_passthrough)
            ], remainder="drop"
        )
        current_preprocessor.fit(X_train)

        X_train_transformed = current_preprocessor.transform(X_train)
        X_test_transformed = current_preprocessor.transform(X_test)

        del current_preprocessor, X_train, X_test
        gc.collect()

        _log_print(log_print, "Training", "bullet", 1)
        current_model = LogisticRegressionCV(cv=5, n_jobs=-1, max_iter=1000)
        current_model.fit(X_train_transformed, y_train)

        del X_train_transformed, y_train
        gc.collect()

        _log_print(log_print, "Testing", "bullet", 1)
        y_pred = current_model.predict(X_test_transformed)
        accuracy = accuracy_score(y_test, y_pred)

        del X_test_transformed, y_pred, current_model, y_test
        gc.collect()

        fold_acc_scores.append(accuracy)

    del tscv
    gc.collect()

    acc_mean = np.mean(fold_acc_scores)

    _log_print(log_print, "Finally", "header", newline="start")
    _log_print(log_print, "Preprocessing", "bullet", 1)
    final_preprocessor_for_pred = ColumnTransformer(
        transformers=[
            ("weekly", SinCosTransformer(period=7), ["weekday"]),
            ("nan_indicators", "passthrough", numerical_features_passthrough)
        ], remainder="drop"
    )
    final_preprocessor_for_pred.fit(X_processed)

    X_processed_transformed = final_preprocessor_for_pred.transform(X_processed)

    del X_processed
    gc.collect()

    _log_print(log_print, "Training", "bullet", 1)
    final_model_for_pred = LogisticRegressionCV(cv=5, n_jobs=-1, max_iter=1000)
    final_model_for_pred.fit(X_processed_transformed, y)
    _log_print(log_print, "Model training finished successfully", "bullet", 1)

    del X_processed_transformed
    gc.collect()

    return final_model_for_pred, final_preprocessor_for_pred, min_date_for_pred, acc_mean

In [None]:
def make_predictions(
    data_frame: pd.DataFrame,
    trained_model: LogisticRegressionCV,
    trained_preprocessor: ColumnTransformer,
    lags: int,
    min_date_from_training: pd.Timestamp
) -> pd.DataFrame:
    """
    Generates tip probability predictions for new, unseen orders using a
    pre-trained model and preprocessor.

    This function applies the same feature engineering and preprocessing steps
    used during training to the new data, then uses the trained model to make predictions.
    It specifically targets orders where the tip value is missing (NaN),
    assuming these are the instances requiring prediction.

    Args:
        data_frame (pd.DataFrame): Input DataFrame containing order data for which
                                   predictions are to be made. This DataFrame should
                                   include 'order_id', 'user_id', 'order_date',
                                   and potentially other features used during training.
                                   It should also contain 'tip' column with NaNs for
                                   prediction targets.
        trained_model (LogisticRegressionCV): The machine learning model
                                              (e.g., LogisticRegressionCV) that has
                                              already been fitted on training data.
        trained_preprocessor (ColumnTransformer): The fitted ColumnTransformer
                                                  used during training to preprocess
                                                  the features. It must be consistent
                                                  with the features expected by the model.
        lags (int): The number of historical orders (lags) used as features
                    in the autoregressive components of the model. Must match
                    the 'lags' used during model training.
        min_date_from_training (pd.Timestamp): The global minimum order date
                                                used during feature generation in the
                                                training phase. This ensures 'days_since_start'
                                                is calculated consistently.

    Returns:
        pd.DataFrame: A DataFrame containing 'order_id' and the binary
                      'tip' prediction (True/False or 1/0) for the orders
                      that initially had missing tip values.

    Raises:
        ValueError: If any NaN values are found in the final predictions,
                    indicating a potential issue in preprocessing or model output.
    """
    df_for_prediction = data_frame.copy()

    data_frame_features, _ = feature_preprocessing(
        df_for_prediction,
        lags,
        min_date_global=min_date_from_training
    )

    categorical_features = ["user_id"]
    numerical_features_to_scale = [f"tip_t-{lag}" for lag in range(1, lags + 1)] + ["days_since_start"]
    numerical_features_passthrough = [f"tip_t-{lag}_is_nan" for lag in range(1, lags + 1)]
    temporal_features = ["weekday", "hour"]

    data_frame_features = data_frame_features[data_frame_features.is_target_nan == True]

    data_frame_features.drop(columns=["tip"], inplace=True)

    features_for_pred = data_frame_features[
        categorical_features +
        numerical_features_to_scale +
        numerical_features_passthrough +
        temporal_features
    ].dropna()

    X_transformed = trained_preprocessor.transform(features_for_pred)

    predictions = trained_model.predict(X_transformed)

    data_frame["prediction"] = np.nan

    data_frame.loc[features_for_pred.index, "prediction"] = predictions

    df_final = data_frame[data_frame.tip.isna()][["order_id", "prediction"]].reset_index(drop=True)

    if df_final.prediction.isna().values.any():
        raise ValueError("Predictions contain NaN values! Check preprocessing or input data completeness.")

    df_final["prediction"] = df_final.prediction.astype(bool)

    df_final.rename(columns={"prediction": "tip"}, inplace=True)

    return df_final

In [None]:
def load_from_chunks(columns: List[str]) -> pd.DataFrame:
    extracted = []
    for c in range(1, 12):
        df = pd.read_parquet(f"chunks/{c}_orders_products_user_chunk.parquet")
        extracted.append(df[columns])

        del df, c
        gc.collect()

    return pd.concat(extracted)

In [None]:
final_model, final_preprocessor, min_date, acc_mean =\
    train_ar_extended(orders_tips, lags=4)

--- Begin Feature Preprocessing ---
	- Generate temporal feature
	- Preparing Data for Shifting
	- Generating shifted Features

	- Initializing preprocessor
	- Feature Preprocessing finished successfully!

--- Current fold: 1 ---
	- Preprocessing
	- Training
	- Testing

--- Current fold: 2 ---
	- Preprocessing
	- Training
	- Testing

--- Current fold: 3 ---
	- Preprocessing
	- Training
	- Testing

--- Current fold: 4 ---
	- Preprocessing
	- Training
	- Testing

--- Current fold: 5 ---
	- Preprocessing
	- Training
	- Testing

--- Finally ---
	- Preprocessing
	- Training
	- Model training finished successfully


In [None]:
acc_mean

np.float64(0.7380094178082193)

In [None]:
tip_temp = pd.read_csv("tip_testdaten_template_V2.csv", usecols=["order_id", "tip"])

In [None]:
tip_temp_test = pd.concat([tips, tip_temp])
tip_temp_test = tip_temp_test.merge(orders)

In [None]:
tip_temp_test = tip_temp_test[tip_temp_test.user_id.isin(tip_temp_test[tip_temp_test.tip.isna()].user_id.unique())].sort_values(["user_id", "order_date"])

In [None]:
pred_df = make_predictions(tip_temp_test, final_model, final_preprocessor, 4, min_date)

---  Begin Feature Preprocessing ---
	- Generate temporal feature
	- Preparing Data for Shifting
	- Generating shifted Features

	- Initializing preprocessor
	- Feature Preprocessing finished successfully!


In [None]:
pred_df.to_csv("task_2g_pred.csv")

# Aufgabe 3

## Further interesting variables

Following up after modeling the trend and the seasonalities, we are now going to identify further variables that may provide some additional information regarding the users' tipping behaviors.



In [None]:
class DateConversionTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, min_date_global: pd.Timestamp = None, log_print: bool = True) -> None:
        self.min_date_global = min_date_global
        self.log_print = log_print

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> DateConversionTransformer:
        if self.min_date_global is None:
            self.min_date_global = pd.to_datetime(X["order_date"]).min()

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        _log_print(self.log_print, "Begin date conversion", "header")
        X = X.copy()
        if not pd.api.types.is_datetime64_any_dtype(X["order_date"]):
            X["order_date"] = pd.to_datetime(X["order_date"])
        X["days_since_start"] = (X["order_date"] - self.min_date_global).dt.days

        return X

In [None]:
class ClusterAssignmentTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, log_print: bool = True) -> None:
        self.log_print = log_print

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> ClusterAssignmentTransformer:
        user_dept_df = X.pivot_table(index="user_id", columns="tip", aggfunc="size", fill_value=0)

        scaler = StandardScaler()
        scaled = scaler.fit_transform(user_dept_df)

        kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
        self.cluster_mapping_ = pd.DataFrame({
            "user_id": user_dept_df.index,
            "cluster": kmeans.fit_predict(scaled)
        })

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        _log_print(self.log_print, "Begin cluster generation", "header")
        X = X.copy()
        X = X.merge(self.cluster_mapping_, how="left", on="user_id")
        X["cluster"] = X["cluster"].fillna(-1).astype(int).astype(str)

        return X

In [None]:
class OrganicProductsFlagTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, log_print: bool = True):
        self.log_print = log_print

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> OrganicProductsFlagTransformer:
        chunks = []
        for c in range(1, 12):
            df = pd.read_parquet(f"chunks/{c}_orders_products_user_chunk.parquet")
            df["order_contains_organic"] = df["product_name"].str.lower().str.contains("organic")
            chunks.append(df[["order_id", "order_contains_organic"]])

            del df
            gc.collect()

        self.organic_df_ = pd.concat(chunks).drop_duplicates("order_id")

        del chunks
        gc.collect()

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        _log_print(self.log_print, "Begin organic-products-flagging", "header")
        X = X.copy()
        X = X.merge(self.organic_df_, how="left", on="order_id")

        del self.organic_df_
        gc.collect()

        X["order_contains_organic"] = X["order_contains_organic"].fillna(False)

        return X

In [None]:
class OrderCountTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, log_print: bool = True) -> None:
        self.log_print = log_print

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        _log_print(self.log_print, "Counting orders per user", "header")
        X = X.copy()
        X = X.merge(X.groupby("user_id").agg(no_orders=("order_id", "nunique")), how="left", on="user_id")

        return X

In [None]:
class CartSizeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, log_print: bool = True) -> None:
        self.log_print = log_print

    def fit(self, X, y=None):
        chunks = []
        for c in range(1, 12):
            df = pd.read_parquet(f"chunks/{c}_orders_products_user_chunk.parquet")
            chunks.append(df[["order_id", "add_to_cart_order"]])

            del df
            gc.collect()

        self.cart_size_df_ = pd.concat(chunks)

        del chunks
        gc.collect()

        return self

    def transform(self, X):
        _log_print(self.log_print, "Getting cart size", "header")
        X = X.copy()

        self.cart_size_df_.sort_values(["order_id", "add_to_cart_order"], inplace=True)
        X = X.merge(
            self.cart_size_df_.groupby("order_id", as_index=False).agg(cart_size=("add_to_cart_order", "max")),
            on="order_id"
        )

        del self.cart_size_df_
        gc.collect()

        return X

In [None]:
class MeanNoProductsOrderedTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, log_print: bool = True) -> None:
        self.log_print = log_print

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        _log_print(self.log_print, "Making AVG cart size", "header")
        X = X.copy()

        X = X.merge(X.groupby("user_id", as_index=False).agg(avg_no_prod=("cart_size", "mean")), on="user_id")

        return X

In [None]:
class TemporalFeatureGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, min_date_global: pd.Timestamp, log_print: bool = True) -> None:
        self.min_date_global = min_date_global
        self.log_print = log_print

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> TemporalFeatureGenerator:
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        _log_print(self.log_print, "Begin generating temporal features", "header")
        X = X.copy()
        X["weekday"] = X["order_date"].dt.day_of_week
        X["hour"] = X["order_date"].dt.hour

        us_holidays = holidays.UnitedStates(years=X["order_date"].dt.year.unique())
        X["is_holiday"] = X["order_date"].dt.date.isin(us_holidays.keys())

        return X

In [None]:
class LagFeatureGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, lags: int, log_print: bool = True) -> None:
        self.lags = lags
        self.log_print = log_print

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> LagFeatureGenerator:
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        _log_print(self.log_print, "Begin generating lag features", "header")
        X = X.copy()

        X.sort_values(["user_id", "order_date"], inplace=True)
        X["is_target_nan"] = X["tip"].isna()

        if X["tip"].isna().any():
            X["tip_for_shifting"] = X.groupby("user_id")["tip"].ffill()
            X["tip_for_shifting"] = X.groupby("user_id")["tip_for_shifting"].bfill()
        else:
            X["tip_for_shifting"] = X["tip"]

        for lag in range(1, self.lags + 1):
            lag_col = f"tip_t-{lag}"
            nan_col = f"tip_t-{lag}_is_nan"
            X[lag_col] = X.groupby("user_id")["tip_for_shifting"].shift(lag)
            X[nan_col] = X[lag_col].isna().astype(int)

        return X

In [None]:
class OverallTipProbaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, log_print: bool = True):
        self.log_print = log_print

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        _log_print(self.log_print, "Begin overall tip", "header")
        X = X.copy()
        X["overall_tip_proba"] = X["tip"].mean() * 100

        return X

In [None]:
class TipProbaPerHourTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, log_print: bool = True):
        self.log_print = log_print

    def fit(self, X, y=None):
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        _log_print(self.log_print, "Begin tip proba per hour", "header")
        X = X.copy()
        X = X.merge(X.groupby("hour", as_index=False).agg(tip_proba_per_hour=("tip", "mean")), how="left", on="hour")

        X["tip_proba_per_hour"] = X["tip_proba_per_hour"] * 100

        X.sort_values("tip_proba_per_hour", ascending=False, inplace=True)

        return X

In [None]:
class TipProbaPerWeekdayTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, log_print: bool = True):
        self.log_print = log_print

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> TipProbaPerWeekdayTransformer:
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        _log_print(self.log_print, "Begin tip proba by weekday", "header")
        X = X.copy()
        X = X.merge(X.groupby("weekday", as_index=False).agg(tip_proba_per_weekday=("tip", "mean")), how="left", on="weekday")

        X["tip_proba_per_weekday"] = X["tip_proba_per_weekday"] * 100

        X.sort_values("tip_proba_per_weekday", ascending=False, inplace=True)

        return X

In [None]:
class TipProbaPerDepartmentTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, log_print):
        self.log_print = log_print

    def fit(self, X, y=None):
        chunks = []
        for c in range(1, 12):
            df = pd.read_parquet(f"chunks/{c}_orders_products_user_chunk.parquet")
            chunks.append(df[["order_id", "department"]])

            del df
            gc.collect()

        self.dept_df_ = pd.concat(chunks)

        del chunks
        gc.collect()

        return self

    def transform(self, X):
        _log_print(self.log_print, "Begin tip proba by department", "header")
        X = X.copy()
        X = X.merge(self.dept_df_)

        del self.dept_df_
        gc.collect()

        X = X.merge(X.groupby("department", as_index=False).agg(tip_proba_per_department=("tip", "mean")), how="left", on="department")

        X["tip_proba_per_department"] = X["tip_proba_per_department"] * 100

        X.sort_values("tip_proba_per_department", ascending=False, inplace=True)

        return X

In [None]:
def build_feature_pipeline(lags: int, min_date_global: pd.Timestamp = None, log_print: bool = True) -> Pipeline:
    categorical_features = ["order_contains_organic", "is_holiday", "cluster"]
    numerical_features_to_scale = ["avg_no_prod", "overall_tip_proba",
                                   "tip_proba_per_hour", "tip_proba_per_weekday",
                                   "tip_proba_per_department", "days_since_start"]
    numerical_features_to_minmax = ["no_orders", "cart_size"]
    shifted_features = [f"tip_t-{lag}" for lag in range(1, lags + 1)]
    nan_indicators = [f"tip_t-{lag}_is_nan" for lag in range(1, lags + 1)]

    preprocessor = ColumnTransformer([
        ("imputation", SimpleImputer(strategy="most_frequent"), shifted_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numerical_features_to_scale),
        ("no", MinMaxScaler(), numerical_features_to_minmax),
        ("weekly", SinCosTransformer(period=7), ["weekday"]),
        ("nan_indicators", "passthrough", nan_indicators),
    ], remainder="drop")

    pipeline = Pipeline([
        ("date_conversion", DateConversionTransformer(min_date_global=min_date_global, log_print=log_print)),
        ("cluster_generation", ClusterAssignmentTransformer(log_print=log_print)),
        ("lag_features", LagFeatureGenerator(lags=lags, log_print=log_print)),
        ("organic_flag", OrganicProductsFlagTransformer(log_print=log_print)),
        ("count_orders", OrderCountTransformer(log_print=log_print)),
        ("cart_size", CartSizeTransformer(log_print=log_print)),
        ("avg_no_prod", MeanNoProductsOrderedTransformer(log_print=log_print)),
        ("temporal_features", TemporalFeatureGenerator(min_date_global=min_date_global, log_print=log_print)),
        ("overall_tip_proba", OverallTipProbaTransformer(log_print=log_print)),
        ("tip_proba_per_hour", TipProbaPerHourTransformer(log_print=log_print)),
        ("tip_proba_per_weekday", TipProbaPerWeekdayTransformer(log_print=log_print)),
        ("tip_proba_per_department", TipProbaPerDepartmentTransformer(log_print=log_print)),
        ("column_preprocessor", preprocessor)
    ])

    return pipeline

In [None]:
def feature_engineering_only(df: pd.DataFrame, lags: int, min_date_global=None, log_print=True) -> pd.DataFrame:
    pipeline = build_feature_pipeline(lags, min_date_global, log_print=log_print)

    steps_except_last = pipeline.steps[:-1]

    for name, transformer in steps_except_last:
        df = transformer.fit_transform(df)

    return df

In [None]:
def train_ar_final(df_input: pd.DataFrame, lags: int, log_print: bool = True) -> Tuple[LogisticRegressionCV, ColumnTransformer, pd.Timestamp, float]:
    df_input = df_input.copy()

    min_date_global_training = df_input.order_date.min()

    df_with_all_features = feature_engineering_only(df_input, lags, min_date_global=min_date_global_training, log_print=log_print)

    df_cleaned_for_training = df_with_all_features[~df_with_all_features["is_target_nan"]].copy()

    min_date_for_pred = df_with_all_features.order_date.min()

    del df_with_all_features
    gc.collect()

    df_cleaned_for_training.set_index("order_date", inplace=True)

    y = df_cleaned_for_training.pop("tip")

    categorical_features = ["order_contains_organic", "is_holiday", "cluster"]
    numerical_features_to_scale = ["avg_no_prod", "overall_tip_proba",
                                   "tip_proba_per_hour", "tip_proba_per_weekday",
                                   "tip_proba_per_department"] + ["days_since_start"]
    numerical_features_to_minmax = ["no_orders", "cart_size"]
    numerical_features_passthrough = [f"tip_t-{lag}_is_nan" for lag in range(1, lags + 1)]
    shifted_features = [f"tip_t-{lag}" for lag in range(1, lags + 1)]
    temporal_features = ["weekday"]

    X_processed = df_cleaned_for_training[categorical_features + numerical_features_to_scale + numerical_features_passthrough + temporal_features + shifted_features + ["no_orders", "cart_size"]]

    del df_cleaned_for_training
    gc.collect()

    fold_acc_scores = []
    tscv = TimeSeriesSplit(n_splits=5)
    for i, (train_idx, test_idx) in enumerate(tscv.split(X_processed)):
        _log_print(log_print, f"Current fold: {i + 1}", "header", newline="start")
        X_train, X_test = X_processed.iloc[train_idx], X_processed.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        del train_idx, test_idx
        gc.collect()

        _log_print(log_print, "Preprocessing", "bullet", 1)
        current_preprocessor = ColumnTransformer(
            transformers=[
                ("imputation", SimpleImputer(strategy="most_frequent"), shifted_features),
                ("nan_indicators", "passthrough", numerical_features_passthrough),
                ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
                ("num", StandardScaler(), numerical_features_to_scale),
                ("no", MinMaxScaler(), numerical_features_to_minmax),
                ("weekly", SinCosTransformer(period=7), ["weekday"])
            ], remainder="drop"
        )
        current_preprocessor.fit(X_train)

        X_train_transformed = current_preprocessor.transform(X_train)
        X_test_transformed = current_preprocessor.transform(X_test)

        del current_preprocessor, X_train, X_test
        gc.collect()

        _log_print(log_print, "Training", "bullet", 1)
        current_model = LogisticRegressionCV(cv=5, n_jobs=-1, max_iter=1000)
        current_model.fit(X_train_transformed, y_train)

        del X_train_transformed, y_train
        gc.collect()

        _log_print(log_print, "Testing", "bullet", 1)
        y_pred = current_model.predict(X_test_transformed)
        accuracy = accuracy_score(y_test, y_pred)

        _log_print(log_print, f"Accuracy Fold {i + 1}: {accuracy}", "text", 1)

        del X_test_transformed, y_pred, current_model, y_test
        gc.collect()

        fold_acc_scores.append(accuracy)

    del tscv
    gc.collect()

    acc_mean = np.mean(fold_acc_scores)

    _log_print(log_print, "Finally", "header", newline="start")
    _log_print(log_print, "Preprocessing", "bullet", 1)
    final_preprocessor_for_pred = ColumnTransformer(
        transformers=[
            ("imputation", SimpleImputer(strategy="most_frequent"), shifted_features),
            ("nan_indicators", "passthrough", numerical_features_passthrough),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ("num", StandardScaler(), numerical_features_to_scale),
            ("no", MinMaxScaler(), numerical_features_to_minmax),
            ("weekly", SinCosTransformer(period=7), ["weekday"])
        ], remainder="drop"
    )
    final_preprocessor_for_pred.fit(X_processed)

    X_processed_transformed = final_preprocessor_for_pred.transform(X_processed)

    del X_processed
    gc.collect()

    _log_print(log_print, "Training", "bullet", 1)
    final_model_for_pred = LogisticRegressionCV(cv=5, n_jobs=-1, max_iter=1000)
    final_model_for_pred.fit(X_processed_transformed, y)
    _log_print(log_print, "Model training finished successfully", "bullet", 1)

    del X_processed_transformed
    gc.collect()

    return final_model_for_pred, final_preprocessor_for_pred, min_date_for_pred, acc_mean

In [None]:
final_model, final_preprocessor, min_date, acc_mean =\
    train_ar(orders_tips, lags=4)

--- Begin date conversion ---
--- Begin cluster generation ---
--- Begin generating lag features ---
--- Begin organic-products-flagging ---
--- Counting orders per user ---
--- Getting cart size ---
--- Making AVG cart size ---
--- Begin generating temporal features ---
--- Begin overall tip ---
--- Begin tip proba per hour ---
--- Begin tip proba by weekday ---
--- Begin tip proba by department ---

--- Current fold: 1 ---
	- Preprocessing
	- Training
	- Testing
	Accuracy Fold 1: 0.7532378035213362

--- Current fold: 2 ---
	- Preprocessing
	- Training
	- Testing
	Accuracy Fold 2: 0.7599935121242178

--- Current fold: 3 ---
	- Preprocessing
	- Training


In [None]:
acc_mean

np.float64(0.7416857876712329)