In [1]:
import pandas as pd
from IPython.display import display
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

dir_suffix = "data/"
dir_suffix_entpackt = dir_suffix + "Daten_ausgepackt/"

In [2]:
print("Tip25W Testdaten Template:")
testdata = pd.read_csv(dir_suffix+"tip25W_testdaten_template.csv")

display(testdata.head(12))

print("Tip25W Trainingsdaten:")
trainingdata = pd.read_csv(dir_suffix+"tip25W_trainingsdaten.csv")
# ist das gleiche wie orders gefiltert nach nur prior und das gleiche wie der prior Datensatz
#trainingdata = trainingdata[:300000]
display(trainingdata.head(11))

print("Prior:")
df_orders_prior = pd.read_csv(dir_suffix_entpackt+ "order_products__prior.csv")
display(df_orders_prior.head(10))

print("train:")
df_orders_train = pd.read_csv(dir_suffix_entpackt + "order_products__train.csv")
display(df_orders_train)

print("orders")
df_orders = pd.read_csv(dir_suffix_entpackt + "orders.csv")
display(df_orders.head(10))

print("products:")
df_products = pd.read_csv(dir_suffix_entpackt + "products.csv")
display(df_products.head(10))



Tip25W Testdaten Template:


Unnamed: 0.1,Unnamed: 0,order_id,tip,tip_pc
0,10,1187899,,
1,25,1492625,,
2,49,2196797,,
3,74,525192,,
4,78,880375,,
5,82,1094988,,
6,88,1822501,,
7,115,1827621,,
8,129,2316178,,
9,200,2180313,,


Tip25W Trainingsdaten:


Unnamed: 0.1,Unnamed: 0,order_id,tip,tip_pc
0,0,2539329,True,0.092177
1,1,2398795,False,0.0
2,2,473747,False,0.0
3,3,2254736,True,0.102231
4,4,431534,True,0.09299
5,5,3367565,False,0.0
6,6,550135,False,0.0
7,7,3108588,False,0.0
8,8,2295261,False,0.0
9,9,2550362,False,0.0


Prior:


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
5,2,17794,6,1
6,2,40141,7,1
7,2,1819,8,1
8,2,43668,9,0
9,3,33754,1,1


train:


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
...,...,...,...,...
1384612,3421063,14233,3,1
1384613,3421063,35548,4,1
1384614,3421070,35951,1,1
1384615,3421070,16953,2,1


orders


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


products:


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
5,6,Dry Nose Oil,11,11
6,7,Pure Coconut Water With Orange,98,7
7,8,Cut Russet Potatoes Steam N' Mash,116,1
8,9,Light Strawberry Blueberry Yogurt,120,16
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7


In [3]:

merged = trainingdata.merge(df_orders_prior, on="order_id", how="left")
merged = merged.merge(df_orders, on="order_id", how="left")

merged = merged.fillna(0)

#order_merged = order_merged.merge(df_products, on="product_id", how="left")
display(merged.head(5))
print(merged["eval_set"].unique())

Unnamed: 0.1,Unnamed: 0,order_id,tip,tip_pc,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,0,2539329,True,0.092177,196,1,0,1,prior,1,2,8,0.0
1,0,2539329,True,0.092177,14084,2,0,1,prior,1,2,8,0.0
2,0,2539329,True,0.092177,12427,3,0,1,prior,1,2,8,0.0
3,0,2539329,True,0.092177,26088,4,0,1,prior,1,2,8,0.0
4,0,2539329,True,0.092177,26405,5,0,1,prior,1,2,8,0.0


['prior']


In [4]:
order_features = (
    df_orders_prior
    .groupby("order_id")
    .agg(
        num_products=("product_id", "count"),
        num_reordered=("reordered", "sum"),
        avg_add_to_cart=("add_to_cart_order", "mean"),
        max_add_to_cart=("add_to_cart_order", "max"),
    )
    .reset_index()
)

order_features = order_features.merge(
    df_orders,
    on="order_id",
    how="left"
)

final_df = order_features.merge(
    trainingdata[["order_id", "tip", "tip_pc"]],
    on="order_id",
    how="left"
)

final_df.head(10)

Unnamed: 0,order_id,num_products,num_reordered,avg_add_to_cart,max_add_to_cart,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_pc
0,2,9,6,5.0,9,202279,prior,3,5,9,8.0,False,0.0
1,3,8,8,4.5,8,205970,prior,16,5,17,12.0,False,0.0
2,4,13,12,7.0,13,178520,prior,36,1,9,7.0,True,0.11591
3,5,26,21,13.5,26,156122,prior,42,6,16,9.0,False,0.0
4,6,3,0,2.0,3,22352,prior,4,1,12,30.0,True,0.075219
5,7,2,0,1.5,2,142903,prior,11,2,14,30.0,False,0.0
6,8,1,1,1.0,1,3107,prior,5,4,6,17.0,False,0.0
7,9,15,10,8.0,15,139016,prior,14,0,19,5.0,False,0.0
8,10,15,8,8.0,15,135442,prior,4,6,8,8.0,True,0.132824
9,11,5,5,3.0,5,143742,prior,4,1,19,23.0,True,0.139572


In [5]:
display(merged.head(1))

Unnamed: 0.1,Unnamed: 0,order_id,tip,tip_pc,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,0,2539329,True,0.092177,196,1,0,1,prior,1,2,8,0.0


In [6]:
#df_numeric =  merged[["add_to_cart_order", "reordered", "order_hour_of_day", "days_since_prior_order", "aisle_id", "department_id"]]
#df_numeric.corr()
#order_df = merged.groupby("order_id").agg(
#    tip_pc=("tip_pc", "mean"),
#    eval_set=("eval_set",),
#    n_items=("product_id", "count"),
#    mean_reordered=("reordered", "mean"),
#    n_aisles=("aisle_id", "nunique"),
#    n_departments=("department_id", "nunique"),
#    order_number=("order_number", "first"),
#    order_hour_of_day=("order_hour_of_day", "first"),
#    order_dow=("order_dow", "first"),
#    days_since_prior_order=("days_since_prior_order", "first"),
#).reset_index()
order_df = merged.drop(["Unnamed: 0", "eval_set"], axis=1)

#order_df.head(10)

In [7]:
df_merged = order_df.merge(
    df_products[['product_id', 'department_id']],
    on='product_id',
    how='left'
)
df_merged = df_merged.drop(columns='product_id')
df_merged.head(10)

Unnamed: 0,order_id,tip,tip_pc,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,department_id
0,2539329,True,0.092177,1,0,1,1,2,8,0.0,7
1,2539329,True,0.092177,2,0,1,1,2,8,0.0,16
2,2539329,True,0.092177,3,0,1,1,2,8,0.0,19
3,2539329,True,0.092177,4,0,1,1,2,8,0.0,19
4,2539329,True,0.092177,5,0,1,1,2,8,0.0,17
5,2398795,False,0.0,1,1,1,2,3,7,15.0,7
6,2398795,False,0.0,2,0,1,2,3,7,15.0,19
7,2398795,False,0.0,3,1,1,2,3,7,15.0,19
8,2398795,False,0.0,4,0,1,2,3,7,15.0,4
9,2398795,False,0.0,5,1,1,2,3,7,15.0,19


In [8]:



#X["is_first_order"] = (X["order_number"] == 1).astype(int)
#X["days_since_prior_order"] = X["days_since_prior_order"].fillna(0)
#X.head(10)
order_df.head(3)


single = order_df.tail(1) 
order_df =  order_df.iloc[:-1].copy() 





In [9]:
X = order_df.drop(columns="tip_pc")
y = order_df["tip_pc"]


X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,      # 20 % Testdaten
    random_state=42     # reproduzierbar
)



In [10]:
X_train.head(3)
X_test

Unnamed: 0,order_id,tip,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
7986815,2692768,True,21162,3,1,50607,41,4,14,4.0
19731572,3162797,True,20734,15,0,125125,81,6,6,4.0
20729964,2792323,True,881,17,0,131719,15,0,22,13.0
26952299,1506570,True,44661,12,0,171374,1,1,14,0.0
32027675,774117,False,4605,16,0,203527,8,6,15,2.0
...,...,...,...,...,...,...,...,...,...,...
25037128,294434,False,32650,1,1,158943,47,2,17,1.0
22102041,2019794,True,19398,8,0,140407,7,2,8,13.0
20643730,133328,True,6891,4,1,131125,62,1,12,2.0
29312756,669518,False,25890,15,1,186193,12,0,14,11.0


In [11]:
numeric_features = [
    "order_id",
    "tip",
    "product_id",
    "add_to_cart_order",
    "reordered",
    "user_id",
    "order_number",
    "order_dow",
    "order_hour_of_day"
    
]
categorical_features = [
    "product_name"
]


preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features)
    ]
)

In [12]:

order_df.head(5)
X_test

Unnamed: 0,order_id,tip,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
7986815,2692768,True,21162,3,1,50607,41,4,14,4.0
19731572,3162797,True,20734,15,0,125125,81,6,6,4.0
20729964,2792323,True,881,17,0,131719,15,0,22,13.0
26952299,1506570,True,44661,12,0,171374,1,1,14,0.0
32027675,774117,False,4605,16,0,203527,8,6,15,2.0
...,...,...,...,...,...,...,...,...,...,...
25037128,294434,False,32650,1,1,158943,47,2,17,1.0
22102041,2019794,True,19398,8,0,140407,7,2,8,13.0
20643730,133328,True,6891,4,1,131125,62,1,12,2.0
29312756,669518,False,25890,15,1,186193,12,0,14,11.0


In [13]:
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor


model = LinearRegression()
model = HistGradientBoostingRegressor(
    learning_rate=0.05,
    max_depth=6,
    max_iter=300,         # entspricht grob n_estimators
    early_stopping=True,  # stoppt wenn’s nicht mehr besser wird
    random_state=42
)
pipeline = Pipeline(
    steps=[
        #("preprocessor", preprocessor),
        ("model", model)
    ]
)


pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)


display(single)

single_pred = single.drop(["tip_pc"], axis=1)

print(pipeline.predict(single_pred))

print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R²:", r2_score(y_test, y_pred))

Unnamed: 0,order_id,tip,tip_pc,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
32434488,2977660,True,0.08858,22920,9,0,206209,13,1,12,7.0


[0.13356236]
MAE: 0.012911611605693986
RMSE: 0.02047254877352178
R²: 0.9037209592892494


In [14]:
X.isna().sum().sort_values(ascending=False).head(20)


order_id                  0
tip                       0
product_id                0
add_to_cart_order         0
reordered                 0
user_id                   0
order_number              0
order_dow                 0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64