In [11]:
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [12]:
# Read the two base tables for inference in one statement: the test
# impressions to score and the offer metadata that gets joined onto them.
test_df, offers_df = (
    pd.read_parquet("Datasets/test_data.parquet"),
    pd.read_parquet("Datasets/offer_metadata.parquet"),
)

In [13]:
# Execute offers_features.ipynb so that generate_offer_features is available
# in this kernel. NOTE(review): the notebook is opaque from here; it may also
# define or overwrite other names, so keep this %run ahead of the call below.
%run offers_features.ipynb

# Build the offer-level features for the test impressions from test_df and
# offers_df; the result feeds the event-feature step.
test_offer_df = generate_offer_features(test_df, offers_df)

[✔️] Offers merged and features engineered


In [14]:
# Load the raw event log used to derive event features.
events_df = pd.read_parquet("Datasets/add_event.parquet")

# Execute events_features.ipynb to obtain generate_event_features, then apply
# it to the offer-enriched test frame. The helper's log line mentions
# "train_df", but the frame passed here is the test frame.
%run events_features.ipynb
test_offer_events_df = generate_event_features(test_offer_df, events_df)


[✓] Preprocessed events.
[✓] Merged offer-level event features into train_df.


Unnamed: 0,interaction_id,customer_id,offer_id,impression_timestamp,impression_date,f1,f2,f3,f4,f5,...,discount_per_day,offer_event_count,unique_event_types,avg_event_hour,earliest_event,latest_event,event_span_days,Mobile_Timeline,OffersTab,Tiles
0,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04 18:56:26.000794,2023-11-04,,,,,,...,,57211.0,3.0,12.203073,2023-10-22 00:03:45.658,2023-11-03 23:58:47.862,12.0,5664.0,31199.0,20348.0
1,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04 06:08:53.373,2023-11-04,,9.0,,,,...,,5832.0,3.0,12.158436,2023-10-22 01:13:58.013,2023-11-03 23:47:15.524,12.0,524.0,3347.0,1961.0
2,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05 10:07:28.000725,2023-11-05,,,,,22.0,...,,16311.0,3.0,11.492061,2023-11-01 00:02:13.493,2023-11-03 23:58:59.846,2.0,1115.0,9664.0,5532.0
3,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04 12:25:28.244,2023-11-04,,,,,,...,,58814.0,3.0,12.137756,2023-10-22 00:00:11.504,2023-11-03 23:59:11.630,12.0,5822.0,31574.0,21418.0
4,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05 06:45:26.657,2023-11-05,,,,,,...,,52802.0,3.0,12.171887,2023-10-22 00:00:47.528,2023-11-03 23:57:34.510,12.0,5160.0,29532.0,18110.0
5,1750220_98194407_16-23_2023-11-04 05:46:54.823,1750220,98194407,2023-11-04 05:46:54.823,2023-11-04,,,,,44.0,...,0.103448,18543.0,3.0,11.703015,2023-11-01 00:02:49.037,2023-11-03 23:59:14.658,2.0,1364.0,10983.0,6196.0
6,1406615_18473108_16-23_2023-11-04 21:39:43.485,1406615,18473108,2023-11-04 21:39:43.485,2023-11-04,,,42.0,,21.0,...,0.206897,18188.0,3.0,11.530075,2023-11-01 00:02:48.002,2023-11-03 23:59:01.309,2.0,1355.0,10681.0,6152.0
7,1354131_88148_16-23_2023-11-05 16:57:56.684,1354131,88148,2023-11-05 16:57:56.684,2023-11-05,,,,,,...,,62482.0,3.0,12.171537,2023-10-22 00:02:18.788,2023-11-03 23:58:50.875,12.0,5589.0,35126.0,21767.0
8,1457351_399752_16-23_2023-11-04 06:00:04.106,1457351,399752,2023-11-04 06:00:04.106,2023-11-04,,9.0,,,43.0,...,,57934.0,3.0,12.237132,2023-10-22 00:03:11.758,2023-11-03 23:59:11.627,12.0,5699.0,31351.0,20884.0
9,1461171_45856_16-23_2023-11-04 15:08:26.802,1461171,45856,2023-11-04 15:08:26.802,2023-11-04,,,,,6.0,...,,49386.0,3.0,12.230146,2023-10-22 00:00:22.747,2023-11-03 23:58:12.010,12.0,4849.0,25937.0,18600.0


In [15]:
# Load the raw transaction table used to derive the global_* features.
transactions_df = pd.read_parquet("Datasets/add_trans.parquet")


# Run the transactions feature notebook so that
# generate_global_transaction_features is available in this kernel.
%run transaction_features.ipynb

# Apply global transaction features to the test data (offer + event features).
# The helper's log line says "train set", but the input here is the test frame.
test_OET_df = generate_global_transaction_features(test_offer_events_df, transactions_df)


[✓] Global transaction stats merged into train set.


Unnamed: 0,interaction_id,customer_id,offer_id,impression_timestamp,impression_date,f1,f2,f3,f4,f5,...,latest_event,event_span_days,Mobile_Timeline,OffersTab,Tiles,global_avg_transaction,global_max_transaction,global_min_transaction,global_top_category,global_top_type
0,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04 18:56:26.000794,2023-11-04,,,,,,...,2023-11-03 23:58:47.862,12.0,5664.0,31199.0,20348.0,182.232674,1116928.8,0.01,PR,D
1,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04 06:08:53.373,2023-11-04,,9.0,,,,...,2023-11-03 23:47:15.524,12.0,524.0,3347.0,1961.0,182.232674,1116928.8,0.01,PR,D
2,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05 10:07:28.000725,2023-11-05,,,,,22.0,...,2023-11-03 23:58:59.846,2.0,1115.0,9664.0,5532.0,182.232674,1116928.8,0.01,PR,D
3,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04 12:25:28.244,2023-11-04,,,,,,...,2023-11-03 23:59:11.630,12.0,5822.0,31574.0,21418.0,182.232674,1116928.8,0.01,PR,D
4,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05 06:45:26.657,2023-11-05,,,,,,...,2023-11-03 23:57:34.510,12.0,5160.0,29532.0,18110.0,182.232674,1116928.8,0.01,PR,D


In [16]:
# Columns removed from the engineered test frame before the model features
# are selected.
drop_cols = [
    'offer_code_type', 'offer_extra_flag',
    'offer_start', 'offer_end', 'avg_event_hour', 'earliest_event',
    'latest_event', 'global_top_category', 'global_top_type', 'id8'
]

# Rebind instead of dropping in place: an in-place drop raises KeyError when
# the cell is re-run (the columns are already gone) and would also mutate any
# other name bound to the same DataFrame object. errors='ignore' makes the
# cell idempotent under re-execution.
test_OET_df = test_OET_df.drop(columns=drop_cols, errors='ignore')

In [17]:
# 1. Save ID columns if needed for submission later
X_test_ids = test_OET_df[['customer_id', 'offer_id']].copy()

# 2. Features the reduced model is scored on. X_test is reindexed to exactly
#    these columns, in exactly this order.
final_features = [
    'offer_duration_days', 'offer_event_count', 'unique_event_types', 'event_span_days',
    'Mobile_Timeline', 'OffersTab', 'Tiles',
    'global_avg_transaction', 'global_max_transaction', 'global_min_transaction',
    'f127', 'f365', 'f124', 'f148', 'f134', 'f200', 'f351', 'f169', 'f137', 'f58',
    'f131', 'f167', 'f130', 'f362', 'f147', 'f103', 'f132', 'f201', 'f173', 'f67',
    'f186', 'f76', 'f47', 'f203', 'f143', 'f123', 'f171', 'f202', 'f95', 'f98',
    'f41', 'f142', 'f315', 'f166', 'f99', 'f204', 'f150', 'f149', 'f341', 'f96',
    'f113', 'f43', 'f216', 'f223', 'f105', 'f168', 'f125', 'f158', 'f77', 'f46',
    'f133', 'f69', 'f348', 'f68', 'f138', 'f199', 'f316', 'f146', 'f74', 'f30',
    'f350', 'f172', 'f140', 'f51', 'f59', 'f322', 'f198', 'f320', 'f344', 'f93',
    'f85', 'f170', 'f97', 'f151', 'f319', 'f318', 'f94', 'f106', 'f338', 'f336',
]

# 3. Report model features that the engineered frame does not contain.
#    reindex() below fills them with 0 without complaint, which would
#    otherwise hide an upstream feature-engineering problem.
missing_features = [col for col in final_features if col not in test_OET_df.columns]
if missing_features:
    warnings.warn(
        f"{len(missing_features)} model feature(s) missing from test_OET_df "
        f"and filled with 0: {missing_features}"
    )

# 4. Select the model features. No one-hot encoding is done here: none of the
#    dummy columns of offer_type / offer_group / f374 appear in
#    final_features, so encoding them would be discarded by this reindex.
#    The ID columns are excluded for the same reason.
X_test = test_OET_df.reindex(columns=final_features, fill_value=0)

# 5. Convert object to category if any
for col in X_test.select_dtypes(include='object').columns:
    X_test[col] = X_test[col].astype('category')

# ✅ X_test is now ready for prediction


In [18]:
# Load the persisted model used for scoring (presumably an XGBoost classifier,
# going by the file name — confirm against the training notebook).
# joblib is imported in the notebook's top import cell.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load artifacts produced by a trusted training run.
model_reduced = joblib.load('model_reduced_xgb.pkl')


In [19]:
# Score the test features, then keep the second probability column
# (index 1) as the prediction for each row.
class_probabilities = model_reduced.predict_proba(X_test)
test_preds = class_probabilities[:, 1]


In [20]:
# Map each identifier column of the feature frame to the name it carries in
# the submission file. The key order is the column order of the output.
submission_columns = {
    'interaction_id': 'id1',
    'customer_id': 'id2',
    'offer_id': 'id3',
    'impression_date': 'id5',
}

# Pick the identifier columns and rename them in one pass.
submission_df = test_OET_df[list(submission_columns)].rename(columns=submission_columns)

# Attach the predicted probabilities. Rows keep the order of test_OET_df;
# no sorting is applied.
submission_df['pred'] = test_preds

# Write the file without the index and report how many rows were written.
submission_df.to_csv("submission.csv", index=False)
row_count = len(submission_df)
print("[✔️] submission.csv created with", row_count, "rows.")


[✔️] submission.csv created with 369301 rows.
