In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

In [41]:

# Load the dataset into a Pandas DataFrame
df = pd.read_csv('actions2load.csv')

In [42]:


# Preprocess the data
df = df.drop(columns=['additional_data']) # Drop irrelevant column
df['event_time'] = pd.to_datetime(df['event_time']) # Convert event_time to datetime object
df = df.sort_values(by='event_time') # Sort the dataframe by event_time
df['time_since_last_event'] = df.groupby('account_id')['event_time'].diff().dt.total_seconds() # Calculate time since last event for each account_id
df = df.dropna() # Drop any rows with missing values


In [43]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['event_time']), df['event_time'], test_size=0.2, random_state=42)

# Define the feature matrix X and target variable y for both the training and testing sets
X_train = X_train.drop(columns=['account_id', 'event_type', 'product_id'])
X_test = X_test.drop(columns=['account_id', 'event_type', 'product_id'])

In [44]:

# Create an instance of the XGBRegressor class
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)

y_train = y_train.apply(lambda x: x.timestamp())

# Fit the model to the training data
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [45]:
# Use the model to make predictions for the testing data
y_pred = model.predict(X_test)

y_test = y_test.apply(lambda x: x.timestamp())


# Evaluate the performance of the model using mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print('MAE:', mae)



MAE: 3155653.5510186655


In [46]:
df["event_time"].max()

Timestamp('2020-06-04 04:20:13.271000')

In [47]:
# Use the trained model to make predictions for the next event time for a given account_id
account_id = '3eff5d4b55889abb0a8b08d2136f1b63' # Replace with the desired account_id
last_event_time = df[df['account_id'] == account_id]['event_time'].max()# Get the time of the last event for the given account_id

In [48]:
last_event_time

Timestamp('2020-06-04 04:20:13.271000')

In [49]:
next_time_since_last_event = model.predict(pd.DataFrame({'time_since_last_event': [last_time_since_last_event]})) # Predict the time since the next event
next_time_since_last_event

array([1.587364e+09], dtype=float32)

In [50]:
pd.Timedelta(seconds=next_time_since_last_event[0])

Timedelta('18372 days 06:26:08')

In [51]:
next_event_time = last_event_time + pd.Timedelta(seconds=next_time_since_last_event[0]) # Calculate the time of the next event



In [52]:
print('Next event time for account', account_id, ':', next_event_time)

Next event time for account 3eff5d4b55889abb0a8b08d2136f1b63 : 2070-09-22 10:46:21.271000
