In [6]:
import polars as pl

df_pl = pl.read_csv('transactions_dataset.csv', separator=';')

In [2]:
# df = df_pl.with_columns(pl.col("date_order").str.to_date())
# # Count the number of transactions per client
# transaction_counts = df.group_by("client_id").agg(pl.count("client_id").alias("transaction_count"))

# # Filter out clients with less than 5 transactions
# filtered_clients = transaction_counts.filter(pl.col("transaction_count") >= 5)

# # Join the filtered clients back to the original dataframe
# df = df.join(filtered_clients, on="client_id", how="inner")

In [None]:
df = df_pl.with_columns(pl.col("date_order").str.to_date())
last_purchase_df = df.group_by("client_id").agg(pl.col("date_order").max().alias("Last_Purchase_Date"))

In [None]:
# Define thresholds
max_date = last_purchase_df["Last_Purchase_Date"].max()
churn_threshold = max_date - timedelta(days=180)
at_risk_threshold = max_date - timedelta(days=90)

# Ensure "Last_Purchase_Date" is of type Date
last_purchase_df = last_purchase_df.with_columns(
    pl.col("Last_Purchase_Date").cast(pl.Date)
)

# Add status column correctly
last_purchase_df = last_purchase_df.with_columns(
    pl.when(pl.col("Last_Purchase_Date") < churn_threshold)
    .then(pl.lit("churned"))
    .when(pl.col("Last_Purchase_Date") < at_risk_threshold)
    .then(pl.lit("at risk"))
    .otherwise(pl.lit("active"))
    .alias("status")
)

print(last_purchase_df)


shape: (170_590, 3)
┌───────────┬────────────────────┬─────────┐
│ client_id ┆ Last_Purchase_Date ┆ status  │
│ ---       ┆ ---                ┆ ---     │
│ i64       ┆ date               ┆ str     │
╞═══════════╪════════════════════╪═════════╡
│ 861740    ┆ 2019-08-16         ┆ active  │
│ 565357    ┆ 2019-08-16         ┆ active  │
│ 2054030   ┆ 2019-09-10         ┆ active  │
│ 2180803   ┆ 2019-08-08         ┆ active  │
│ 9876      ┆ 2018-07-31         ┆ churned │
│ …         ┆ …                  ┆ …       │
│ 591301    ┆ 2019-08-08         ┆ active  │
│ 41640     ┆ 2019-08-06         ┆ active  │
│ 1016493   ┆ 2019-03-04         ┆ churned │
│ 266577    ┆ 2019-08-12         ┆ active  │
│ 1565576   ┆ 2019-09-10         ┆ active  │
└───────────┴────────────────────┴─────────┘


In [8]:
import random
from datetime import datetime, timedelta

# Generate random dates
def random_date(start, end):
    return start + timedelta(days=random.randint(0, int((end - start).days)))

# Define date ranges
max_date = datetime.now()
churn_threshold = max_date - timedelta(days=180)
at_risk_threshold = max_date - timedelta(days=90)

# Generate fake data
clients = []
transactions = []

for i in range(10):
    client_id = f"client_{i+1}"
    if i < 3:
        status = "at risk"
        last_purchase_date = random_date(churn_threshold + timedelta(days=1), at_risk_threshold - timedelta(days=1))
    elif i < 6:
        status = "churned"
        last_purchase_date = random_date(datetime(2020, 1, 1), churn_threshold - timedelta(days=1))
    else:
        status = "active"
        last_purchase_date = random_date(at_risk_threshold + timedelta(days=1), max_date)

    clients.append((client_id, last_purchase_date, status))
    for _ in range(5):
        transaction_date = random_date(datetime(2020, 1, 1), last_purchase_date)
        amount = random.randint(100, 500)
        transactions.append((client_id, transaction_date, amount))

# Create dataframes
clients_df = pl.DataFrame(clients, schema=["client_id", "Last_Purchase_Date", "status"])
transactions_df = pl.DataFrame(transactions, schema=["client_id", "date_order", "sales_net"])

print(clients_df)
print(transactions_df)

shape: (10, 3)
┌───────────┬────────────────────────────┬─────────┐
│ client_id ┆ Last_Purchase_Date         ┆ status  │
│ ---       ┆ ---                        ┆ ---     │
│ str       ┆ datetime[μs]               ┆ str     │
╞═══════════╪════════════════════════════╪═════════╡
│ client_1  ┆ 2024-11-09 13:44:17.508311 ┆ at risk │
│ client_2  ┆ 2024-10-30 13:44:17.508311 ┆ at risk │
│ client_3  ┆ 2024-10-08 13:44:17.508311 ┆ at risk │
│ client_4  ┆ 2023-02-26 00:00:00        ┆ churned │
│ client_5  ┆ 2021-06-19 00:00:00        ┆ churned │
│ client_6  ┆ 2023-07-15 00:00:00        ┆ churned │
│ client_7  ┆ 2024-12-24 13:44:17.508311 ┆ active  │
│ client_8  ┆ 2025-01-22 13:44:17.508311 ┆ active  │
│ client_9  ┆ 2025-01-12 13:44:17.508311 ┆ active  │
│ client_10 ┆ 2024-12-01 13:44:17.508311 ┆ active  │
└───────────┴────────────────────────────┴─────────┘
shape: (50, 3)
┌───────────┬─────────────────────┬───────────┐
│ client_id ┆ date_order          ┆ sales_net │
│ ---       ┆ ---         

  clients_df = pl.DataFrame(clients, schema=["client_id", "Last_Purchase_Date", "status"])
  transactions_df = pl.DataFrame(transactions, schema=["client_id", "date_order", "sales_net"])


In [10]:
# Sort transactions by date
transactions_df = transactions_df.sort("date_order")

# Save the dataframes to CSV files
clients_df.write_csv('clients.csv')
transactions_df.write_csv('transactions.csv')