## Installing Kaggle API

In [1]:
# Installing the Kaggle command-line client, used further down to download the competition data
!pip install kaggle



## Configuring Kaggle Credentials

Set up the Kaggle API credentials by following https://www.kaggle.com/docs/api:
```
%%shell
mkdir ~/.kaggle
echo \{\"username\":\"{your kaggle username}\",\"key\":\"{your kaggle api key}\"\} > ~/.kaggle/kaggle.json
pip install kaggle
```

In [2]:
%%shell
mkdir ~/.kaggle
echo \{\"username\":\"felipetufaile\",\"key\":\"264f0fc97e3bca1400ff73343a0319fd\"\} > ~/.kaggle/kaggle.json
pip install kaggle





## Installing PySpark

In [3]:
# Refreshing the apt package index before installing Java
!sudo apt update
# Installing the headless OpenJDK 8 package (quiet mode, output discarded)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Downloading and unpacking the Spark 3.2.1 (Hadoop 3.2) distribution into the current working directory.
# Check this site for the latest download link https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
# Installing the Python packages: findspark, pyspark and py4j
!pip install -q findspark
!pip install pyspark
!pip install py4j

import os
import sys
# The JAVA_HOME / SPARK_HOME assignments below are commented out, so neither variable is set by this cell.
# NOTE(review): with SPARK_HOME unset, the extracted tarball above is presumably not the Spark that gets used — confirm.
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"


import findspark
# Initialising findspark so it locates a Spark installation
findspark.init()
# Displaying the Spark location findspark resolved (the recorded output points at the pip-installed pyspark package)
findspark.find()

Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [859 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [55.6 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:9 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,104 kB]
Get:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:12 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Ge

'/usr/local/lib/python3.10/dist-packages/pyspark'

## Loading Libraries

In [4]:
# Importing Numpy library
import numpy as np

# Importing Pandas library
import pandas as pd

# Importing datetime library
from datetime import datetime, timedelta

# importing the zipfile module
from zipfile import ZipFile

# Importing gzip library
import gzip

# Importing plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Importing userdata from Google Colab Library
from google.colab import userdata

# Importing pyspark libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType, TimestampType
import pyspark.sql.functions as f
from pyspark.sql.window import Window

# Creating (or reusing) the Spark session named "DataProcessingPySpark"
spark = (
    SparkSession.builder
    .appName("DataProcessingPySpark")
    .getOrCreate()
)

# Displaying the session summary as the cell output
spark

## Downloading the Customer Spend Model Data

In [5]:
# Downloading the "customer-spend-model" competition archive with the Kaggle CLI
# (the recorded output shows it is saved as customer-spend-model.zip under /content)
!kaggle competitions download -c customer-spend-model

Downloading customer-spend-model.zip to /content
 89% 2.00M/2.24M [00:00<00:00, 2.89MB/s]
100% 2.24M/2.24M [00:00<00:00, 2.78MB/s]


## Unzipping the Transactions Dataset

In [6]:
# Opening customer-spend-model.zip and extracting only the two files this notebook uses.
# The `with` statement closes the archive on exit, so no explicit close() call is needed.
with ZipFile("../content/customer-spend-model.zip", 'r') as zip_object:

    # Extracting each required file from the archive into the content folder
    for member_name in ("customer.csv", "orders.csv"):
        zip_object.extract(member_name, path="../content")

## Processing Data Using PySpark

In [8]:
# Reading the orders CSV, treating its first line as the header row
orders_csv_path = "../content/orders.csv"
orders_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(orders_csv_path)
)

# Displaying the first rows of the orders dataframe
orders_df.show()

+---+---------+------+--------+---+------------+
| id|  orddate|ordnum|category|qty|       price|
+---+---------+------+--------+---+------------+
|957|10FEB2008| 38650|      35|  1|5.0106582642|
|957|10FEB2008| 38650|      35|  1|20.426101685|
|957|10FEB2008| 38650|      19|  1|20.400543213|
|957|15MAR2008| 48972|      40|  1|25.539016724|
|957|22NOV2008|150011|      40|  1|14.316169739|
|957|22NOV2008|150011|      40|  1|8.5896987915|
|957|03OCT2009|286151|      19|  1|15.313186646|
|957|04APR2010|376779|      14|  1|12.782295227|
|957|04APR2010|376779|      14|  1|5.0873527527|
|957|04APR2010|376779|      35|  1|6.5445327759|
|957|14AUG2011|622093|      99|  1|8.6919555664|
|957|14AUG2011|622093|      19|  1|10.174705505|
|957|14AUG2011|622093|       5|  1|15.236495972|
|957|10SEP2011|639810|      99|  1|9.9497375488|
|957|10SEP2011|639810|      35|  1|10.200271606|
|957|10SEP2011|639810|      99|  1|6.5445327759|
|957|10SEP2011|639810|      99|  1|2.5564575195|
|957|10SEP2011|63981

In [19]:
# Building the transactions dataframe: total purchase amount per customer per transaction date

# Order date parsed from the "ddMMMyyyy" text format (e.g. 10FEB2008)
order_dt_col = f.to_date(f.col("orddate"), "ddMMMyyyy")

trans_df = (
    orders_df

    # Storing the transaction date as a 'yyyy-MM-dd' formatted string
    .withColumn("trans_dt", f.date_format(order_dt_col, 'yyyy-MM-dd'))

    # Keeping customer id, transaction date and amount
    # (category and qty are deliberately left out for now)
    .select(
        f.col("id").alias("cust_id"),
        "trans_dt",
        f.col("price").alias("trans_amt"),
    )

    # Keeping only purchases with an amount above 0
    .where(f.col("trans_amt") > 0)

    # Summing the amounts of all purchases made by a customer on the same date
    .groupBy("cust_id", "trans_dt")
    .agg(f.sum("trans_amt").alias("trans_amt"))

    # Sorting by customer id, then transaction date (both ascending)
    .orderBy("cust_id", "trans_dt")

    .cache()
)

trans_df.show()

+--------+----------+------------------+
| cust_id|  trans_dt|         trans_amt|
+--------+----------+------------------+
|  100021|2009-02-25|      6.6212272644|
|  100021|2010-01-20|     33.0805740352|
|  100021|2010-03-06|     45.8117370608|
|  100021|2010-04-14|        50.1065979|
|  100021|2010-05-10|27.047328949399997|
|  100021|2010-08-28|      50.899093628|
|  100021|2010-09-26|      15.287620544|
|10005188|2012-07-28|30.451723098600002|
|10005188|2012-08-26|     45.2999877928|
|10005188|2013-05-08|24.799987792899998|
|10005188|2013-09-22|42.849990844000004|
|10005188|2014-06-15|      19.799987793|
|10005188|2014-10-04|     19.8999938964|
|10009396|2012-08-07|     24.9506454467|
|10009396|2013-01-05|124.44995498579999|
|10009396|2013-01-13|      64.649982452|
|10009396|2013-10-07|      69.900253295|
|10009396|2014-06-01| 68.49997520360002|
|10009396|2014-07-13| 49.69998168920001|
|10009604|2013-05-04|     14.8999938965|
+--------+----------+------------------+
only showing top

## Creating a Cross-Join Dataframe
Next we will create a cross-join dataframe between transaction date and customer id. The result of this process will be a dataframe with one record per day per customer id.

In [20]:
# Calculating the first transaction date across all customers
first_trans_row = trans_df.select(f.min("trans_dt").alias("date")).first()
start_date = first_trans_row["date"]

print(f"First transaction date across all customers: {start_date}")

First transaction date across all customers: 2007-11-04


In [21]:
# Calculating the last transaction date across all customers
last_trans_row = trans_df.select(f.max("trans_dt").alias("date")).first()
end_date = last_trans_row["date"]

print(f"Last transaction date across all customers: {end_date}")

Last transaction date across all customers: 2014-11-24


In [22]:
# Create a PySpark calendar dataframe with one row per day (freq='D') from start_date to end_date
calendar_days = pd.date_range(start=start_date, end=end_date, freq='D')

calendar_df = (
    spark.createDataFrame(pd.DataFrame({'ref_dt': calendar_days}))

    # Converting the reference date column to a date type
    .withColumn("ref_dt", f.to_date("ref_dt"))

    .cache()
)

In [23]:
# Creating a dataframe with the distinct customer ids found in the transactions
customers_id_df = trans_df.select("cust_id").distinct().cache()

In [53]:
# Creating a customer lifetime value dataframe: one row per customer per month, starting at the
# month of the customer's first transaction, with recency / frequency / monetary / lifetime features.

# Per-customer window ordered by reference date; the row-based frames below all derive from it
cust_by_date_w = Window.partitionBy("cust_id").orderBy(f.asc("ref_dt"))

# Running frame: from the first row of the customer up to (and including) the current row
cust_running_w = cust_by_date_w.rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Rolling 6-row frames, applied after the monthly aggregation (one row per month).
# The lag frames are the 6 rows immediately before the previous frame. They were previously
# (-11, -5) and (-17, -11), which spanned 7 rows and overlapped the neighbouring frame.
cust_r06m_w = cust_by_date_w.rowsBetween(-5, 0)
cust_r06m_lag1_w = cust_by_date_w.rowsBetween(-11, -6)
cust_r06m_lag2_w = cust_by_date_w.rowsBetween(-17, -12)

cltv_df = (

    # Referencing the calendar table
    calendar_df

    # Cross-joining the calendar with the distinct customer ids:
    # one record per calendar day per customer
    .crossJoin(customers_id_df)

    # Adding the aggregated transaction information to the days on which a customer purchased
    .join(
        trans_df.withColumn("ref_dt", f.col("trans_dt")),
        on=["cust_id", "ref_dt"],
        how="left"
    )

    # Calculating the first transaction date per customer (null for days before the first purchase)
    .withColumn("first_trans_dt", f.first("trans_dt", ignorenulls=True).over(cust_running_w))

    # Keeping only the dates on or after the first transaction date of each customer
    .filter(f.col("ref_dt") >= f.col("first_trans_dt"))

    # Calculating the last transaction date: carrying the latest non-null date forward
    .withColumn("last_trans_dt", f.last("trans_dt", ignorenulls=True).over(cust_running_w))

    # Calculating first purchase amount
    .withColumn("first_trans_amt", f.first("trans_amt", ignorenulls=True).over(cust_running_w))

    # Calculating account age: the number of days since the first purchase
    .withColumn("lifetime", f.datediff(f.col("ref_dt"), f.col("first_trans_dt")))

    # Calculating recency: the number of days since the last transaction date
    .withColumn("recency", f.datediff(f.col("ref_dt"), f.col("last_trans_dt")))

    # Truncating reference date to month-start
    .withColumn("ref_dt", f.trunc(f.col("ref_dt"), "month"))

    # Aggregating the daily rows per customer and month:
    # largest recency, total amount and mean lifetime within the month
    .groupBy(["cust_id", "ref_dt", "first_trans_amt"])
    .agg(
        f.max("recency").alias("recency"),
        f.sum("trans_amt").alias("trans_amt"),
        f.mean("lifetime").alias("lifetime"),
    )

    # Adjusting the transaction amount column: months without purchases get 0 instead of null
    .withColumn("trans_amt", f.when(f.col("trans_amt") > 0, f.col("trans_amt")).otherwise(f.lit(0)))

    # Creating a transaction indicator: 1 when the month has a positive purchase amount
    .withColumn("trans_ind", f.when(f.col("trans_amt") > 0, f.lit(1)).otherwise(f.lit(0)))

    # Calculating Frequency: cumulative sum of the monthly transaction indicator up to the current month
    .withColumn("frequency", f.sum("trans_ind").over(cust_running_w))

    # Calculating Monetary: total purchase amount up to the current month
    .withColumn("monetary", f.sum("trans_amt").over(cust_running_w))

    # Calculating transaction rate: frequency divided by lifetime (set to 1 when lifetime is 0)
    .withColumn("trans_rate", f.when(f.col("lifetime") == 0, f.lit(1)).otherwise(f.col("frequency") / f.col("lifetime")))

    # Calculating rolling purchase quantity over the current and the two preceding 6-row frames
    .withColumn("trans_qnt_R06m", f.sum("trans_ind").over(cust_r06m_w))
    .withColumn("trans_qnt_R06m_lag1", f.sum("trans_ind").over(cust_r06m_lag1_w))
    .withColumn("trans_qnt_R06m_lag2", f.sum("trans_ind").over(cust_r06m_lag2_w))

    # Calculating rolling purchase amount over the same frames
    .withColumn("trans_amt_R06m", f.sum("trans_amt").over(cust_r06m_w))
    .withColumn("trans_amt_R06m_lag1", f.sum("trans_amt").over(cust_r06m_lag1_w))
    .withColumn("trans_amt_R06m_lag2", f.sum("trans_amt").over(cust_r06m_lag2_w))

    # Calculating the running mean of the rolling purchase quantity up to the current month
    .withColumn("trans_qnt_R06m_mov_avg", f.mean("trans_qnt_R06m").over(cust_running_w))

    # Calculating the running mean of the rolling purchase amount up to the current month
    .withColumn("trans_amt_R06m_mov_avg", f.mean("trans_amt_R06m").over(cust_running_w))

    # Fill missing values with -1 (e.g. lag frames that do not contain any row yet)
    .fillna(-1)

    # Ordering and selecting columns
    .select(
        "cust_id",
        "ref_dt",
        "first_trans_amt",
        "trans_rate",
        "recency",
        "frequency",
        "monetary",
        "lifetime",
        "trans_qnt_R06m",
        "trans_qnt_R06m_lag1",
        "trans_qnt_R06m_lag2",
        "trans_qnt_R06m_mov_avg",
        "trans_amt_R06m",
        "trans_amt_R06m_lag1",
        "trans_amt_R06m_lag2",
        "trans_amt_R06m_mov_avg"
    )

    # Adding the columns of customer.csv (train \ test information), matched on customer id
    .join(
        spark.read.format("csv").option("header", "true").load("../content/customer.csv").withColumnRenamed("id", "cust_id"),
        how="left",
        on=["cust_id"]
    )

    # Ordering by customer id and reference date
    .orderBy(f.asc("cust_id"), f.asc("ref_dt"))

).cache()

# Counting the rows of the dataframe built above (the previous name, trans_cross_df, is not defined in this notebook)
print(f"The dataframe has: {cltv_df.count()} rows")

The dataframe has: 749989 rows


In [55]:
# Printing a sample of the customer lifetime value dataset (show() prints only the top rows)
cltv_df.show()

+-------+----------+---------------+--------------------+-------+---------+------------------+--------+--------------+-------------------+-------------------+----------------------+------------------+-------------------+-------------------+----------------------+-----+-------+
|cust_id|    ref_dt|first_trans_amt|          trans_rate|recency|frequency|          monetary|lifetime|trans_qnt_R06m|trans_qnt_R06m_lag1|trans_qnt_R06m_lag2|trans_qnt_R06m_mov_avg|    trans_amt_R06m|trans_amt_R06m_lag1|trans_amt_R06m_lag2|trans_amt_R06m_mov_avg|train|logtarg|
+-------+----------+---------------+--------------------+-------+---------+------------------+--------+--------------+-------------------+-------------------+----------------------+------------------+-------------------+-------------------+----------------------+-----+-------+
| 100021|2009-02-01|   6.6212272644|  0.6666666666666666|      3|        1|      6.6212272644|     1.5|             1|                 -1|                 -1|        

## Storing the Processed Dataset as Parquet Files

In [56]:
# Defining the output folder for the processed dataset.
# NOTE(review): the path is under drive/MyDrive, so Google Drive is presumably expected to be
# mounted at /content/drive before this cell runs; no mount call is visible in this notebook — confirm.
file_path = "../content/drive/MyDrive/Colab/Sandbox/customer_spend_model"

# Storing Customer Lifetime Value Data as parquet, replacing any previous output at that path
(
    cltv_df.write
    .format("parquet")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(file_path)
)