# AI PROJECT - CHURN PREDICTION

#### The purpose of this project is to build a machine learning model that predicts customer churn.
   Customer churn: a buyer who fails to place any new order within a chosen inactivity window (e.g., 90-180 days) after their last purchase date.


In [2]:
# --------Starting project with git-------------

# %cd "/content/drive/MyDrive/Colab Notebooks/Final Project/Churn Prediction"
# !pwd
# !git init
# !echo -e ".ipynb_checkpoints/\nsample_data/\ndrive/\n.config/" > .gitignore
# !git add .
# !git commit -m "Initial commit: Churn Prediction project"


In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, IntegerType, StringType

# Start (or reuse) the Spark session
spark = SparkSession.builder.getOrCreate()

# Single place to change when the raw CSV files move.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/Final Project/Churn Prediction/Data"

# Explicit schemas (all fields nullable) so Spark does not have to infer types.
events_schema = StructType([
    StructField("timestamp", LongType(), True),
    StructField("visitorid", IntegerType(), True),
    StructField("event", StringType(), True),
    StructField("itemid", IntegerType(), True),
    StructField("transactionid", IntegerType(), True)
])

item_properties_schema = StructType([
    StructField("timestamp", LongType(), True),
    StructField("itemid", IntegerType(), True),
    StructField("property", StringType(), True),
    StructField("value", StringType(), True)
])

category_tree_schema = StructType([
    StructField("categoryid", IntegerType(), True),
    StructField("parentid", IntegerType(), True)
])


def _read_csv(file_name, schema):
    """Read one CSV file (with a header row) from DATA_DIR using the given schema."""
    return spark.read.csv(f"{DATA_DIR}/{file_name}", header=True, schema=schema)


# Load data
events = _read_csv("events.csv", events_schema)
items_properties_part1 = _read_csv("item_properties_part1.csv", item_properties_schema)
items_properties_part2 = _read_csv("item_properties_part2.csv", item_properties_schema)
category_tree = _read_csv("category_tree.csv", category_tree_schema)


In [15]:
# Preview one row. Rows whose `property` is "790" carry the item price in `value`.
items_properties_part1.show(n=1)

+-------------+------+----------+-----+
|    timestamp|itemid|  property|value|
+-------------+------+----------+-----+
|1435460400000|460429|categoryid| 1338|
+-------------+------+----------+-----+
only showing top 1 row



# Data Cleansing and basic feature engineering, preparing for EDA


In [16]:
from pyspark.sql.functions import col, to_timestamp, to_date, date_format

# `timestamp` is Unix epoch in *milliseconds*; the numeric -> timestamp conversion
# works in seconds, so divide by 1000 first. (A previous seconds-based conversion
# that was immediately overwritten, plus a duplicated import, were removed.)
events = (
    events
    .withColumn("ts", to_timestamp((col("timestamp") / 1000).cast("double")))
    # Calendar date and wall-clock time derived from `ts`.
    .withColumn("create_date", to_date(col("ts")))
    .withColumn("create_time", date_format(col("ts"), "HH:mm:ss"))
)


In [17]:
# `timestamp` is Unix epoch in *milliseconds*; divide by 1000 before converting.
# (The seconds-based conversion that was immediately overwritten has been removed.)
items_properties_part1 = (
    items_properties_part1
    .withColumn("ts", to_timestamp((col("timestamp") / 1000).cast("double")))
    .withColumn("create_date", to_date(col("ts")))
    .withColumn("create_time", date_format(col("ts"), "HH:mm:ss"))
)

In [18]:
# Derive the time columns from part2 itself. The previous version built this frame
# from `items_properties_part1`, which silently replaced the part2 data with a copy
# of part1.
# `timestamp` is Unix epoch in *milliseconds*; divide by 1000 before converting.
items_properties_part2 = (
    items_properties_part2
    .withColumn("ts", to_timestamp((col("timestamp") / 1000).cast("double")))
    .withColumn("create_date", to_date(col("ts")))
    .withColumn("create_time", date_format(col("ts"), "HH:mm:ss"))
)

In [21]:
# Preview one row of the category -> parent mapping.
category_tree.show(n=1)

+----------+--------+
|categoryid|parentid|
+----------+--------+
|      1016|     213|
+----------+--------+
only showing top 1 row



In [25]:
# `when` is imported explicitly: the previous code called `f.when`, but no `f`
# alias is imported, which raises NameError on a fresh kernel.
from pyspark.sql.functions import col, when

# Keep only price ("790"), categoryid, and available rows.
keep_rows = col("property").isin(["790", "categoryid", "available"])

# Rename '790' to 'price'; every other property name is left as is.
relabel_price = when(col("property") == "790", "price").otherwise(col("property"))

# NOTE: not re-runnable on an already-relabelled frame. The filter matches "790",
# so rows already renamed to "price" would be dropped on a second run.
items_properties_part1 = (
    items_properties_part1
    .filter(keep_rows)
    .withColumn("property", relabel_price)
)

items_properties_part2 = (
    items_properties_part2
    .filter(keep_rows)
    .withColumn("property", relabel_price)
)


In [26]:
from pyspark.sql.functions import regexp_replace, col

# Drop every character that is not a digit or "." from `value`, then cast to float.
# The same column expression is applied to both property frames.
_numeric_value = regexp_replace(col("value"), "[^0-9.]", "").cast("float")

items_properties_part1 = items_properties_part1.withColumn("value", _numeric_value)
items_properties_part2 = items_properties_part2.withColumn("value", _numeric_value)


# Exploratory Data Analysis (EDA)
Objectives:
- Investigate content, datatypes, null values
- Understand user behavior, temporal patterns, and business dynamics.

In [47]:
# Optional quick looks at the frame:
# events.show(2)
# events.printSchema()

# Number of rows per event type, most frequent first.
event_counts = events.groupBy("event").count()
event_counts.orderBy(col("count").desc()).show()

+-------------+---------+-----+------+-------------+--------------------+-----------+-----------+
|    timestamp|visitorid|event|itemid|transactionid|                  ts|create_date|create_time|
+-------------+---------+-----+------+-------------+--------------------+-----------+-----------+
|1433221332117|   257597| view|355908|         NULL|2015-06-02 05:02:...| 2015-06-02|   05:02:12|
|1433224214164|   992329| view|248676|         NULL|2015-06-02 05:50:...| 2015-06-02|   05:50:14|
+-------------+---------+-----+------+-------------+--------------------+-----------+-----------+
only showing top 2 rows

+-----------+-------+
|      event|  count|
+-----------+-------+
|       view|2664312|
|  addtocart|  69332|
|transaction|  22457|
+-----------+-------+



In [53]:
from pyspark.sql.functions import col, sum as _sum

# Count NULLs per column. NULLs in `transactionid` are expected, because only
# "transaction" events carry a transaction id.
events_null_counts = [
    _sum(col(name).isNull().cast("int")).alias(name)
    for name in events.columns
]
events.select(events_null_counts).show()


+---------+---------+-----+------+-------------+---+-----------+-----------+
|timestamp|visitorid|event|itemid|transactionid| ts|create_date|create_time|
+---------+---------+-----+------+-------------+---+-----------+-----------+
|        0|        0|    0|     0|      2733644|  0|          0|          0|
+---------+---------+-----+------+-------------+---+-----------+-----------+



In [48]:
# items_properties_part1.show(1)

# All retained property rows for one example item, with untruncated values.
single_item_rows = items_properties_part1.filter(col("itemid") == 355908)
single_item_rows.show(truncate=False)

# Total number of rows in part1 (displayed as the cell result).
items_properties_part1.count()

+-------------+------+----------+-------+-------------------+-----------+-----------+
|timestamp    |itemid|property  |value  |ts                 |create_date|create_time|
+-------------+------+----------+-------+-------------------+-----------+-----------+
|1431831600000|355908|available |1.0    |2015-05-17 03:00:00|2015-05-17 |03:00:00   |
|1440903600000|355908|available |1.0    |2015-08-30 03:00:00|2015-08-30 |03:00:00   |
|1436065200000|355908|available |1.0    |2015-07-05 03:00:00|2015-07-05 |03:00:00   |
|1437879600000|355908|available |1.0    |2015-07-26 03:00:00|2015-07-26 |03:00:00   |
|1431226800000|355908|categoryid|1173.0 |2015-05-10 03:00:00|2015-05-10 |03:00:00   |
|1431226800000|355908|price     |94080.0|2015-05-10 03:00:00|2015-05-10 |03:00:00   |
|1431831600000|355908|price     |94080.0|2015-05-17 03:00:00|2015-05-17 |03:00:00   |
|1433646000000|355908|price     |89280.0|2015-06-07 03:00:00|2015-06-07 |03:00:00   |
|1434250800000|355908|price     |90480.0|2015-06-14 03

2214492

In [55]:
# Count NULLs per column of the item-properties data (part2 shown; the part1 check is kept commented out).
# items_properties_part1.select([_sum(col(c).isNull().cast("int")).alias(c) for c in items_properties_part1.columns]).show()
part2_null_counts = [
    _sum(col(name).isNull().cast("int")).alias(name)
    for name in items_properties_part2.columns
]
items_properties_part2.select(part2_null_counts).show()


+---------+------+--------+-----+---+-----------+-----------+
|timestamp|itemid|property|value| ts|create_date|create_time|
+---------+------+--------+-----+---+-----------+-----------+
|        0|     0|       0|    0|  0|          0|          0|
+---------+------+--------+-----+---+-----------+-----------+



In [34]:
# Each category row references one parent id, which can be used for higher-level grouping later.
category_tree.show(n=1)

+----------+--------+
|categoryid|parentid|
+----------+--------+
|      1016|     213|
+----------+--------+
only showing top 1 row



In [56]:
# Count NULLs per column of the category tree.
category_null_counts = [
    _sum(col(name).isNull().cast("int")).alias(name)
    for name in category_tree.columns
]
category_tree.select(category_null_counts).show()


+----------+--------+
|categoryid|parentid|
+----------+--------+
|         0|      25|
+----------+--------+



In [59]:
# Categories whose parent id is NULL. They may be top-level (root) categories,
# which would explain the missing parent.
# TODO: to be investigated.
category_tree.where(col("parentid").isNull()).show()

+----------+--------+
|categoryid|parentid|
+----------+--------+
|       231|    NULL|
|       791|    NULL|
|      1490|    NULL|
|       431|    NULL|
|       755|    NULL|
|       378|    NULL|
|      1579|    NULL|
|      1394|    NULL|
|       659|    NULL|
|      1057|    NULL|
|       859|    NULL|
|       803|    NULL|
|       250|    NULL|
|      1452|    NULL|
|      1182|    NULL|
|      1692|    NULL|
|      1600|    NULL|
|      1482|    NULL|
|      1224|    NULL|
|      1532|    NULL|
+----------+--------+
only showing top 20 rows



In [57]:
# The data above seems well structured, with no unexpected nulls; however, I will recheck after joining the tables, as I suspect nulls will appear then.

fatal: not a git repository (or any of the parent directories): .git


In [61]:
# Switch the notebook's working directory to the project folder on the mounted Drive.
%cd "/content/drive/MyDrive/Colab Notebooks/Final Project/Churn Prediction"


/content/drive/MyDrive/Colab Notebooks/Final Project/Churn Prediction


In [None]:
# NOTE(review): this looks like an unfinished shell command (possibly a git call) — complete or remove it before sharing.
!g

# Feature Engineering