Create your Spark session

In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook: getOrCreate() returns the
# already-active session when there is one, otherwise it starts a new one
# under the application name below.
spark = (
    SparkSession.builder
    .appName("logistic_regression")
    .getOrCreate()
)

#### LOAD

In [0]:
# List the contents of the gold mount to locate the dataset directory.
# Python form of the `%fs ls` magic; display() renders the listing as a table.
display(dbutils.fs.ls("dbfs:/mnt/gold"))

In [0]:
# Load the Parquet dataset stored under the gold mount into a DataFrame.
# The path points at a directory (trailing slash), not a single file.
data = spark.read.parquet("dbfs:/mnt/gold/all_data_paraquet/")

In [0]:
# Preview only the first five rows to sanity-check the load.
data.show(n=5)

In [0]:
# Print the column names and data types of the loaded DataFrame.
data.printSchema()

We will not use the columns "InvoiceNo", "Description", "InvoiceDate", and "customer_count" as features for the Logistic Regression model.

In [0]:
# Columns that are not used as model features; remove them before building
# the feature pipeline.
dropped_columns = ["InvoiceNo", "Description", "InvoiceDate", "customer_count"]
data = data.drop(*dropped_columns)

# Display the remaining columns (show() prints the first 20 rows by default).
data.show()

In [0]:
# Count how many distinct values the Country column takes.
count = (
    data
    .select("Country")
    .distinct()
    .count()
)
print(count)

Since there are 37 unique countries, performing one-hot encoding would massively increase the complexity of our dataset, so we will only perform indexing.

In [0]:
# Summary statistics (count, mean, stddev, min, max) for the remaining columns.
data.describe().show()

In [0]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer, OneHotEncoder,StringIndexer)

In [0]:
# Print the schema again to confirm which columns remain after the drop,
# before wiring them into the feature stages.
data.printSchema()

In [0]:
# Encode the two string columns as numeric label indices.
# handleInvalid="skip" drops rows whose value is null or was not seen when the
# indexer was fitted, so scored data can contain fewer rows than its input.
indexer_country = StringIndexer(
    inputCol="Country",
    outputCol="CountryIndex",
    handleInvalid="skip",
)
indexer_stockcode = StringIndexer(
    inputCol="StockCode",
    outputCol="StockCodeIndex",
    handleInvalid="skip",
)

# Columns packed into the single vector column consumed by the classifier.
# NOTE(review): CustomerID is included as a numeric feature; confirm that an
# identifier is intended to act as a predictor.
feature_columns = [
    "CustomerID",
    "Quantity",
    "UnitPrice",
    "CountryIndex",
    "StockCodeIndex",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [0]:
# Logistic regression estimator: reads the assembled 'features' vector and
# learns to predict the 'Flag_reorder' label column.
log_reg = LogisticRegression(
    featuresCol="features",
    labelCol="Flag_reorder",
)

In [0]:
# Chain the stages in execution order: index the string columns, assemble the
# feature vector, then fit the classifier.
pipeline_stages = [
    indexer_country,
    indexer_stockcode,
    assembler,
    log_reg,
]
pipeline = Pipeline(stages=pipeline_stages)

In [0]:
# Hold out roughly 30% of the rows for evaluation (randomSplit sizes are
# approximate, not exact). A fixed seed keeps the split, and therefore every
# metric computed from it, repeatable across Restart & Run All.
SPLIT_SEED = 42
train, test = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [0]:
# Fit all pipeline stages on the training split only; the returned fitted
# pipeline model is what scores new rows.
model=pipeline.fit(train)

In [0]:
# Score the held-out split with the fitted pipeline.
result = model.transform(test)

# Show the true label next to the predicted class for a visual check.
label_vs_prediction = result.select("Flag_reorder", "prediction")
display(label_vs_prediction)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluate area under the ROC curve from the model's continuous scores.
# The evaluator must read the 'rawPrediction' column: pointing it at the
# thresholded 0/1 'prediction' column collapses the ROC curve to a single
# operating point and yields a misleading AUC.
eval_re = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="Flag_reorder",
    metricName="areaUnderROC",  # the default, spelled out for readers
)

In [0]:
# Compute the evaluator's metric on the scored test rows and echo it as the
# cell output.
# NOTE(review): the name AUC assumes the evaluator's metric is area under ROC
# (the pyspark default when metricName is not set) — confirm.
AUC=eval_re.evaluate(result)
AUC