In [0]:
# Install kagglehub into the notebook's Python environment. `%pip` (rather than
# `!pip`) is used so the package is installed for the kernel that runs this notebook.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
%pip install kagglehub

In [0]:
# Restart the Python process — presumably so the package installed in the previous
# cell becomes importable. Python state defined before this point is not expected to survive.
%restart_python

In [0]:
# First, import the libraries used throughout the notebook
from pyspark.sql.functions import col
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
import mlflow
import mlflow.spark
import kagglehub

In [0]:
# Load the raw CSV into a Spark DataFrame.
#   header=True      -> take column names from the first row; without it Spark names
#                       the columns _c0, _c1, ... and selecting columns by name fails.
#   inferSchema=True -> parse numeric columns as numbers; without it every column is
#                       read as a string, which VectorAssembler does not accept.
# NOTE(review): later cells select Geography / CreditScore / Balance / Exited —
# confirm that this file actually contains those columns.
df = spark.read.csv(
    '/kaggle/input/credit-card-customers/BankChurners.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Sanity-check what was loaded: column names and types first, then the first 20 rows
# (long cell values are truncated in the printed table).
df.printSchema()
df.show(n=20, truncate=True)

In [0]:
# Keep only the columns the model uses: the five feature columns plus the
# "Exited" column that serves as the label.
data = df.select(
    "Geography",
    "Gender",
    "Age",
    "CreditScore",
    "Balance",
    "Exited",
)

In [0]:
# Map each distinct value of the Geography and Gender columns to a numeric index,
# written to new "<name>Index" columns that the assembler consumes later.
Geo_indexer = StringIndexer(
    inputCol="Geography",
    outputCol="GeographyIndex",
)
Gender_indexer = StringIndexer(
    inputCol="Gender",
    outputCol="GenderIndex",
)

In [0]:
# Combine the indexed categorical columns and the numeric columns into the single
# vector column ("feature") that the LogisticRegression stage reads.
# outputCol must be one column name given as a plain string — a list is rejected
# with a TypeError when the stage is constructed.
assembler = VectorAssembler(
    inputCols=["GeographyIndex", "GenderIndex", "Age", "CreditScore", "Balance"],
    outputCol="feature",
)

In [0]:
# Classifier stage: reads the assembled "feature" vector and is trained against
# the "Exited" label column.
lr = LogisticRegression(
    labelCol="Exited",
    featuresCol="feature",
)

In [0]:
# Chain the stages in execution order so that a single fit()/transform() call
# runs the full preprocessing and the model together.
pipeline = Pipeline(
    stages=[
        Geo_indexer,     # Geography -> GeographyIndex
        Gender_indexer,  # Gender    -> GenderIndex
        assembler,       # index + numeric columns -> "feature" vector
        lr,              # logistic regression on "feature" / "Exited"
    ]
)

In [0]:
# Random train/test split with 0.8 / 0.2 weights; the seed is fixed for reproducibility.
train, test = data.randomSplit(weights=[0.8, 0.2], seed=42)

In [0]:
# Train, evaluate and log the model inside a single MLflow run.
with mlflow.start_run():
    # Fit every pipeline stage (indexers -> assembler -> logistic regression) on the training split.
    model = pipeline.fit(train)

    # Score the held-out split with the fitted pipeline.
    predictions = model.transform(test)

    # Only labelCol is set, so the evaluator's default metric is used; it is stored as "AUC".
    evaluator = BinaryClassificationEvaluator(labelCol="Exited")
    auc = evaluator.evaluate(predictions)

    # log_metric(key, value) records one scalar. log_metrics() expects a dict of
    # metrics, so calling it as log_metrics("AUC", auc) fails.
    mlflow.log_metric("AUC", auc)
    mlflow.spark.log_model(model, "logistic_churn_model")

    print(f"AUC = {auc}")



In [0]:
# Preview the first 10 scored rows: the predicted class next to the true label
# and the raw input columns.
predictions.select(
    "prediction",
    "Exited",
    "Geography",
    "Gender",
    "Age",
    "CreditScore",
    "Balance",
).show(10)