In [0]:
# Mount the "cleaned-data" ADLS Gen2 container under DBFS.
# Authentication uses a custom access-token provider whose class name is read
# from the cluster's passthrough Spark setting (presumably Azure AD credential
# passthrough -- confirm the cluster has it enabled).
configs = {
    "fs.azure.account.auth.type": "CustomAccessToken",
    "fs.azure.account.custom.token.provider.class": spark.conf.get("spark.databricks.passthrough.adls.gen2.tokenProviderClassName")
}

mount_point = "/mnt/cleaned-data"

# Remount from scratch so the mount always reflects the configs above.
# Check the registered mounts explicitly instead of calling unmount and
# swallowing every exception: a blanket `except Exception` also hides real
# failures (e.g. permission errors), after which the mount call below would
# fail with a far less helpful message.
if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point)

# Kept as the last expression so the notebook shows the call's return value.
dbutils.fs.mount(
    source = "abfss://cleaned-data@datalake0012anee.dfs.core.windows.net/",
    mount_point = mount_point,
    extra_configs = configs
)


/mnt/cleaned-data has been unmounted.


True

In [0]:
# List what is stored under the mount point; the bare name on the last line
# is what the notebook renders as this cell's output.
mounted_files = dbutils.fs.ls("/mnt/cleaned-data")
mounted_files

[FileInfo(path='dbfs:/mnt/cleaned-data/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1710087341000),
 FileInfo(path='dbfs:/mnt/cleaned-data/_committed_4908096667950482827', name='_committed_4908096667950482827', size=123, modificationTime=1710087341000),
 FileInfo(path='dbfs:/mnt/cleaned-data/_started_4908096667950482827', name='_started_4908096667950482827', size=0, modificationTime=1710087340000),
 FileInfo(path='dbfs:/mnt/cleaned-data/part-00000-tid-4908096667950482827-a823fdb6-3441-400a-9750-78d8a0451478-65-1.c000.snappy.parquet', name='part-00000-tid-4908096667950482827-a823fdb6-3441-400a-9750-78d8a0451478-65-1.c000.snappy.parquet', size=3113195, modificationTime=1710087341000)]

In [0]:
# Point at the dataset directory rather than one specific part file: the
# part-file name is generated by Spark on every write, so a hard-coded name
# breaks as soon as the upstream job rewrites the data. Spark's Parquet reader
# skips the underscore-prefixed marker files (_SUCCESS, _committed_*,
# _started_*) when it lists the directory.
input_path = "/mnt/cleaned-data"

In [0]:
# Load the cleaned dataset. Parquet files embed their own schema, so the
# CSV-style `header` / `inferSchema` options had no effect on this reader and
# have been removed.
df = spark.read.parquet(input_path)

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# Alias the DataFrame read above under the name the modelling cells use.
df_cleaned = df

# Columns that are string-indexed and then one-hot encoded.
categoricalCols = [
    "gender",
    "business_or_commercial",
    "region",
    "age",
]

# Columns fed into the feature vector without any encoding step.
numericCols = [
    "loan_amount",
    "rate_of_interest",
    "interest_rate_spread",
    "upfront_charges",
    "credit_score",
    "property_value",
    "ltv",
    "dtir1",
    "income",
    "term",
]


In [0]:
# One StringIndexer per categorical column, writing to "<column>_index".
# handleInvalid="skip" drops rows whose value the fitted indexer cannot map
# (e.g. a category absent from the training split) instead of raising.
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_index", handleInvalid="skip")
    for name in categoricalCols
]

# One OneHotEncoder per indexer, turning "<column>_index" into the vector
# column "<column>_index_vec".
encoders = [
    OneHotEncoder(inputCol=index_col, outputCol=index_col + "_vec")
    for index_col in (stage.getOutputCol() for stage in indexers)
]


In [0]:
# Feature vector layout: the one-hot encoded categorical columns first,
# followed by the raw numeric columns, all packed into "features".
assemblerInputs = [
    *(stage.getOutputCol() for stage in encoders),
    *numericCols,
]
assembler = VectorAssembler(
    outputCol="features",
    inputCols=assemblerInputs,
)

In [0]:
# Classifier stage: reads the assembled "features" vector and learns to
# predict the "status" column. All other hyperparameters keep their defaults.
lr = LogisticRegression(
    labelCol="status",
    featuresCol="features",
)

In [0]:
# Stage order matters: index the categorical columns, one-hot encode the
# indices, assemble the feature vector, then fit the classifier.
pipeline = Pipeline(stages=[*indexers, *encoders, assembler, lr])

In [0]:
# Seeded 70/30 random split into a training set and a hold-out set.
train_data, test_data = df_cleaned.randomSplit(weights=[0.7, 0.3], seed=42)

# Fit all pipeline stages (indexers, encoders, assembler, classifier) on the
# training rows only.
model = pipeline.fit(train_data)

# Score the hold-out rows with the fitted pipeline; the evaluation cell reads
# the resulting "prediction" column.
predictions = model.transform(test_data)

In [0]:
# Score the hold-out predictions: accuracy compares the "prediction" column
# with the "status" label and is reported as a percentage.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="status",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy * 100) + "%")

Test set accuracy = 75.58681239921161%
