In [1]:
from pyspark.sql import SparkSession

In [2]:
import mlflow
import mlflow.spark
import pandas as pd

In [3]:
# MLflow configuration: tracking server address and experiment name.
TRACKING_URI = "http://127.0.0.1:8080"
EXPERIMENT_NAME = "MLflow Quickstart 2"

# Point the MLflow client at the tracking server used for logging.
mlflow.set_tracking_uri(uri=TRACKING_URI)

# Select the experiment by name (last expression, so the Experiment object is displayed).
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/377674707853028999', creation_time=1737431595708, experiment_id='377674707853028999', last_update_time=1737431595708, lifecycle_stage='active', name='MLflow Quickstart 2', tags={}>

In [4]:
# Build (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Final")
    .getOrCreate()
)

# Handle to the session's SparkContext.
sc = spark.sparkContext

25/01/20 22:58:39 WARN Utils: Your hostname, mac.local resolves to a loopback address: 127.0.0.1; using 10.0.0.2 instead (on interface en0)
25/01/20 22:58:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/20 22:58:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
spark

In [6]:
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [7]:
df = spark.read.parquet('./cleaned_dataset')

In [8]:
df.printSchema()

root
 |-- lei: string (nullable = true)
 |-- loan_type: integer (nullable = true)
 |-- loan_purpose: integer (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- loan_term: integer (nullable = true)
 |-- action_taken: integer (nullable = true)
 |-- income: float (nullable = true)
 |-- applicant_age: string (nullable = true)
 |-- applicant_sex: integer (nullable = true)
 |-- applicant_credit_score_type: integer (nullable = true)
 |-- co_applicant_age: string (nullable = true)
 |-- co_applicant_credit_score_type: integer (nullable = true)
 |-- derived_msa_md: integer (nullable = true)
 |-- state_code: string (nullable = true)
 |-- county_code: string (nullable = true)
 |-- property_value: float (nullable = true)
 |-- total_units: integer (nullable = true)
 |-- occupancy_type: integer (nullable = true)



In [9]:
df

lei,loan_type,loan_purpose,loan_amount,interest_rate,loan_term,action_taken,income,applicant_age,applicant_sex,applicant_credit_score_type,co_applicant_age,co_applicant_credit_score_type,derived_msa_md,state_code,county_code,property_value,total_units,occupancy_type
549300YIBRM1ZYA6GT07,1,4,65000.0,3.2904205,180,0,116.0,35-44,1,3,35-44,3,39300,RI,44005,485000.0,3,1
549300YIBRM1ZYA6GT07,1,31,75000.0,2.75,240,1,31.0,65-74,2,3,>74,3,39300,RI,44007,225000.0,1,1
549300YIBRM1ZYA6GT07,1,31,145000.0,3.125,240,1,40.0,55-64,2,3,9999,10,39300,RI,44007,235000.0,1,1
01J4SO3XTWZF4PP38209,1,2,5000.0,3.2904205,60,0,27.0,65-74,1,1,9999,10,32820,TN,47157,75000.0,1,1
01J4SO3XTWZF4PP38209,1,31,25000.0,5.5,120,1,187.0,35-44,2,7,35-44,9,25620,MS,28035,405000.0,1,1
01J4SO3XTWZF4PP38209,2,1,115000.0,3.99,360,0,94.0,8888,4,9,8888,9,99999,MS,28083,125000.0,1,1
01J4SO3XTWZF4PP38209,1,4,15000.0,0.98,120,1,58.0,55-64,2,7,65-74,9,25060,MS,28047,125000.0,2,1
01J4SO3XTWZF4PP38209,2,1,315000.0,2.875,360,1,87.0,25-34,1,1,9999,10,32820,MS,28137,325000.0,1,1
01J4SO3XTWZF4PP38209,1,1,465000.0,3.5,360,1,188.0,35-44,2,3,35-44,3,13820,AL,1117,495000.0,1,1
01J4SO3XTWZF4PP38209,1,32,105000.0,2.625,180,1,65.0,45-54,2,2,9999,10,25060,MS,28047,175000.0,1,1


In [10]:
len(df.columns)

19

In [11]:
df.count()

7427

### 4. Train/test split

In [12]:
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=100)

In [13]:
train_df.count()

5915

In [14]:
test_df.count()

1512

### 5. Feature transformation

In [15]:
from pyspark.sql.types import StringType

In [16]:
from pyspark.ml.feature import StringIndexer, StandardScaler, VectorAssembler, FeatureHasher, OneHotEncoder
from pyspark.ml import Pipeline

In [17]:
## Total columns (including the target column) = 19
## The lists below group the input columns by how they are encoded; the target
## column ('action_taken') is not part of any group.

## Categorical columns
## cat_1 columns (low cardinality; original note: < 10 distinct values) ==> string-index, then one-hot encode
cat_1_cols = ['loan_type', 'loan_purpose', 'applicant_sex', 'total_units', 'occupancy_type', 'applicant_age', 'co_applicant_age', 'applicant_credit_score_type', 'co_applicant_credit_score_type']
## cat_2 columns (many unique categories) ==> label encoding only (string-indexed, not one-hot encoded)
cat_2_cols = ['state_code', 'county_code', 'derived_msa_md', 'lei']

## Numerical columns ==> assembled into one vector and passed through StandardScaler
num_features = ['loan_amount', 'interest_rate', 'loan_term', 'income', 'property_value']

In [18]:
# Encoders for the low-cardinality categorical columns:
# each column is string-indexed first, then the index is one-hot encoded.

cat_1_index_cols = [f"{col_name}_index" for col_name in cat_1_cols]
cat_1_OHE_cols = [f"{col_name}_OHE" for col_name in cat_1_cols]

cat_1_stringIndexer = StringIndexer(
    inputCols=cat_1_cols,
    outputCols=cat_1_index_cols,
    handleInvalid="skip",
)
cat_1_OneHotEncoder = OneHotEncoder(
    inputCols=cat_1_index_cols,
    outputCols=cat_1_OHE_cols,
)

In [19]:
cat_2_index_cols = [ c+"_index" for c in cat_2_cols]

cat_2_stringIndexer = StringIndexer(inputCols=cat_2_cols, outputCols=cat_2_index_cols, handleInvalid="skip")

In [20]:
num_assembler = VectorAssembler(inputCols=num_features, outputCol="num_vector")
num_scaler = StandardScaler(inputCol='num_vector', outputCol='num_scaled_vector')

In [21]:
# assemble all the features together
X_assembler = VectorAssembler(inputCols=['num_scaled_vector'] + cat_1_OHE_cols + cat_2_index_cols, outputCol='features' )

In [22]:
# make the Pipeline
transformPipeline = Pipeline(stages = [cat_1_stringIndexer,cat_1_OneHotEncoder, cat_2_stringIndexer, num_assembler, num_scaler, X_assembler])

In [23]:
# train it
transformPipeModel = transformPipeline.fit(train_df)

In [24]:
# log pipeline
# Log inside an explicit run that is closed when the `with` block exits.
# Calling log_model with no active run makes MLflow start a run implicitly and
# leave it active, and a later `mlflow.start_run(...)` then fails because a run
# is already active.
with mlflow.start_run(run_name="transform pipeline"):
    pipeline_model_info = mlflow.spark.log_model(transformPipeModel, 'transform pipeline')

# Display the ModelInfo returned by log_model, as the original cell did.
pipeline_model_info

25/01/20 22:58:45 ERROR Instrumentation: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "mlflow-artifacts"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:673)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.super$save(Pipeline.scala:344)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.$anonfun$save$4(Pipeline.scala:344)
	at org.apache.spark.ml.MLEvents.withSaveInst

<mlflow.models.model.ModelInfo at 0x1303da420>

In [25]:
transformPipeModel.write().overwrite().save('./models/transformPipeModel')

In [26]:
# Add the engineered columns (indexes, one-hot vectors, scaled numerics, 'features')
# to the training frame. The guard makes re-running this cell a no-op: transforming
# an already-transformed frame fails because the pipeline's output columns already exist.
if 'features' not in train_df.columns:
    train_df = transformPipeModel.transform(train_df)

In [27]:
train_df

25/01/20 22:58:55 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


lei,loan_type,loan_purpose,loan_amount,interest_rate,loan_term,action_taken,income,applicant_age,applicant_sex,applicant_credit_score_type,co_applicant_age,co_applicant_credit_score_type,derived_msa_md,state_code,county_code,property_value,total_units,occupancy_type,loan_type_index,loan_purpose_index,applicant_sex_index,total_units_index,occupancy_type_index,applicant_age_index,co_applicant_age_index,applicant_credit_score_type_index,co_applicant_credit_score_type_index,loan_type_OHE,loan_purpose_OHE,applicant_sex_OHE,total_units_OHE,occupancy_type_OHE,applicant_age_OHE,co_applicant_age_OHE,applicant_credit_score_type_OHE,co_applicant_credit_score_type_OHE,state_code_index,county_code_index,derived_msa_md_index,lei_index,num_vector,num_scaled_vector,features
01J4SO3XTWZF4PP38209,1,1,465000.0,3.5,360,1,188.0,35-44,2,3,35-44,3,13820,AL,1117,495000.0,1,1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,4.0,"(3,[0],[1.0])","(5,[1],[1.0])","(4,[1],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[0],[1.0])","(8,[1],[1.0])","(8,[2],[1.0])","(9,[4],[1.0])",26.0,146.0,67.0,93.0,"[465000.0,3.5,360...",[2.04958786384093...,"(58,[0,1,2,3,4,5,..."
01J4SO3XTWZF4PP38209,1,2,5000.0,3.2904205,60,0,27.0,65-74,1,1,9999,10,32820,TN,47157,75000.0,1,1,0.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,"(3,[0],[1.0])","(5,[3],[1.0])","(4,[0],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[4],[1.0])","(8,[0],[1.0])","(8,[0],[1.0])","(9,[0],[1.0])",20.0,100.0,66.0,93.0,"[5000.0,3.2904205...",[0.02203857918108...,"(58,[0,1,2,3,4,5,..."
01J4SO3XTWZF4PP38209,1,4,15000.0,0.98,120,1,58.0,55-64,2,7,65-74,9,25060,MS,28047,125000.0,2,1,0.0,4.0,1.0,1.0,0.0,3.0,6.0,5.0,1.0,"(3,[0],[1.0])","(5,[4],[1.0])","(4,[1],[1.0])","(3,[1],[1.0])","(2,[0],[1.0])","(7,[3],[1.0])","(8,[6],[1.0])","(8,[5],[1.0])","(9,[1],[1.0])",36.0,284.0,174.0,93.0,"[15000.0,0.980000...",[0.06611573754325...,"(58,[0,1,2,3,4,5,..."
01J4SO3XTWZF4PP38209,1,31,25000.0,5.5,120,1,187.0,35-44,2,7,35-44,9,25620,MS,28035,405000.0,1,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,5.0,1.0,"(3,[0],[1.0])","(5,[0],[1.0])","(4,[1],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[0],[1.0])","(8,[1],[1.0])","(8,[5],[1.0])","(9,[1],[1.0])",36.0,441.0,225.0,93.0,"[25000.0,5.5,120....",[0.11019289590542...,"(58,[0,1,2,3,4,5,..."
01J4SO3XTWZF4PP38209,1,31,45000.0,4.25,59,1,94.0,8888,4,9,9999,10,27140,MS,28049,75000.0,1,3,0.0,0.0,2.0,0.0,1.0,5.0,0.0,3.0,0.0,"(3,[0],[1.0])","(5,[0],[1.0])","(4,[2],[1.0])","(3,[0],[1.0])","(2,[1],[1.0])","(7,[5],[1.0])","(8,[0],[1.0])","(8,[3],[1.0])","(9,[0],[1.0])",36.0,914.0,179.0,93.0,"[45000.0,4.25,59....",[0.19834721262976...,"(58,[0,1,2,3,4,5,..."
01J4SO3XTWZF4PP38209,1,31,205000.0,3.0,360,1,97.0,65-74,1,1,65-74,1,19300,AL,1003,375000.0,1,1,0.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,2.0,"(3,[0],[1.0])","(5,[0],[1.0])","(4,[0],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[4],[1.0])","(8,[6],[1.0])","(8,[0],[1.0])","(9,[2],[1.0])",26.0,145.0,126.0,93.0,"[205000.0,3.0,360...",[0.90358174642449...,"(58,[0,1,2,3,4,5,..."
01J4SO3XTWZF4PP38209,1,32,105000.0,2.625,180,1,65.0,45-54,2,2,9999,10,25060,MS,28047,175000.0,1,1,0.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,"(3,[0],[1.0])","(5,[2],[1.0])","(4,[1],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[1],[1.0])","(8,[0],[1.0])","(8,[1],[1.0])","(9,[0],[1.0])",36.0,284.0,174.0,93.0,"[105000.0,2.625,1...",[0.46281016280279...,"(58,[0,1,2,3,4,5,..."
01J4SO3XTWZF4PP38209,2,1,115000.0,3.99,360,0,94.0,8888,4,9,8888,9,99999,MS,28083,125000.0,1,1,1.0,1.0,2.0,0.0,0.0,5.0,4.0,3.0,1.0,"(3,[1],[1.0])","(5,[1],[1.0])","(4,[2],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[5],[1.0])","(8,[4],[1.0])","(8,[3],[1.0])","(9,[1],[1.0])",36.0,918.0,0.0,93.0,"[115000.0,3.99000...",[0.50688732116496...,"(58,[0,1,2,3,4,6,..."
01J4SO3XTWZF4PP38209,2,1,315000.0,2.875,360,1,87.0,25-34,1,1,9999,10,32820,MS,28137,325000.0,1,1,1.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,"(3,[1],[1.0])","(5,[1],[1.0])","(4,[0],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[2],[1.0])","(8,[0],[1.0])","(8,[0],[1.0])","(9,[0],[1.0])",36.0,443.0,66.0,93.0,"[315000.0,2.875,3...",[1.38843048840837...,"(58,[0,1,2,3,4,6,..."
0S8H5NJFLHEVJXVTQ413,1,32,285000.0,3.5,360,1,64.0,25-34,3,1,25-34,1,13380,WA,53073,355000.0,1,1,0.0,2.0,3.0,0.0,0.0,2.0,3.0,0.0,2.0,"(3,[0],[1.0])","(5,[2],[1.0])","(4,[3],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[2],[1.0])","(8,[3],[1.0])","(8,[0],[1.0])","(9,[2],[1.0])",10.0,221.0,153.0,577.0,"[285000.0,3.5,360...",[1.25619901332186...,"(58,[0,1,2,3,4,5,..."


### 6. Model training and evaluation

In [28]:
from pyspark.ml.classification import LogisticRegression, LinearSVC, NaiveBayes, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, FMClassifier

In [29]:
# evaluate the model from 'y_true' and 'y_pred' 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [30]:
def evaluate_model(predictions, label_col='action_taken', prediction_col='prediction', raw_prediction_col='rawPrediction'):
    """Compute classification metrics for a predictions DataFrame.

    Args:
        predictions: DataFrame holding the label, prediction and raw-prediction columns.
        label_col: Name of the ground-truth label column.
        prediction_col: Name of the predicted-label column.
        raw_prediction_col: Name of the raw-prediction column used for the ROC metric.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall', 'f1_score' and 'roc_auc'.
        'precision' and 'recall' are the evaluator's weightedPrecision / weightedRecall.
    """
    # Result key -> metricName understood by MulticlassClassificationEvaluator.
    multiclass_metric_names = {
        'accuracy': 'accuracy',
        'precision': 'weightedPrecision',
        'recall': 'weightedRecall',
        'f1_score': 'f1',
    }

    # One evaluator per label-based metric, evaluated in the order listed above.
    metrics = {}
    for result_key, metric_name in multiclass_metric_names.items():
        evaluator = MulticlassClassificationEvaluator(
            labelCol=label_col,
            predictionCol=prediction_col,
            metricName=metric_name,
        )
        metrics[result_key] = evaluator.evaluate(predictions)

    # Area under ROC is computed from the raw prediction column instead.
    roc_evaluator = BinaryClassificationEvaluator(
        labelCol=label_col,
        rawPredictionCol=raw_prediction_col,
        metricName='areaUnderROC',
    )
    metrics['roc_auc'] = roc_evaluator.evaluate(predictions)

    return metrics

In [31]:
# Candidate classifiers keyed by display name. The training loop uses the key as
# the MLflow run name and as the folder name under ./models/.
_io_cols = dict(featuresCol='features', labelCol='action_taken')
# The tree-based models additionally get maxBins=2000.
_tree_opts = dict(maxBins=2000, **_io_cols)

models = {
    'Logistic Regression': LogisticRegression(**_io_cols),
    'Support Vector Machine': LinearSVC(**_io_cols),
    # 'Naive Bayes': NaiveBayes(**_io_cols),
    'Factorization Machine': FMClassifier(**_io_cols),
    'Decision Tree': DecisionTreeClassifier(**_tree_opts),
    'Random Forest': RandomForestClassifier(**_tree_opts),
    'Gradient Boosting Trees': GBTClassifier(**_tree_opts),
}

In [32]:
def getModelHyperparams(model):
    """Return the model's parameter map as a plain ``{param name: value}`` dict.

    Args:
        model: Object exposing ``extractParamMap()``, whose keys have a ``.name`` attribute.

    Returns:
        dict mapping each parameter's name to its value.
    """
    hyperparams = {}
    for param, value in model.extractParamMap().items():
        hyperparams[param.name] = value
    return hyperparams

In [35]:
# Train every candidate model, log it to MLflow, score it on the test set and
# save it locally under ./models/<name>.

# The fitted feature pipeline and test_df are the same for every model, so the
# test set is transformed once here instead of once per loop iteration.
test_df_transformed = transformPipeModel.transform(test_df)

for algo, model in models.items():

    # One MLflow run per algorithm; the run is closed when the block exits.
    with mlflow.start_run(run_name=algo) as run:

        print(f"========== {algo} ============")

        # log model params
        params = getModelHyperparams(model)
        mlflow.log_params(params)

        # Train the model
        trained_model = model.fit(train_df)
        # log model
        mlflow.spark.log_model(trained_model, "model")

        # Evaluate on Test data
        test_predictions = trained_model.transform(test_df_transformed)

        results = evaluate_model(test_predictions)

        # log metrics
        mlflow.log_metrics({"Accuracy": results['accuracy'], "Precision":results['precision'], "Recall":results['recall'], "F1-score":results['f1_score'], "ROC":results['roc_auc']})

        # Keep a local copy as well; the inference section loads models from ./models/.
        trained_model.write().overwrite().save(f'./models/{algo}')

        print('\n')




25/01/20 22:59:31 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/01/20 22:59:31 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
25/01/20 22:59:35 ERROR Instrumentation: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "mlflow-artifacts"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:673)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at org.apa



🏃 View run Logistic Regression at: http://127.0.0.1:8080/#/experiments/377674707853028999/runs/4afd0a83f8144d2288231f78725d715c
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/377674707853028999


25/01/20 22:59:52 ERROR Instrumentation: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "mlflow-artifacts"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:673)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.super$save(Pipeline.scala:344)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.$anonfun$save$4(Pipeline.scala:344)
	at org.apache.spark.ml.MLEvents.withSaveInst



🏃 View run Support Vector Machine at: http://127.0.0.1:8080/#/experiments/377674707853028999/runs/a796b4955fa64c7b882e02967bce81d5
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/377674707853028999


25/01/20 23:00:03 ERROR Instrumentation: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "mlflow-artifacts"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:673)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.super$save(Pipeline.scala:344)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.$anonfun$save$4(Pipeline.scala:344)
	at org.apache.spark.ml.MLEvents.withSaveInst



🏃 View run Factorization Machine at: http://127.0.0.1:8080/#/experiments/377674707853028999/runs/718ddbebd4a94c1da2171625b9b3ca30
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/377674707853028999


25/01/20 23:00:12 ERROR Instrumentation: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "mlflow-artifacts"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:673)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.super$save(Pipeline.scala:344)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.$anonfun$save$4(Pipeline.scala:344)
	at org.apache.spark.ml.MLEvents.withSaveInst



🏃 View run Decision Tree at: http://127.0.0.1:8080/#/experiments/377674707853028999/runs/8ffceefb0a4d48c7bae4acb634e10096
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/377674707853028999


25/01/20 23:00:21 ERROR Instrumentation: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "mlflow-artifacts"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:673)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.super$save(Pipeline.scala:344)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.$anonfun$save$4(Pipeline.scala:344)
	at org.apache.spark.ml.MLEvents.withSaveInst



🏃 View run Random Forest at: http://127.0.0.1:8080/#/experiments/377674707853028999/runs/9242f963136b41e6ac7cdd3e3ce497f6
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/377674707853028999


25/01/20 23:00:33 WARN DAGScheduler: Broadcasting large task binary with size 1005.5 KiB
25/01/20 23:00:33 WARN DAGScheduler: Broadcasting large task binary with size 1016.5 KiB
25/01/20 23:00:33 WARN DAGScheduler: Broadcasting large task binary with size 1031.3 KiB
25/01/20 23:00:33 WARN DAGScheduler: Broadcasting large task binary with size 1054.1 KiB
25/01/20 23:00:34 WARN DAGScheduler: Broadcasting large task binary with size 1054.6 KiB
25/01/20 23:00:34 WARN DAGScheduler: Broadcasting large task binary with size 1056.5 KiB
25/01/20 23:00:34 WARN DAGScheduler: Broadcasting large task binary with size 1059.8 KiB
25/01/20 23:00:34 WARN DAGScheduler: Broadcasting large task binary with size 1066.4 KiB
25/01/20 23:00:34 WARN DAGScheduler: Broadcasting large task binary with size 1098.0 KiB
25/01/20 23:00:34 WARN DAGScheduler: Broadcasting large task binary with size 1098.5 KiB
25/01/20 23:00:34 WARN DAGScheduler: Broadcasting large task binary with size 1099.3 KiB
25/01/20 23:00:34 WAR



🏃 View run Gradient Boosting Trees at: http://127.0.0.1:8080/#/experiments/377674707853028999/runs/be2eca69a79c4fa989f75377f7dc4a1d
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/377674707853028999


In [36]:
mlflow.end_run()

#### Loading the logged pipeline and model with MLflow

In [40]:
# NOTE(review): both run IDs below are hard-coded. Neither appears among the run URLs
# printed by the training loop in this notebook, and the load log resolves them under a
# different experiment ID, so they presumably come from an earlier session. Replace them
# with the IDs of the runs you actually want to load.
logged_pipeline = 'runs:/bb974a3843a3442a95235a7db9b74a3b/transform pipeline'
# Load pipeline model
loaded_pipeline = mlflow.spark.load_model(logged_pipeline)

logged_model = 'runs:/082e38361608410aae2d962dc8288b3f/model'
# Load model
loaded_model = mlflow.spark.load_model(logged_model)

2025/01/20 23:01:32 INFO mlflow.spark: 'runs:/bb974a3843a3442a95235a7db9b74a3b/transform pipeline' resolved as 'mlflow-artifacts:/279265602963958873/bb974a3843a3442a95235a7db9b74a3b/artifacts/transform pipeline'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/48 [00:00<?, ?it/s]

2025/01/20 23:01:32 INFO mlflow.spark: URI 'runs:/bb974a3843a3442a95235a7db9b74a3b/transform pipeline/sparkml' does not point to the current DFS.
2025/01/20 23:01:32 INFO mlflow.spark: File 'runs:/bb974a3843a3442a95235a7db9b74a3b/transform pipeline/sparkml' not found on DFS. Will attempt to upload the file.
2025/01/20 23:01:33 INFO mlflow.spark: 'runs:/082e38361608410aae2d962dc8288b3f/model' resolved as 'mlflow-artifacts:/279265602963958873/082e38361608410aae2d962dc8288b3f/artifacts/model'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/16 [00:00<?, ?it/s]

2025/01/20 23:01:33 INFO mlflow.spark: URI 'runs:/082e38361608410aae2d962dc8288b3f/model/sparkml' does not point to the current DFS.
2025/01/20 23:01:33 INFO mlflow.spark: File 'runs:/082e38361608410aae2d962dc8288b3f/model/sparkml' not found on DFS. Will attempt to upload the file.


In [41]:
# Label-free copy of the test set for inference. `test_df` itself is left untouched:
# overwriting it here would remove the 'action_taken' column that the later
# evaluation of the tuned model still needs.
test_features_df = test_df.drop('action_taken')

In [43]:
# Run the MLflow-loaded pipeline and model on unlabeled test data. The label is
# dropped inline so this cell gives the same result whether or not `test_df`
# still carries 'action_taken' (dropping a missing column is a no-op in Spark).
test_transformed_df = loaded_pipeline.transform(test_df.drop('action_taken'))
preds_df = loaded_model.transform(test_transformed_df)

In [44]:
preds_df

lei,loan_type,loan_purpose,loan_amount,interest_rate,loan_term,income,applicant_age,applicant_sex,applicant_credit_score_type,co_applicant_age,co_applicant_credit_score_type,derived_msa_md,state_code,county_code,property_value,total_units,occupancy_type,loan_type_index,loan_purpose_index,applicant_sex_index,total_units_index,occupancy_type_index,applicant_age_index,co_applicant_age_index,applicant_credit_score_type_index,co_applicant_credit_score_type_index,loan_type_OHE,loan_purpose_OHE,applicant_sex_OHE,total_units_OHE,occupancy_type_OHE,applicant_age_OHE,co_applicant_age_OHE,applicant_credit_score_type_OHE,co_applicant_credit_score_type_OHE,state_code_index,county_code_index,derived_msa_md_index,lei_index,num_vector,num_scaled_vector,features,rawPrediction,probability,prediction
01J4SO3XTWZF4PP38209,1,31,15000.0,4.95,60,94.0,8888,4,9,9999,10,25060,MS,28047,45000.0,1,3,0.0,0.0,2.0,0.0,1.0,5.0,0.0,3.0,0.0,"(3,[0],[1.0])","(5,[0],[1.0])","(4,[2],[1.0])","(3,[0],[1.0])","(2,[1],[1.0])","(7,[5],[1.0])","(8,[0],[1.0])","(8,[3],[1.0])","(9,[0],[1.0])",36.0,284.0,174.0,93.0,"[15000.0,4.949999...",[0.06611573754325...,"(58,[0,1,2,3,4,5,...",[0.64475038483749...,[0.65582650073893...,0.0
0S8H5NJFLHEVJXVTQ413,1,31,435000.0,3.2904205,360,124.0,35-44,1,1,35-44,1,42644,WA,53061,655000.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,"(3,[0],[1.0])","(5,[0],[1.0])","(4,[0],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[0],[1.0])","(8,[1],[1.0])","(8,[0],[1.0])","(9,[2],[1.0])",10.0,66.0,17.0,577.0,"[435000.0,3.29042...",[1.91735638875442...,"(58,[0,1,2,3,4,5,...",[-2.8612907730043...,[0.05410060877803...,1.0
1IE8VN30JCEQV1H4R804,1,31,215000.0,3.2904205,360,109.0,55-64,1,2,9999,10,22744,FL,12011,355000.0,1,1,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,"(3,[0],[1.0])","(5,[0],[1.0])","(4,[0],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[3],[1.0])","(8,[0],[1.0])","(8,[1],[1.0])","(9,[0],[1.0])",1.0,21.0,45.0,349.0,"[215000.0,3.29042...",[0.94765890478666...,"(58,[0,1,2,3,4,5,...",[-1.5122500108184...,[0.18060558102033...,1.0
213800XR2TCBQJSF1X93,1,31,365000.0,2.875,360,102.0,35-44,1,2,9999,10,31084,CA,6037,625000.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,"(3,[0],[1.0])","(5,[0],[1.0])","(4,[0],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[0],[1.0])","(8,[0],[1.0])","(8,[1],[1.0])","(9,[0],[1.0])",0.0,0.0,2.0,140.0,"[365000.0,2.875,3...",[1.60881628021922...,"(58,[0,1,2,3,4,5,...",[-1.7766100765648...,[0.14472222588682...,1.0
2549006II76YXSS5XM65,1,31,205000.0,3.125,360,110.0,45-54,2,3,9999,10,47894,VA,51059,325000.0,1,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,"(3,[0],[1.0])","(5,[0],[1.0])","(4,[1],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[1],[1.0])","(8,[0],[1.0])","(8,[2],[1.0])","(9,[0],[1.0])",12.0,14.0,4.0,204.0,"[205000.0,3.125,3...",[0.90358174642449...,"(58,[0,1,2,3,4,5,...",[-1.5855524675315...,[0.17001055400556...,1.0
254900ACUWEGW702BR80,1,31,275000.0,3.0,360,78.0,45-54,2,3,9999,10,15804,NJ,34007,305000.0,1,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,"(3,[0],[1.0])","(5,[0],[1.0])","(4,[1],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[1],[1.0])","(8,[0],[1.0])","(8,[2],[1.0])","(9,[0],[1.0])",11.0,183.0,56.0,353.0,"[275000.0,3.0,360...",[1.21212185495969...,"(58,[0,1,2,3,4,5,...",[-1.8713743428235...,[0.13338277969392...,1.0
254900ACUWEGW702BR80,1,31,395000.0,3.0,360,332.0,35-44,1,1,>74,9,35614,NJ,34003,515000.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,1.0,"(3,[0],[1.0])","(5,[0],[1.0])","(4,[0],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[0],[1.0])","(8,[7],[1.0])","(8,[0],[1.0])","(9,[1],[1.0])",11.0,52.0,10.0,353.0,"[395000.0,3.0,360...",[1.74104775530573...,"(58,[0,1,2,3,4,5,...",[-2.5163454881656...,[0.07472021517676...,1.0
254900HA4DQWAE0W3342,1,1,335000.0,2.625,360,81.0,35-44,4,9,35-44,9,19124,TX,48085,415000.0,1,1,0.0,1.0,2.0,0.0,0.0,0.0,1.0,3.0,1.0,"(3,[0],[1.0])","(5,[1],[1.0])","(4,[2],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[0],[1.0])","(8,[1],[1.0])","(8,[3],[1.0])","(9,[1],[1.0])",2.0,50.0,12.0,9.0,"[335000.0,2.625,3...",[1.47658480513271...,"(58,[0,1,2,3,4,5,...",[2.12763636553943...,[0.89356041160467...,0.0
254900HA4DQWAE0W3342,1,1,505000.0,2.5,360,265.0,45-54,4,9,35-44,9,19740,CO,8059,1035000.0,1,1,0.0,1.0,2.0,0.0,0.0,1.0,1.0,3.0,1.0,"(3,[0],[1.0])","(5,[1],[1.0])","(4,[2],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[1],[1.0])","(8,[1],[1.0])","(8,[3],[1.0])","(9,[1],[1.0])",6.0,37.0,6.0,9.0,"[505000.0,2.5,360...",[2.22589649728961...,"(58,[0,1,2,3,4,5,...",[2.06229132845680...,[0.88718370972536...,0.0
254900HA4DQWAE0W3342,1,31,135000.0,3.375,360,39.0,25-34,4,9,9999,9,36540,NE,31055,155000.0,1,1,0.0,0.0,2.0,0.0,0.0,2.0,0.0,3.0,1.0,"(3,[0],[1.0])","(5,[0],[1.0])","(4,[2],[1.0])","(3,[0],[1.0])","(2,[0],[1.0])","(7,[2],[1.0])","(8,[0],[1.0])","(8,[3],[1.0])","(9,[1],[1.0])",34.0,93.0,57.0,9.0,"[135000.0,3.375,3...",[0.59504163788930...,"(58,[0,1,2,3,4,5,...",[4.37657193065208...,[0.98758763312067...,0.0


### 7. Hyperparameter tuning (GBT model)

In [33]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit


In [34]:
# Base estimator for the search; the grid overrides maxDepth, maxBins and stepSize.
gbt = GBTClassifier(featuresCol='features', labelCol='action_taken')

# Three values per parameter, i.e. 3 x 3 x 3 = 27 combinations to fit.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [3, 5, 7])
    .addGrid(gbt.maxBins, [2000, 3000, 5000])
    .addGrid(gbt.stepSize, [0.05, 0.1, 0.2])
    .build()
)

In [35]:
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='action_taken', predictionCol='prediction', metricName='accuracy')


In [36]:
train_validator_model = TrainValidationSplit(estimator=gbt,
                                            estimatorParamMaps=paramGrid,
                                            evaluator=evaluator_accuracy,
                                            trainRatio=0.8)

In [37]:
trained_tv_model = train_validator_model.fit(train_df)

best_gbt_model = trained_tv_model.bestModel

25/01/19 18:38:02 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 5000 to 4724 (= number of training instances)
25/01/19 18:38:05 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 5000 to 4724 (= number of training instances)
25/01/19 18:38:08 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 5000 to 4724 (= number of training instances)
25/01/19 18:38:12 WARN DAGScheduler: Broadcasting large task binary with size 1009.5 KiB
25/01/19 18:38:13 WARN DAGScheduler: Broadcasting large task binary with size 1014.6 KiB
25/01/19 18:38:13 WARN DAGScheduler: Broadcasting large task binary with size 1025.8 KiB
25/01/19 18:38:13 WARN DAGScheduler: Broadcasting large task binary with size 1041.7 KiB
25/01/19 18:38:13 WARN DAGScheduler: Broadcasting large task binary with size 1050.6 KiB
25/01/19 18:38:13 WARN DAGScheduler: Broadcasting large task binary with size 1066.1 KiB
25/01/19 18:38:13 WARN DAGScheduler: Broadcasting large task binary with size 1070.5 KiB

KeyboardInterrupt: 

In [None]:
# Evaluate on Test data
# Score the tuned GBT model on the held-out data.
# NOTE(review): evaluate_model reads the 'action_taken' label column by default, so
# `test_df` must still contain that column when this cell runs. `best_gbt_model` is
# only defined once the tuning fit has completed.
test_df_transformed = transformPipeModel.transform(test_df)
test_predictions = best_gbt_model.transform(test_df_transformed)

results = evaluate_model(test_predictions)

In [None]:
results

## Inference

In [None]:
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegressionModel, GBTClassificationModel

In [None]:
test_df = test_df.drop('action_taken')

In [None]:
len(test_df.columns)

In [None]:
gbm_model = GBTClassificationModel.load("models/Gradient Boosting Trees")

In [None]:
gbm_model

In [None]:
transform_model = PipelineModel.load('models/transformPipeModel')

In [None]:
test_df.count()

In [None]:
# Preprocess the test data and then make predictions 

test_transformed_df = transform_model.transform(test_df)
prediction_df = gbm_model.transform(test_transformed_df)

In [None]:
prediction_df

In [None]:
prediction_df.select('prediction')

In [None]:
test_point = test_df.limit(1)

In [None]:
test_point

In [None]:
test_point.printSchema()

In [None]:
transform_point = transform_model.transform(test_point)
predict_point = gbm_model.transform(transform_point)

In [None]:
result = predict_point.select('prediction')

In [None]:
result.collect()[0][0]