In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder \
    .appName("Final") \
    .getOrCreate()

sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/04 00:36:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark

In [4]:
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [5]:
df = spark.read.parquet('./cleaned_dataset')

In [6]:
df.printSchema()

root
 |-- lei: string (nullable = true)
 |-- loan_type: integer (nullable = true)
 |-- loan_purpose: integer (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- loan_term: integer (nullable = true)
 |-- action_taken: integer (nullable = true)
 |-- income: float (nullable = true)
 |-- applicant_age: string (nullable = true)
 |-- applicant_sex: integer (nullable = true)
 |-- applicant_credit_score_type: integer (nullable = true)
 |-- co_applicant_age: string (nullable = true)
 |-- co_applicant_credit_score_type: integer (nullable = true)
 |-- derived_msa_md: integer (nullable = true)
 |-- state_code: string (nullable = true)
 |-- county_code: string (nullable = true)
 |-- property_value: float (nullable = true)
 |-- total_units: integer (nullable = true)
 |-- occupancy_type: integer (nullable = true)



In [7]:
len(df.columns)

19

In [8]:
df.count()

7366

### 4. Train Test split

In [9]:
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=100)

In [10]:
train_df.count()

5862

In [11]:
test_df.count()

1504

### 5. Feature transformation

In [12]:
from pyspark.sql.types import StringType

In [13]:
from pyspark.ml.feature import StringIndexer, StandardScaler, VectorAssembler, FeatureHasher
from pyspark.ml import Pipeline

In [14]:
# Define categorical and numerical features
categorical_features = [field.name for field in train_df.schema.fields if field.dataType is StringType()]
numerical_features = [field for field in train_df.columns if field not in categorical_features]

continuous_numerical_features = ["loan_amount", "interest_rate", "loan_term", "income", "property_value"]
discrete_numerical_features = [item for item in numerical_features if item not in continuous_numerical_features]

In [15]:
categorical_features.remove('county_code')

In [16]:
categorical_features

['lei', 'applicant_age', 'co_applicant_age', 'state_code']

In [17]:
continuous_numerical_features

['loan_amount', 'interest_rate', 'loan_term', 'income', 'property_value']

In [18]:
discrete_numerical_features

['loan_type',
 'loan_purpose',
 'action_taken',
 'applicant_sex',
 'applicant_credit_score_type',
 'co_applicant_credit_score_type',
 'derived_msa_md',
 'total_units',
 'occupancy_type']

In [19]:
cat_indexed_features = [f"{cat}_indexed" for cat in categorical_features]

In [20]:
# perform Label encoding on categorical features
labelEncoder = StringIndexer(inputCols=categorical_features, 
                           outputCols=cat_indexed_features, handleInvalid="skip")

In [21]:
# perform feature hashing on 'county_code' as a lot of distinct values
hasher = FeatureHasher(inputCols=["county_code"], outputCol="county_code_hashed", numFeatures=1000)

In [22]:
# perform Standard scaling on continuous numerical features
numAssembler = VectorAssembler(inputCols=continuous_numerical_features, 
                            outputCol="con_num_features")

numScaler = StandardScaler(inputCol="con_num_features", outputCol="con_num_features_scaled")

In [23]:
# assemble all the features together
featureAssembler = VectorAssembler(inputCols=["con_num_features_scaled", "county_code_hashed"]+cat_indexed_features, outputCol='features')

In [24]:
# make the Pipeline
transformPipeline = Pipeline(stages = [labelEncoder, hasher, numAssembler, numScaler, featureAssembler])

In [25]:
# train it
transformPipeModel = transformPipeline.fit(train_df)

In [26]:
train_df = transformPipeModel.transform(train_df)

In [27]:
train_df

24/12/04 00:36:44 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


lei,loan_type,loan_purpose,loan_amount,interest_rate,loan_term,action_taken,income,applicant_age,applicant_sex,applicant_credit_score_type,co_applicant_age,co_applicant_credit_score_type,derived_msa_md,state_code,county_code,property_value,total_units,occupancy_type,lei_indexed,applicant_age_indexed,co_applicant_age_indexed,state_code_indexed,county_code_hashed,con_num_features,con_num_features_scaled,features
1IE8VN30JCEQV1H4R804,1,1,495000.0,3.5,360,0,202.0,45-54,1,2,45-54,9,35614,NY,36119,785000.0,1,1,260.0,1.0,2.0,7.0,"(1000,[493],[1.0])","[495000.0,3.5,360...",[1.91237629199108...,"(1009,[0,1,2,3,4,..."
1IE8VN30JCEQV1H4R804,1,1,865000.0,2.877,360,1,541.0,45-54,1,1,>74,9,99999,MA,25007,2255000.0,1,3,260.0,1.0,7.0,17.0,"(1000,[372],[1.0])","[865000.0,2.87700...",[3.34182927792381...,"(1009,[0,1,2,3,4,..."
1IE8VN30JCEQV1H4R804,1,32,25000.0,3.2839885,360,0,30.0,>74,3,3,9999,10,10900,PA,42095,355000.0,1,1,260.0,6.0,0.0,5.0,"(1000,[518],[1.0])","[25000.0,3.283988...",[0.09658466121167...,"(1009,[0,1,2,3,4,..."
1VUZZZCW0TWP6R7N3Z33,1,31,365000.0,3.2839885,360,0,130.0,65-74,1,2,65-74,3,14860,CT,9001,435000.0,1,1,568.0,4.0,6.0,33.0,"(1000,[699],[1.0])","[365000.0,3.28398...",[1.41013605369039...,"(1009,[0,1,2,3,4,..."
20TVKH7M13MUBGE80C53,1,4,75000.0,3.25,300,1,400.0,35-44,3,3,9999,10,47664,MI,26099,1025000.0,1,1,208.0,0.0,0.0,13.0,"(1000,[864],[1.0])","[75000.0,3.25,300...",[0.28975398363501...,"(1009,[0,1,2,3,4,..."
20TVKH7M13MUBGE80C53,1,31,505000.0,3.125,360,1,354.0,45-54,1,9,45-54,1,19804,MI,26163,695000.0,1,1,208.0,1.0,2.0,13.0,"(1000,[304],[1.0])","[505000.0,3.125,3...",[1.95101015647575...,"(1009,[0,1,2,3,4,..."
20TVKH7M13MUBGE80C53,1,32,245000.0,3.2839885,180,0,86.0,55-64,1,9,55-64,3,99999,MI,26133,605000.0,1,1,208.0,3.0,5.0,13.0,"(1000,[673],[1.0])","[245000.0,3.28398...",[0.94652967987437...,"(1009,[0,1,2,3,4,..."
20TVKH7M13MUBGE80C53,1,32,385000.0,2.625,180,1,294.0,45-54,1,3,35-44,9,18140,OH,39041,675000.0,1,1,208.0,1.0,1.0,9.0,"(1000,[480],[1.0])","[385000.0,2.625,1...",[1.48740378265973...,"(1009,[0,1,2,3,4,..."
254900AHLSHUJOTJG402,4,1,175000.0,2.5,360,1,63.0,<25,1,1,9999,10,12260,SC,45003,175000.0,1,1,587.0,7.0,0.0,23.0,"(1000,[336],[1.0])","[175000.0,2.5,360...",[0.67609262848169...,"(1009,[0,1,2,3,4,..."
254900CIEUZUO7CHPG88,1,1,25000.0,4.25,120,1,56.0,45-54,1,2,45-54,9,24580,WI,55083,35000.0,1,3,138.0,1.0,2.0,21.0,"(1000,[765],[1.0])","[25000.0,4.25,120...",[0.09658466121167...,"(1009,[0,1,2,3,4,..."


### 6. Models Prediction

In [34]:
from pyspark.ml.classification import LogisticRegression, LinearSVC, NaiveBayes, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, FMClassifier

In [31]:
# evaluate the model from 'y_true' and 'y_pred' 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [33]:
def evaluate_model(predictions, label_col='action_taken', prediction_col='prediction', raw_prediction_col='rawPrediction'):
    '''It returns classification evaluation metrics like accuracy, precision, f1, recall and roc'''
    
    # Initialize evaluators
    evaluator_accuracy = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol=prediction_col, metricName='accuracy')
    evaluator_precision = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol=prediction_col, metricName='weightedPrecision')
    evaluator_recall = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol=prediction_col, metricName='weightedRecall')
    evaluator_f1 = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol=prediction_col, metricName='f1')
    evaluator_roc = BinaryClassificationEvaluator(labelCol=label_col, rawPredictionCol=raw_prediction_col, metricName='areaUnderROC')

    # Calculate metrics
    accuracy = evaluator_accuracy.evaluate(predictions)
    precision = evaluator_precision.evaluate(predictions)
    recall = evaluator_recall.evaluate(predictions)
    f1_score = evaluator_f1.evaluate(predictions)
    roc_auc = evaluator_roc.evaluate(predictions)

    # Return all metrics as a dictionary
    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'roc_auc': roc_auc
    }

    return metrics

In [35]:
models = {
    'Logistic Regression': LogisticRegression(featuresCol='features', labelCol='action_taken'),
    'Support Vector Machine': LinearSVC(featuresCol='features', labelCol='action_taken'),
    # 'Naive Bayes': NaiveBayes(featuresCol='features', labelCol='action_taken'),
    'Factorization Machine': FMClassifier(featuresCol='features', labelCol='action_taken'),
    'Decision Tree': DecisionTreeClassifier(featuresCol='features', labelCol='action_taken', maxBins=2000),
    'Random Forest': RandomForestClassifier(featuresCol='features', labelCol='action_taken', maxBins=2000),
    'Gradient Boosting Trees': GBTClassifier(featuresCol='features', labelCol='action_taken', maxBins=2000),
}

In [36]:
for algo in models:
    print(f"========== {algo} ============")

    # Train the model
    model = models[algo]
    trained_model = model.fit(train_df)

    # Evaluate on Test data
    test_df_transformed = transformPipeModel.transform(test_df)
    test_predictions = trained_model.transform(test_df_transformed)

    results = evaluate_model(test_predictions)
    print("accuracy: {:.4f}".format(results['accuracy']))
    print("precison: {:.4f}".format(results['precision']))
    print("recall: {:.4f}".format(results['recall']))
    print("f1-score: {:.4f}".format(results['f1_score']))
    print("ROC: {:.4f}".format(results['roc_auc']))

    print('\n')

accuracy: 0.7024
precison: 0.6698
recall: 0.7024
f1-score: 0.6767
ROC: 0.6367


accuracy: 0.6980
precison: 0.6521
recall: 0.6980
f1-score: 0.6578
ROC: 0.6416


accuracy: 0.6293
precison: 0.6529
recall: 0.6293
f1-score: 0.6389
ROC: 0.6036


accuracy: 0.9059
precison: 0.9136
recall: 0.9059
f1-score: 0.9002
ROC: 0.8855


accuracy: 0.7241
precison: 0.7312
recall: 0.7241
f1-score: 0.6238
ROC: 0.7547


accuracy: 0.9138
precison: 0.9128
recall: 0.9138
f1-score: 0.9126
ROC: 0.9438


