In [1]:
#import SparkSession
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('binary_class').getOrCreate()

In [11]:
#read the dataset
df=spark.read.csv('classification_data.csv',inferSchema=True,header=True)

In [3]:
from pyspark.sql.functions import *


In [12]:
#check the shape of the data 
print((df.count(),len(df.columns)))

(46751, 12)


In [13]:
#printSchema
df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- is_first_loan: integer (nullable = true)
 |-- total_credit_card_limit: integer (nullable = true)
 |-- avg_percentage_credit_card_limit_used_last_year: double (nullable = true)
 |-- saving_amount: integer (nullable = true)
 |-- checking_amount: integer (nullable = true)
 |-- is_employed: integer (nullable = true)
 |-- yearly_salary: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- dependent_number: integer (nullable = true)
 |-- loan_defaulter: integer (nullable = true)



In [14]:
#number of columns in dataset
df.columns

['loan_id',
 'loan_purpose',
 'is_first_loan',
 'total_credit_card_limit',
 'avg_percentage_credit_card_limit_used_last_year',
 'saving_amount',
 'checking_amount',
 'is_employed',
 'yearly_salary',
 'age',
 'dependent_number',
 'loan_defaulter']

In [15]:
#view the dataset
df.show(5)

+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+--------------+
|loan_id|loan_purpose|is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year|saving_amount|checking_amount|is_employed|yearly_salary|age|dependent_number|loan_defaulter|
+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+--------------+
|    A_1|    personal|            1|                   7900|                                            0.8|         1103|           6393|          1|        16400| 42|               4|             0|
|    A_2|    personal|            0|                   3300|                                           0.29|         2588|            832|          1|        75500| 56|               1|           

In [16]:
#Exploratory Data Analysis
df.describe().show()


+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|summary|loan_id|loan_purpose|     is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year|     saving_amount|   checking_amount|       is_employed|     yearly_salary|               age|  dependent_number|     loan_defaulter|
+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|  count|  46751|       46751|             46751|                  46751|                                          46751|             46751|             46751|             46751|             46751|             46751|             467

In [17]:
df.groupBy('loan_defaulter').count().show()

+--------------+-----+
|loan_defaulter|count|
+--------------+-----+
|             1|16201|
|             0|30550|
+--------------+-----+



In [18]:
df.groupBy('loan_purpose').count().show()

+------------+-----+
|loan_purpose|count|
+------------+-----+
|      others| 6763|
|   emergency| 7562|
|    property|11388|
|  operations|10580|
|    personal|10458|
+------------+-----+



In [120]:
#converting categorical data to numerical form

In [21]:
#import required libraries
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler



In [22]:
loan_purpose_indexer = StringIndexer(inputCol="loan_purpose", outputCol="loan_purpose").fit(df)
df = loan_purpose_indexer.transform(df)
loan_encoder = OneHotEncoder(inputCol="loan_index", outputCol="loan_purpose_vec")
df = loan_encoder.transform(df)

In [63]:
df.select(['loan_purpose','loan_purpose','loan_purpose_vec']).show(3,False)

+------------+------------+----------------+
|loan_purpose|loan_purpose|loan_purpose_vec|
+------------+------------+----------------+
|personal    |personal    |(4,[2],[1.0])   |
|personal    |personal    |(4,[2],[1.0])   |
|personal    |personal    |(4,[2],[1.0])   |
+------------+------------+----------------+
only showing top 3 rows



In [24]:
from pyspark.ml.feature import VectorAssembler

In [25]:
df.columns

['loan_id',
 'loan_purpose',
 'is_first_loan',
 'total_credit_card_limit',
 'avg_percentage_credit_card_limit_used_last_year',
 'saving_amount',
 'checking_amount',
 'is_employed',
 'yearly_salary',
 'age',
 'dependent_number',
 'loan_defaulter',
 'loan_index',
 'loan_purpose_vec']

In [28]:
df_assembler = VectorAssembler(inputCols=['is_first_loan',
 'total_credit_card_limit',
 'avg_percentage_credit_card_limit_used_last_year',
 'saving_amount',
 'checking_amount',
 'is_employed',
 'yearly_salary',
 'age',
 'dependent_number',
 'loan_purpose_vec'], outputCol="features")
df = df_assembler.transform(df)

In [29]:
df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- is_first_loan: integer (nullable = true)
 |-- total_credit_card_limit: integer (nullable = true)
 |-- avg_percentage_credit_card_limit_used_last_year: double (nullable = true)
 |-- saving_amount: integer (nullable = true)
 |-- checking_amount: integer (nullable = true)
 |-- is_employed: integer (nullable = true)
 |-- yearly_salary: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- dependent_number: integer (nullable = true)
 |-- loan_defaulter: integer (nullable = true)
 |-- loan_index: double (nullable = false)
 |-- loan_purpose_vec: vector (nullable = true)
 |-- features: vector (nullable = true)



In [30]:
df.select(['features','loan_defaulter']).show(10,False)

+--------------------------------------------------------------------+--------------+
|features                                                            |loan_defaulter|
+--------------------------------------------------------------------+--------------+
|[1.0,7900.0,0.8,1103.0,6393.0,1.0,16400.0,42.0,4.0,0.0,0.0,1.0,0.0] |0             |
|[0.0,3300.0,0.29,2588.0,832.0,1.0,75500.0,56.0,1.0,0.0,0.0,1.0,0.0] |0             |
|[0.0,7600.0,0.9,1651.0,8868.0,1.0,59000.0,46.0,1.0,0.0,0.0,1.0,0.0] |0             |
|[1.0,3400.0,0.38,1269.0,6863.0,1.0,26000.0,55.0,8.0,0.0,0.0,1.0,0.0]|0             |
|[0.0,2600.0,0.89,1310.0,3423.0,1.0,9700.0,41.0,4.0,0.0,0.0,0.0,1.0] |1             |
|[0.0,7600.0,0.51,1040.0,2406.0,1.0,22900.0,52.0,0.0,0.0,1.0,0.0,0.0]|0             |
|[1.0,6900.0,0.82,2408.0,5556.0,1.0,34800.0,48.0,4.0,0.0,1.0,0.0,0.0]|0             |
|[0.0,5700.0,0.56,1933.0,4139.0,1.0,32500.0,64.0,2.0,0.0,0.0,1.0,0.0]|0             |
|[1.0,3400.0,0.95,3866.0,4131.0,1.0,13300.0,23.0,3.0,0

In [31]:
#select data for building model
model_df=df.select(['features','loan_defaulter'])

In [32]:
from pyspark.ml.classification import LogisticRegression

In [33]:
#split the data 
training_df,test_df=model_df.randomSplit([0.75,0.25])

In [34]:
training_df.count()

34958

In [35]:
training_df.groupBy('loan_defaulter').count().show()

+--------------+-----+
|loan_defaulter|count|
+--------------+-----+
|             1|12048|
|             0|22910|
+--------------+-----+



In [36]:
test_df.count()

11793

In [37]:
test_df.groupBy('loan_defaulter').count().show()

+--------------+-----+
|loan_defaulter|count|
+--------------+-----+
|             1| 4153|
|             0| 7640|
+--------------+-----+



In [38]:
log_reg=LogisticRegression(labelCol='loan_defaulter').fit(training_df)

In [None]:
#Training Results

In [39]:
lr_summary=log_reg.summary

In [40]:
lr_summary.accuracy

0.8939298586875679

In [41]:
lr_summary.areaUnderROC

0.9587456481363935

In [42]:
print(lr_summary.precisionByLabel)

[0.9233245149911816, 0.8396318618667535]


In [43]:
print(lr_summary.recallByLabel)

[0.914054997817547, 0.8556606905710491]


In [45]:
predictions = log_reg.transform(test_df)
predictions.show(10)


+--------------------+--------------+--------------------+--------------------+----------+
|            features|loan_defaulter|       rawPrediction|         probability|prediction|
+--------------------+--------------+--------------------+--------------------+----------+
|(13,[0,1,2,3,4,7]...|             1|[-3.4630360774167...|[0.03038246469741...|       1.0|
|(13,[0,1,2,3,4,7]...|             1|[-5.5391195110590...|[0.00391460129742...|       1.0|
|(13,[0,1,2,3,4,7]...|             0|[1.00238593296486...|[0.73152742283114...|       0.0|
|(13,[0,1,2,3,4,7]...|             1|[-1.8290704519648...|[0.13834904603406...|       1.0|
|(13,[0,1,2,3,4,7]...|             1|[-1.5501728962289...|[0.17506129798003...|       1.0|
|(13,[0,1,2,3,4,7]...|             0|[6.60737916543425...|[0.99865145442765...|       0.0|
|(13,[0,1,2,3,4,7]...|             0|[7.50587822302399...|[0.99945045940723...|       0.0|
|(13,[0,1,2,3,4,7,...|             1|[-4.4555325192703...|[0.01148079400371...|       1.0|

In [47]:
model_predictions = log_reg.transform(test_df)
model_predictions.columns

['features', 'loan_defaulter', 'rawPrediction', 'probability', 'prediction']

In [48]:
model_predictions = log_reg.evaluate(test_df)


In [49]:
model_predictions.accuracy

0.8945984906300347

In [50]:
model_predictions.weightedPrecision

0.8951909857782705

In [52]:
model_predictions.recallByLabel

[0.9129581151832461, 0.8608235010835541]

In [53]:
print(model_predictions.precisionByLabel)

[0.9234741162452006, 0.8431603773584906]


In [54]:
model_predictions.areaUnderROC

0.9594316478468224

In [56]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(numTrees=50,maxDepth=30,labelCol='loan_defaulter')
rf_model = rf.fit(training_df)


In [57]:
model_predictions = rf_model.transform(test_df)


In [59]:
true_pos=model_predictions.filter(model_predictions['loan_defaulter']==1).filter(model_predictions['prediction']==1).count()
actual_pos=model_predictions.filter(model_predictions['loan_defaulter']==1).count()
pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()

In [60]:
#Recall 
float(true_pos)/(actual_pos)

0.8979051288225379

In [61]:
#Precision on test Data 
float(true_pos)/(pred_pos)

0.8660009289363678