In [1]:
# Import SparkSession and get (or create) the session for the 'binary_class' app.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('binary_class')
    .getOrCreate()
)

In [2]:
# Load the CSV: the first row is the header and column types are inferred.
df = spark.read.csv(
    'classification_data.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Shape of the data, printed as a (row count, column count) tuple.
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))

In [None]:
# Print the DataFrame schema (column names and their types).
df.printSchema()

In [None]:
# List the column names of the dataset (names, not a count).
df.columns

In [None]:
# Preview the first 5 rows of the dataset.
df.show(5)

In [None]:
# Exploratory Data Analysis: display summary statistics for the dataset.
df.describe().show()


In [None]:
# Number of rows per value of 'label' (class balance of the target).
df.groupBy('label').count().show()

In [None]:
# Number of rows per value of the categorical 'loan_purpose' column.
df.groupBy('loan_purpose').count().show()

In [None]:
#converting categorical data to numerical form

In [3]:
#import required libraries
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler



In [4]:
# Map each 'loan_purpose' string to a numeric index, then one-hot encode that
# index into the 'loan_purpose_vec' vector column.
loan_purpose_indexer = StringIndexer(inputCol="loan_purpose", outputCol="loan_index").fit(df)
df = loan_purpose_indexer.transform(df)
loan_encoder = OneHotEncoder(inputCol="loan_index", outputCol="loan_purpose_vec")
# Spark 3+ made OneHotEncoder an Estimator: it has to be fitted first and only
# the fitted model offers transform(). Spark 2.x exposed it as a plain
# Transformer without fit(), so fit only when the method exists.
if hasattr(loan_encoder, "fit"):
    loan_encoder_model = loan_encoder.fit(df)
else:
    loan_encoder_model = loan_encoder
df = loan_encoder_model.transform(df)

In [None]:
# Compare the raw category with its index and one-hot vector (3 rows, no truncation).
df.select(['loan_purpose','loan_index','loan_purpose_vec']).show(3,False)

In [5]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the column names again; 'loan_index' and 'loan_purpose_vec' were added above.
df.columns

In [6]:
# Columns that are merged into the single 'features' vector column.
feature_columns = [
    'is_first_loan',
    'total_credit_card_limit',
    'avg_percentage_credit_card_limit_used_last_year',
    'saving_amount',
    'checking_amount',
    'is_employed',
    'yearly_salary',
    'age',
    'dependent_number',
    'loan_purpose_vec',
]
df_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = df_assembler.transform(df)

In [None]:
# Print the schema again to confirm the assembled 'features' column is present.
df.printSchema()

In [None]:
# Preview the assembled feature vectors next to their labels (10 rows, no truncation).
df.select(['features','label']).show(10,False)

In [7]:
# Keep only the two columns the models need: the feature vector and the target.
model_df = df.select('features', 'label')

In [None]:
from pyspark.ml.classification import LogisticRegression

In [8]:
# Split the data into training (75%) and test (25%) sets.
# The fixed seed makes the split repeatable for the same input data, so the
# row counts and every metric computed from these two frames stay comparable
# across kernel restarts.
training_df, test_df = model_df.randomSplit([0.75, 0.25], seed=42)

In [None]:
# Number of rows in the training split.
training_df.count()

In [None]:
# Rows per 'label' value in the training split.
training_df.groupBy('label').count().show()

In [None]:
# Number of rows in the test split.
test_df.count()

In [None]:
# Rows per 'label' value in the test split.
test_df.groupBy('label').count().show()

In [None]:
# Fit a logistic regression with default hyperparameters on the training split.
lr_estimator = LogisticRegression()
log_reg = lr_estimator.fit(training_df)

In [None]:
#Training Results

In [None]:
# Summary object of the fitted logistic regression; the cells below read its metrics.
lr_summary=log_reg.summary

In [None]:
# Accuracy reported by the model summary.
lr_summary.accuracy

In [None]:
# Area under the ROC curve reported by the model summary.
lr_summary.areaUnderROC

In [None]:
# Per-label precision reported by the model summary.
print(lr_summary.precisionByLabel)

In [None]:
# Per-label recall reported by the model summary.
print(lr_summary.recallByLabel)

In [None]:
# Score the test split with the fitted logistic regression and preview 10 rows.
predictions = log_reg.transform(test_df)
predictions.show(10)


In [None]:
# NOTE(review): this is the same call that produced `predictions`, and
# `model_predictions` is rebound by the next cell before it is ever read.
model_predictions = log_reg.transform(test_df)


In [None]:
# Evaluate the logistic regression on the test split. Despite its name, this
# variable holds an evaluation summary (metrics), not a predictions DataFrame.
model_predictions = log_reg.evaluate(test_df)


In [None]:
# Accuracy on the test split.
model_predictions.accuracy

In [None]:
# Weighted precision on the test split.
model_predictions.weightedPrecision

In [None]:
# Per-label recall on the test split.
model_predictions.recallByLabel

In [None]:
# Per-label precision on the test split.
print(model_predictions.precisionByLabel)

In [None]:
# Area under the ROC curve on the test split.
model_predictions.areaUnderROC

In [9]:
from pyspark.ml.classification import RandomForestClassifier
# Baseline random forest with default hyperparameters.
# Random forests are stochastic learners, so the seed is fixed to make the
# fitted model, and the metrics derived from it, repeatable across runs.
rf = RandomForestClassifier(seed=42)
rf_model = rf.fit(training_df)


In [10]:
# Score the test split with the baseline random forest (rebinds `model_predictions`).
model_predictions = rf_model.transform(test_df)


In [11]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Scores every candidate model during cross-validation (default evaluator settings).
evaluator = BinaryClassificationEvaluator()

# Seed the forest so each candidate is trained deterministically.
rf = RandomForestClassifier(seed=42)
# 5 depths x 3 bin counts x 3 forest sizes = 45 candidate settings; with
# 5 folds that is 225 model fits, so expect this cell to be slow.
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [5,10,20,25,30])
             .addGrid(rf.maxBins, [20,30,40 ])
             .addGrid(rf.numTrees, [5, 20,50])
             .build())
# Seed the cross-validator as well so the fold assignment, and therefore the
# selected best model, is the same on every run.
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5, seed=42)
cv_model = cv.fit(training_df)

In [12]:
# Best model found by the cross-validated grid search.
best_rf_model = cv_model.bestModel

In [13]:
# Generate predictions for the held-out test split (not the entire dataset).
model_predictions = best_rf_model.transform(test_df)

In [14]:
# Row counts behind precision and recall for the positive class (label 1).
is_actual_pos = model_predictions['label'] == 1
is_pred_pos = model_predictions['prediction'] == 1

true_pos = model_predictions.filter(is_actual_pos).filter(is_pred_pos).count()
actual_pos = model_predictions.filter(is_actual_pos).count()
pred_pos = model_predictions.filter(is_pred_pos).count()

In [15]:
# Recall on test data: true positives / all rows whose label is 1.
float(true_pos)/(actual_pos)

0.912426614481409

In [16]:
# Precision on test data: true positives / all rows predicted as 1.
float(true_pos)/(pred_pos)

0.8562901744719926