In [1]:
pip install pyspark 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/




# loading required libraries


In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col, when
from statistics import mode as _mode
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.util import MLUtils
import seaborn as sns

In [3]:
spark_context = SparkContext.getOrCreate()
if (spark_context is None):
    spark_context = SparkContext(master = "local[4]", appName = "Assignment 2")
spark = SparkSession(sparkContext = spark_context)

### Loading the Dataset

In [4]:
Creditcard = spark.read.csv('creditcard.csv', inferSchema = True, header = True)

In [5]:
Creditcard.count()

284807

In [6]:
Creditcard=Creditcard.drop('Time')

In [7]:
Creditcard.show(5)

+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+

|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|                V22|              

In [8]:
Creditcard.columns

['V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount',
 'Class']

### Finding Missing null values in each column

In [9]:
def missing():
    
    for columns in Creditcard.columns:
        print(columns + ' has number of NULLs : ' + str(Creditcard[Creditcard[columns] == 'NA'].count()))

In [10]:
missing()

V1 has number of NULLs : 0

V2 has number of NULLs : 0

V3 has number of NULLs : 0

V4 has number of NULLs : 0

V5 has number of NULLs : 0

V6 has number of NULLs : 0

V7 has number of NULLs : 0

V8 has number of NULLs : 0

V9 has number of NULLs : 0

V10 has number of NULLs : 0

V11 has number of NULLs : 0

V12 has number of NULLs : 0

V13 has number of NULLs : 0

V14 has number of NULLs : 0

V15 has number of NULLs : 0

V16 has number of NULLs : 0

V17 has number of NULLs : 0

V18 has number of NULLs : 0

V19 has number of NULLs : 0

V20 has number of NULLs : 0

V21 has number of NULLs : 0

V22 has number of NULLs : 0

V23 has number of NULLs : 0

V24 has number of NULLs : 0

V25 has number of NULLs : 0

V26 has number of NULLs : 0

V27 has number of NULLs : 0

V28 has number of NULLs : 0

Amount has number of NULLs : 0

Class has number of NULLs : 0


In [11]:
Creditcard=Creditcard.withColumnRenamed('Class','LABEL')

In [12]:
Creditcard.columns

['V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount',
 'LABEL']

### Data Transformation

In [13]:
Features_List = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7','V8', 'V9','V10','V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17','V18', 'V19', 'Amount','LABEL']

In [14]:
for columns in Features_List:
     Creditcard= Creditcard.withColumn(columns, Creditcard[columns].cast(DoubleType()))

In [15]:
Creditcard.printSchema()

root

 |-- V1: double (nullable = true)

 |-- V2: double (nullable = true)

 |-- V3: double (nullable = true)

 |-- V4: double (nullable = true)

 |-- V5: double (nullable = true)

 |-- V6: double (nullable = true)

 |-- V7: double (nullable = true)

 |-- V8: double (nullable = true)

 |-- V9: double (nullable = true)

 |-- V10: double (nullable = true)

 |-- V11: double (nullable = true)

 |-- V12: double (nullable = true)

 |-- V13: double (nullable = true)

 |-- V14: double (nullable = true)

 |-- V15: double (nullable = true)

 |-- V16: double (nullable = true)

 |-- V17: double (nullable = true)

 |-- V18: double (nullable = true)

 |-- V19: double (nullable = true)

 |-- V20: double (nullable = true)

 |-- V21: double (nullable = true)

 |-- V22: double (nullable = true)

 |-- V23: double (nullable = true)

 |-- V24: double (nullable = true)

 |-- V25: double (nullable = true)

 |-- V26: double (nullable = true)

 |-- V27: double (nullable = true)

 |-- V28: double (nullable = tr

## Create the Feature Vector and Divide the Dataset

In [16]:
assembler = VectorAssembler(inputCols = Features_List,outputCol = 'feature_vector')
dataframe = assembler.transform(Creditcard) 

In [17]:
dataframe.show(5)

+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+--------------------+

|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|             

In [18]:
df_test, df_train = dataframe.randomSplit([0.3, 0.7], seed = 123)

In [19]:
df_test.count()

85583

In [20]:
df_train.count()

199224

# Apply Machine Learning Classification Algorithms on the Dataset and Compare their Accuracy.

### Decision Trees

In [21]:
Decision_Tree_Classifier = DecisionTreeClassifier(labelCol = 'LABEL', featuresCol = 'feature_vector')
Decision_Tree_Model = Decision_Tree_Classifier.fit(df_train)

In [22]:
Decision_Tree_Prediction = Decision_Tree_Model.transform(df_test)

In [24]:
Decision_Tree_Prediction.show(5)

+-----------------+-----------------+-----------------+----------------+-----------------+-----------------+-----------------+------------------+-----------------+------------------+-----------------+-----------------+-------------------+-----------------+------------------+-------------------+-----------------+-----------------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+-------------------+-----------------+-----------------+-------+-----+--------------------+--------------+-----------+----------+

|               V1|               V2|               V3|              V4|               V5|               V6|               V7|                V8|               V9|               V10|              V11|              V12|                V13|              V14|               V15|                V16|              V17|              V18|               V19|              V20|               V21|              V22|   

In [25]:
Decision_Tree_Evaluator = MulticlassClassificationEvaluator(labelCol =  'LABEL', predictionCol = 'prediction')

In [26]:
Decision_Tree_Accuracy = Decision_Tree_Evaluator.evaluate(Decision_Tree_Prediction)

In [27]:
print('Decision Tree Accuracy is : ' + str(Decision_Tree_Accuracy * 100))
print('Test Error is : ' + str(1 - Decision_Tree_Accuracy))

Decision Tree Accuracy is : 100.0

Test Error is : 0.0


### Random Forest

In [30]:
Random_Forest_Classifier = RandomForestClassifier(labelCol = 'LABEL', featuresCol = 'feature_vector',  maxDepth = 5,
    maxBins = 32, numTrees = 500)

Random_Forest_Model = Random_Forest_Classifier.fit(df_train)

In [31]:
Random_Forest_Prediction = Random_Forest_Model.transform(df_test)

In [32]:
Random_Forest_Prediction.show(5)

+-----------------+-----------------+-----------------+----------------+-----------------+-----------------+-----------------+------------------+-----------------+------------------+-----------------+-----------------+-------------------+-----------------+------------------+-------------------+-----------------+-----------------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+-------------------+-----------------+-----------------+-------+-----+--------------------+--------------------+--------------------+----------+

|               V1|               V2|               V3|              V4|               V5|               V6|               V7|                V8|               V9|               V10|              V11|              V12|                V13|              V14|               V15|                V16|              V17|              V18|               V19|              V20|               V21|      

In [33]:
Random_Forest_Evaluator = MulticlassClassificationEvaluator(labelCol = 'LABEL', predictionCol = 'prediction')

In [34]:
Random_Forest_Accuracy = Random_Forest_Evaluator.evaluate(Random_Forest_Prediction)

In [35]:

print('Random Forest Accuracy is : ' + str(Random_Forest_Accuracy * 100))
print('Test Error is : ' + str(1 - Random_Forest_Accuracy))

Random Forest Accuracy is : 100.0

Test Error is : 0.0


### Logistic Regression

In [36]:
Logistic_Regression_Classifier = LogisticRegression(regParam = 0.3, labelCol = "LABEL", featuresCol = 'feature_vector', \
                                       maxIter = 20,  elasticNetParam = 0.8)

In [37]:
Logistic_Regression_Model = Logistic_Regression_Classifier.fit(df_train)

In [38]:
Logistic_Regression_Prediction = Logistic_Regression_Model.transform(df_test)

Logistic_Regression_Prediction.show(5)

+-----------------+-----------------+-----------------+----------------+-----------------+-----------------+-----------------+------------------+-----------------+------------------+-----------------+-----------------+-------------------+-----------------+------------------+-------------------+-----------------+-----------------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+-------------------+-----------------+-----------------+-------+-----+--------------------+--------------------+--------------------+----------+

|               V1|               V2|               V3|              V4|               V5|               V6|               V7|                V8|               V9|               V10|              V11|              V12|                V13|              V14|               V15|                V16|              V17|              V18|               V19|              V20|               V21|      

In [39]:
Logistic_Regression_Evaluator = MulticlassClassificationEvaluator(labelCol = "LABEL", predictionCol = "prediction", \
                                                     metricName = "accuracy")

In [40]:
Logistic_Regression_Accuracy = Logistic_Regression_Evaluator.evaluate(Logistic_Regression_Prediction)

In [41]:
print('Logistic Regression Accuracy is : ' + str(Logistic_Regression_Accuracy * 100))
print('Test Error is : ' + str(1 - Logistic_Regression_Accuracy))

Logistic Regression Accuracy is : 99.84810067419932

Test Error is : 0.0015189932580068621


# Calculate the confusion matrix and find the precision, recall, and F1 score of each classification algorithm.

### Decision Trees Model Performance

In [42]:
Decision_Tree_Prediction_Labels = Decision_Tree_Prediction.select(['prediction', 'LABEL'])
Decision_Tree_KPI = MulticlassMetrics(Decision_Tree_Prediction_Labels.rdd.map(list))




In [43]:
Decision_Tree_confusion_matrix = Decision_Tree_KPI.confusionMatrix().toArray()
print('DT Confusion Matrix')
print(Decision_Tree_confusion_matrix)

DT Confusion Matrix

[[85453.     0.]

 [    0.   130.]]


In [44]:
Decision_Tree_precision = (Decision_Tree_confusion_matrix[0][0]) / (Decision_Tree_confusion_matrix[0][0] + Decision_Tree_confusion_matrix[1][0])
print('Decision_Tree Precision = ' + str(Decision_Tree_precision))

Decision_Tree Precision = 1.0


In [45]:
Decision_Tree_recall = (Decision_Tree_confusion_matrix[0][0]) / (Decision_Tree_confusion_matrix[0][0] + Decision_Tree_confusion_matrix[0][1])
print('Decision_Tree Recall = ' + str(Decision_Tree_recall))

Decision_Tree Recall = 1.0


In [46]:
Decision_Tree_f1Score = (Decision_Tree_precision * Decision_Tree_recall) / (Decision_Tree_precision + Decision_Tree_recall) * 2
print('Decision_Tree F1 Score = ' + str(Decision_Tree_f1Score))

Decision_Tree F1 Score = 1.0


### Random Forest Model Performance

In [47]:
Random_Forest_Prediction_Labels = Random_Forest_Prediction.select(['prediction', 'LABEL'])

Random_Forest_KPI = MulticlassMetrics(Random_Forest_Prediction_Labels.rdd.map(list))

In [48]:
Random_Forest_confusion_matrix = Random_Forest_KPI.confusionMatrix().toArray()

print('Random_Forest Confusion Matrix')

print(Random_Forest_confusion_matrix)

Random_Forest Confusion Matrix

[[85453.     0.]

 [    0.   130.]]


In [49]:
Random_Forest_precision = (Random_Forest_confusion_matrix[0][0]) / (Random_Forest_confusion_matrix[0][0] + Random_Forest_confusion_matrix[1][0])
print('Random_Forest Precision = ' + str(Random_Forest_precision))

Random_Forest Precision = 1.0


In [50]:
Random_Forest_recall = (Random_Forest_confusion_matrix[0][0]) / (Random_Forest_confusion_matrix[0][0] + Random_Forest_confusion_matrix[0][1])
print('RF Recall = ' + str(Random_Forest_recall))

RF Recall = 1.0


In [51]:
Random_Forest_f1Score = (Random_Forest_precision * Random_Forest_recall) / (Random_Forest_precision + Random_Forest_recall) * 2
print('RF F1 Score = ' + str(Random_Forest_f1Score))

RF F1 Score = 1.0


### Logistic Regression Model Performance

In [52]:
Logistic_Regression_Prediction_Labels = Logistic_Regression_Prediction.select(['prediction', 'LABEL'])

Logistic_Regression_KPI = MulticlassMetrics(Logistic_Regression_Prediction_Labels.rdd.map(list))

In [53]:
Logistic_Regression_confusion_matrix = Logistic_Regression_KPI.confusionMatrix().toArray()

print('LogReg Confusion Matrix')

print(Logistic_Regression_confusion_matrix)

LogReg Confusion Matrix

[[85453.     0.]

 [  130.     0.]]


In [54]:
Logistic_Regression_precision = (Logistic_Regression_confusion_matrix[0][0]) / (Logistic_Regression_confusion_matrix[0][0] + Logistic_Regression_confusion_matrix[1][0])
print('LogReg Precision = ' + str(Logistic_Regression_precision))

LogReg Precision = 0.9984810067419931


In [55]:
Logistic_Regression_recall = (Logistic_Regression_confusion_matrix[0][0]) / (Logistic_Regression_confusion_matrix[0][0] + Logistic_Regression_confusion_matrix[0][1])
print('LogReg Recall = ' + str(Logistic_Regression_recall))

LogReg Recall = 1.0


In [56]:
Logistic_Regression_f1Score = (Logistic_Regression_precision * Logistic_Regression_recall) / (Logistic_Regression_precision + Logistic_Regression_recall) * 2
print('LogReg F1 Score = ' + str(Logistic_Regression_f1Score))

LogReg F1 Score = 0.9992399260974297
