### Import Libraries

In [39]:
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
import random

### Create session

In [18]:
spark = SparkSession.builder \
    .appName("mllib_classification") \
    .config("spark.driver.host", "localhost") \
    .getOrCreate()

### Prepare data and preprocessing

In [None]:
file_path = "creditcard.csv"

In [23]:
data = spark.read.csv(file_path, header=True, inferSchema=True)

In [24]:
data_drop_duplicates = data.dropDuplicates()

In [28]:
def parse_line(row):
    """Convert a row to a LabeledPoint"""
    features = [row[i] for i in range(len(row)-1)]  # All columns except Class
    label = row[-1]  # Class is the label
    return LabeledPoint(label, features)

In [30]:
rdd_data = data_drop_duplicates.rdd.map(parse_line)



In [None]:
class_0_rdd = rdd_data.filter(lambda lp: lp.label == 0.0)  

sampled_class_0 = spark.sparkContext.parallelize(
    class_0_rdd.takeSample(withReplacement=False, num=500, seed=42)
)

class_1_rdd = rdd_data.filter(lambda lp: lp.label == 1.0)

balanced_rdd = sampled_class_0.union(class_1_rdd)

shuffled_rdd = balanced_rdd.map(lambda lp: (random.random(), lp)) \
                          .sortByKey() \
                          .map(lambda x: x[1])

                                                                                

### Split data to train and test

In [34]:
train_data, test_data = shuffled_rdd.randomSplit([0.8, 0.2], seed=42)

### Training model 

In [35]:
model = LogisticRegressionWithLBFGS.train(
    train_data,
    iterations=150,  # Number of iterations
    regParam=0.001,   # Regularization parameter
    regType='l2',    # Type of regularization (L2)
    intercept=True   # Whether to add intercept
)

25/04/11 00:21:08 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


### Testing model

In [36]:
predictions = test_data.map(lambda lp: (float(model.predict(lp.features)), lp.label))

### Evaluation

In [42]:
multiclass_metrics = MulticlassMetrics(predictions)

print("\nAccuracy:", multiclass_metrics.accuracy)
print("Precision (Fraud):", multiclass_metrics.precision(1.0))
print("Recall (Fraud):", multiclass_metrics.recall(1.0))
binary_metrics = BinaryClassificationMetrics(predictions)
print("\nAUC:", binary_metrics.areaUnderROC)


Accuracy: 0.946524064171123
Precision (Fraud): 0.9390243902439024
Recall (Fraud): 0.9390243902439024

AUC: 0.9457026713124274
