## Import library

In [1]:
from pyspark import SparkContext
import math
import random

## Create spark context

In [2]:
sc = SparkContext.getOrCreate()

your 131072x1 screen size is bogus. expect trouble
25/04/11 14:18:31 WARN Utils: Your hostname, Cp resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/11 14:18:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/11 14:18:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/11 14:18:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Read data

In [3]:
raw_data = sc.textFile("file:///home/cap/hadoop/hadoop-3.4.1/creditcard.csv")

## Preprocess data

As we have analysed in Structured API section, this data has a very high bias so we reduce the label 0's data down to 500 so the dataset can be more balance.

In [7]:
# remove header and read data
header = raw_data.first()
data = raw_data.filter(lambda line: line != header)
parsedData = data.map(\
                lambda line: [float(x.strip('"')) for x in line.split(",")])

# check missing value
def has_missing(row):
    for item in row:
        if item is None:
            return True
        if isinstance(item, str) and (item == "" or item.lower() in ['na', 'null', 'nan']):
            return True
    return False
missing_rows = parsedData.filter(has_missing).count()
print(f"Rows with missing values: {missing_rows}")

# remove duplicate values
rdd_data = parsedData.map(lambda cols: (tuple(cols[:-1]), cols[-1])).distinct()
rdd_data = rdd_data.map(lambda x: list((list(*x[:-1],), x[-1])))


# prepare data in proper format
def prepare_features(fields):
    features = [1.0] + fields[:-1][0]
    label = fields[-1]              
    return (features, label)
rdd_data = rdd_data.map(prepare_features).cache()


# fill 1 and 0 labels
data_0 = rdd_data.filter(lambda x: x[1] == 0.0)
data_1 = rdd_data.filter(lambda x: x[1] == 1.0)

# set seed
seed = 42
random.seed(seed)

# get random sample
data_0_count = data_0.count()
sample_ratio = 600 / data_0_count
data_0_sampled = data_0.sample(withReplacement=False, fraction=sample_ratio, seed=seed).take(500)
data_0_rdd = sc.parallelize(data_0_sampled)

rdd_data = data_0_rdd.union(data_1)



# split to train and test set
train, test = rdd_data.randomSplit(weights=[0.8, 0.2], seed=seed)

                                                                                

Rows with missing values: 0


                                                                                

## Z-score

In [5]:
def normalize_zscore(rdd):
    # get features from RDD
    features_rdd = rdd.map(lambda x: x[0])

    # mean and std
    num_features = len(features_rdd.first())
    means = [features_rdd.map(lambda x: x[i]).mean() for i in range(num_features)]
    stds = [features_rdd.map(lambda x: x[i]).stdev() for i in range(num_features)]

    def zscore(features, means, stds):
        return [(f - means[i]) / stds[i] if stds[i] != 0 else 0 for i, f in enumerate(features)]


    # normalize
    normalized_rdd = rdd.map(lambda x: (zscore(x[0], means, stds), x[1]))
    return normalized_rdd

train = normalize_zscore(train)
test = normalize_zscore(test)

                                                                                

## Functions for training model

In [None]:
def dot(weights, features):
    return sum(w * f for w, f in zip(weights, features))

def sigmoid(z):
    try:
        return 1.0 / (1.0 + math.exp(-z))
    except OverflowError:
        return 0.0 if z < 0 else 1.0

def compute_gradient(weights, features, label):
    z = dot(weights, features)
    prediction = sigmoid(z)
    error = prediction - label
    return [error * f for f in features]


def logistic_regression(data_rdd, learning_rate=0.01, iterations=20, batch_size=32):
    num_features = len(data_rdd.first()[0])
    # generate random weights
    weights = [random.uniform(-0.01, 0.01) for _ in range(num_features)]

    batched_rdd = data_rdd.zipWithIndex().groupBy(lambda x: x[1] // batch_size).mapValues(list)
    batches = batched_rdd.map(lambda x: [item[0] for item in x[1]]).collect()

    for epoch in range(iterations):
        for batch in batches:
            gradients = sc.parallelize(batch).map(lambda x: compute_gradient(weights, x[0], x[1]))
            avg_gradient = gradients.reduce(lambda a, b: [x + y for x, y in zip(a, b)])
            avg_gradient = [g / len(batch) for g in avg_gradient]
            weights = [w - learning_rate * g  for w, g in zip(weights, avg_gradient)]

        train_pred = data_rdd.map(lambda x: (1 if sigmoid(dot(weights, x[0])) >= 0.5 else 0, x[1]))
        train_acc = train_pred.filter(lambda x: x[0] == x[1]).count() / data_rdd.count()
        print(f"Epoch {epoch+1}, Train Accuracy: {train_acc:.4f}")

    return weights

## Testing model

Here I will experiment with two diffenrent learning rate (0.01 and 0.1) and 2 different epoch (1 and 20)

In [None]:
print('Low-Level Operations')


# lr = 0.1, epoch = 20
final_weights = logistic_regression(train, learning_rate=0.001, iterations=10, batch_size=32)

# evaluate on test set
def evaluate_with_threshold(predictions_rdd, threshold):
    binary_pred = predictions_rdd.map(lambda x: (1 if x[0] >= threshold else 0, x[1]))
    accuracy = binary_pred.filter(lambda x: x[0] == x[1]).count() / test.count()
    tp = binary_pred.filter(lambda x: x[0] == 1 and x[1] == 1).count()
    fp = binary_pred.filter(lambda x: x[0] == 1 and x[1] == 0).count()
    fn = binary_pred.filter(lambda x: x[0] == 0 and x[1] == 1).count()
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    return accuracy, precision, recall

predictions_rdd = test.map(lambda x: (sigmoid(dot(final_weights, x[0])), x[1]))
for t in [0.3, 0.4, 0.5, 0.6]:
    acc, prec, rec = evaluate_with_threshold(predictions_rdd, t)
    print(f"Threshold {t}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}")

Low-Level Operations


                                                                                

Epoch 1, Train Accuracy: 0.8642


                                                                                

Epoch 2, Train Accuracy: 0.8756


                                                                                

Epoch 3, Train Accuracy: 0.8782


                                                                                

Epoch 4, Train Accuracy: 0.8782


                                                                                

Epoch 5, Train Accuracy: 0.8782


                                                                                

Epoch 6, Train Accuracy: 0.8794


                                                                                

Epoch 7, Train Accuracy: 0.8807


                                                                                

Epoch 8, Train Accuracy: 0.8807


                                                                                

Epoch 9, Train Accuracy: 0.8820


                                                                                

Epoch 10, Train Accuracy: 0.8820
Threshold 0.3: Accuracy=0.5297, Precision=0.5297, Recall=1.0000


                                                                                

Threshold 0.4: Accuracy=0.8865, Precision=0.8738, Recall=0.9184


                                                                                

Threshold 0.5: Accuracy=0.8595, Precision=1.0000, Recall=0.7347


                                                                                

Threshold 0.6: Accuracy=0.6865, Precision=1.0000, Recall=0.4082


                                                                                

In [None]:
print('Low-Level Operations')


# lr = 0.1, epoch = 20
final_weights = logistic_regression(train, learning_rate=0.1, iterations=20, batch_size=32)

# evaluate on test set
def evaluate_with_threshold(predictions_rdd, threshold):
    binary_pred = predictions_rdd.map(lambda x: (1 if x[0] >= threshold else 0, x[1]))
    accuracy = binary_pred.filter(lambda x: x[0] == x[1]).count() / test.count()
    tp = binary_pred.filter(lambda x: x[0] == 1 and x[1] == 1).count()
    fp = binary_pred.filter(lambda x: x[0] == 1 and x[1] == 0).count()
    fn = binary_pred.filter(lambda x: x[0] == 0 and x[1] == 1).count()
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    return accuracy, precision, recall

predictions_rdd = test.map(lambda x: (sigmoid(dot(final_weights, x[0])), x[1]))
for t in [0.3, 0.4, 0.5, 0.6]:
    acc, prec, rec = evaluate_with_threshold(predictions_rdd, t)
    print(f"Threshold {t}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}")

Low-Level Operations


                                                                                

Epoch 1, Train Accuracy: 0.8909


                                                                                

Epoch 2, Train Accuracy: 0.8959


                                                                                

Epoch 3, Train Accuracy: 0.8959


                                                                                

Epoch 4, Train Accuracy: 0.8972


                                                                                

Epoch 5, Train Accuracy: 0.8985


                                                                                

Epoch 6, Train Accuracy: 0.9048


                                                                                

Epoch 7, Train Accuracy: 0.9086


                                                                                

Epoch 8, Train Accuracy: 0.9124
Epoch 9, Train Accuracy: 0.9162
Epoch 10, Train Accuracy: 0.9162


                                                                                

Epoch 11, Train Accuracy: 0.9188


                                                                                

Epoch 12, Train Accuracy: 0.9175
Epoch 13, Train Accuracy: 0.9201


                                                                                

Epoch 14, Train Accuracy: 0.9201
Epoch 15, Train Accuracy: 0.9201


                                                                                

Epoch 16, Train Accuracy: 0.9175
Epoch 17, Train Accuracy: 0.9175
Epoch 18, Train Accuracy: 0.9188


                                                                                

Epoch 19, Train Accuracy: 0.9188


                                                                                

Epoch 20, Train Accuracy: 0.9188


                                                                                

Threshold 0.3: Accuracy=0.9189, Precision=0.9663, Recall=0.8776


                                                                                

Threshold 0.4: Accuracy=0.9243, Precision=0.9884, Recall=0.8673


                                                                                

Threshold 0.5: Accuracy=0.9081, Precision=0.9880, Recall=0.8367


                                                                                

Threshold 0.6: Accuracy=0.8757, Precision=1.0000, Recall=0.7653


                                                                                

In [None]:
sc.stop()