In [1]:
!pip install pyspark
!pip install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [3]:
# setup Spark session
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [4]:
#mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
# read in csv from Google Drive
file_path = '/content/drive/MyDrive/Bootcamp/transactions_train.csv'
df = spark.read.csv(file_path, header=True, inferSchema=True)

In [6]:
# display data frame
df.show()

+----+--------+---------+-----------+--------------+--------------+-----------+--------------+--------------+-------+
|step|    type|   amount|   nameOrig|oldbalanceOrig|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|
+----+--------+---------+-----------+--------------+--------------+-----------+--------------+--------------+-------+
|   1| PAYMENT|  9839.64|C1231006815|      170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|
|   1| PAYMENT|  1864.28|C1666544295|       21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|
|   1|TRANSFER|    181.0|C1305486145|         181.0|           0.0| C553264065|           0.0|           0.0|      1|
|   1|CASH_OUT|    181.0| C840083671|         181.0|           0.0|  C38997010|       21182.0|           0.0|      1|
|   1| PAYMENT| 11668.14|C2048537720|       41554.0|      29885.86|M1230701703|           0.0|           0.0|      0|
|   1| PAYMENT|  7817.71|  C90045638|       53860.0|    

In [7]:
# drop columns with string values
columns_to_drop = ["step","type","nameOrig", "nameDest"]
df = df.drop(*columns_to_drop)

In [8]:
df.show()

+---------+--------------+--------------+--------------+--------------+-------+
|   amount|oldbalanceOrig|newbalanceOrig|oldbalanceDest|newbalanceDest|isFraud|
+---------+--------------+--------------+--------------+--------------+-------+
|  9839.64|      170136.0|     160296.36|           0.0|           0.0|      0|
|  1864.28|       21249.0|      19384.72|           0.0|           0.0|      0|
|    181.0|         181.0|           0.0|           0.0|           0.0|      1|
|    181.0|         181.0|           0.0|       21182.0|           0.0|      1|
| 11668.14|       41554.0|      29885.86|           0.0|           0.0|      0|
|  7817.71|       53860.0|      46042.29|           0.0|           0.0|      0|
|  7107.77|      183195.0|     176087.23|           0.0|           0.0|      0|
|  7861.64|     176087.23|     168225.59|           0.0|           0.0|      0|
|  4024.36|        2671.0|           0.0|           0.0|           0.0|      0|
|  5337.77|       41720.0|      36382.23

In [9]:
y = df.select("isFraud").toPandas()
X = df.drop("isFraud").toPandas()

In [10]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    random_state = 1
)

In [12]:
# importing StandardScaler
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
# importing tensor flow
import tensorflow as tf

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=80, activation="relu", input_dim=X_train_scaled.shape[1]))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=30, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                480       
                                                                 
 dense_1 (Dense)             (None, 30)                2430      
                                                                 
 dense_2 (Dense)             (None, 1)                 31        
                                                                 
Total params: 2,941
Trainable params: 2,941
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [15]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

49619/49619 - 70s - loss: 0.0027 - accuracy: 0.9995 - 70s/epoch - 1ms/step
Loss: 0.0027400129474699497, Accuracy: 0.9994659423828125
