<a href="https://colab.research.google.com/github/Adrian-Stahl/Project-4/blob/main/Credit_Card_Fraud_Detection_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import dependencies

!pip install pyspark
!pip install findspark
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=1161ec481e84d1f4d93e6ec6731104e12c2c1cce8f8667cc2ecbc140719faced
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 k

In [2]:
# setup Spark session
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [3]:
#mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# read in csv from Google Drive
file_path = '/content/drive/MyDrive/Bootcamp/transactions_train.csv'
df = spark.read.csv(file_path, header=True, inferSchema=True)

In [5]:
# display data frame
df.show()

+----+--------+---------+-----------+--------------+--------------+-----------+--------------+--------------+-------+
|step|    type|   amount|   nameOrig|oldbalanceOrig|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|
+----+--------+---------+-----------+--------------+--------------+-----------+--------------+--------------+-------+
|   1| PAYMENT|  9839.64|C1231006815|      170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|
|   1| PAYMENT|  1864.28|C1666544295|       21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|
|   1|TRANSFER|    181.0|C1305486145|         181.0|           0.0| C553264065|           0.0|           0.0|      1|
|   1|CASH_OUT|    181.0| C840083671|         181.0|           0.0|  C38997010|       21182.0|           0.0|      1|
|   1| PAYMENT| 11668.14|C2048537720|       41554.0|      29885.86|M1230701703|           0.0|           0.0|      0|
|   1| PAYMENT|  7817.71|  C90045638|       53860.0|    

In [6]:
# drop columns with string values
columns_to_drop = ["step","type","nameOrig", "nameDest"]
df = df.drop(*columns_to_drop)

In [7]:
#displaying the dataset with the dropped column
df.show()

+---------+--------------+--------------+--------------+--------------+-------+
|   amount|oldbalanceOrig|newbalanceOrig|oldbalanceDest|newbalanceDest|isFraud|
+---------+--------------+--------------+--------------+--------------+-------+
|  9839.64|      170136.0|     160296.36|           0.0|           0.0|      0|
|  1864.28|       21249.0|      19384.72|           0.0|           0.0|      0|
|    181.0|         181.0|           0.0|           0.0|           0.0|      1|
|    181.0|         181.0|           0.0|       21182.0|           0.0|      1|
| 11668.14|       41554.0|      29885.86|           0.0|           0.0|      0|
|  7817.71|       53860.0|      46042.29|           0.0|           0.0|      0|
|  7107.77|      183195.0|     176087.23|           0.0|           0.0|      0|
|  7861.64|     176087.23|     168225.59|           0.0|           0.0|      0|
|  4024.36|        2671.0|           0.0|           0.0|           0.0|      0|
|  5337.77|       41720.0|      36382.23

In [8]:
#Separating data into labels (y) and features (x)
y = df.select("isFraud").toPandas()
X = df.drop("isFraud").toPandas()

### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [9]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    random_state = 1
)

In [10]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
LR_model = LogisticRegression(random_state = 1)

In [11]:
# Fit the model using training data
LR_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [12]:
# Make a prediction using the testing data
LR_predictions = LR_model.predict(X_test)

In [13]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, LR_predictions)

0.8904248433622179

In [14]:
# Generate a confusion matrix for the model
delta = confusion_matrix(y_test, LR_predictions)
delta_df = pd.DataFrame(delta, 
                                index = ['Actual Healthy Loans (low-risk)', 
                                'Actual Non-Healthy Loans (high-risk)'], 
                                columns = ['Predicted Healthy Loans (low-risk)', 'Predicted Non-Healthy Loans (high-risk)']
                              )
delta_df

Unnamed: 0,Predicted Healthy Loans (low-risk),Predicted Non-Healthy Loans (high-risk)
Actual Healthy Loans (low-risk),1583154,2718
Actual Non-Healthy Loans (high-risk),419,1508


In [15]:
# Print the classification report for the model
print(classification_report(y_test, LR_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1585872
           1       0.36      0.78      0.49      1927

    accuracy                           1.00   1587799
   macro avg       0.68      0.89      0.74   1587799
weighted avg       1.00      1.00      1.00   1587799



---




In [16]:
# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
ROS_model = RandomOverSampler(random_state = 1)

# Fit the original training data to the random_oversampler model
X_oversampled, y_oversampled = ROS_model.fit_resample(X_train, y_train)

In [17]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
LR_oversampled_model = LogisticRegression(random_state = 1)

# Fit the model using the resampled training data
LR_oversampled_model.fit(X_oversampled, y_oversampled)

# Make a prediction using the testing data
LR_oversampled_pred = LR_oversampled_model.predict(X_test)

  y = column_or_1d(y, warn=True)


In [18]:
# Print the balanced_accuracy score of the model 
balanced_accuracy_score(y_test, LR_oversampled_pred)

0.8426175294757221

In [19]:
# Generate a confusion matrix for the model
epsilon = confusion_matrix(y_test, LR_oversampled_pred)
epsilon_df = pd.DataFrame(epsilon, 
                                index = ['Actual Healthy Loans (low-risk)', 
                                'Actual Non-Healthy Loans (high-risk)'], 
                                columns = ['Predicted Healthy Loans (low-risk)', 'Predicted Non-Healthy Loans (high-risk)']
                              )
epsilon_df

Unnamed: 0,Predicted Healthy Loans (low-risk),Predicted Non-Healthy Loans (high-risk)
Actual Healthy Loans (low-risk),1122083,463789
Actual Non-Healthy Loans (high-risk),43,1884


In [20]:
# Print the classification report for the model
print(classification_report(y_test, LR_oversampled_pred))

              precision    recall  f1-score   support

           0       1.00      0.71      0.83   1585872
           1       0.00      0.98      0.01      1927

    accuracy                           0.71   1587799
   macro avg       0.50      0.84      0.42   1587799
weighted avg       1.00      0.71      0.83   1587799

