In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# !pip install pyspark

In [3]:
#Import all necessary libraries and packages

import os

import pandas as pd
import pyspark
import pyspark.sql.functions as funct
from pyspark.sql import SparkSession

import credentials as cred
# Create the Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("credit_card_clean")
    .getOrCreate()
)

In [4]:
# Location of the raw credit card JSON file. The CREDIT_CARD_JSON_PATH
# environment variable overrides the default, so the notebook can run on
# another machine without editing this cell.
credit_card_filepath = os.environ.get(
    "CREDIT_CARD_JSON_PATH",
    r'C:\Users\chito\Developer\Capstone_350\Raw_Data\cdw_sapp_credit.json',
)
# multiLine lets Spark parse JSON records that span more than one line
credit_card_df = spark.read.option('multiLine', True).json(credit_card_filepath)



---


# **DATA EXPLORATION**

In [5]:
# Inspect the column names, types and nullability Spark inferred from the JSON file
credit_card_df.printSchema()

root
 |-- BRANCH_CODE: long (nullable = true)
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- CUST_SSN: long (nullable = true)
 |-- DAY: long (nullable = true)
 |-- MONTH: long (nullable = true)
 |-- TRANSACTION_ID: long (nullable = true)
 |-- TRANSACTION_TYPE: string (nullable = true)
 |-- TRANSACTION_VALUE: double (nullable = true)
 |-- YEAR: long (nullable = true)



In [6]:
# Preview the first five rows of the raw data
credit_card_df.show(5)

+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
|BRANCH_CODE|  CREDIT_CARD_NO| CUST_SSN|DAY|MONTH|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|YEAR|
+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
|        114|4210653349028689|123459988| 14|    2|             1|       Education|             78.9|2018|
|         35|4210653349028689|123459988| 20|    3|             2|   Entertainment|            14.24|2018|
|        160|4210653349028689|123459988|  8|    7|             3|         Grocery|             56.7|2018|
|        114|4210653349028689|123459988| 19|    4|             4|   Entertainment|            59.73|2018|
|         93|4210653349028689|123459988| 10|   10|             5|             Gas|             3.59|2018|
+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
only showing top 5 rows



In [7]:
# Total number of rows in the dataframe
credit_card_df.count()

46694

In [8]:
# Summary statistics (count, mean, stddev, ...) for every column
credit_card_df.describe().show()

+-------+------------------+--------------------+-------------------+-----------------+-----------------+------------------+----------------+------------------+------+
|summary|       BRANCH_CODE|      CREDIT_CARD_NO|           CUST_SSN|              DAY|            MONTH|    TRANSACTION_ID|TRANSACTION_TYPE| TRANSACTION_VALUE|  YEAR|
+-------+------------------+--------------------+-------------------+-----------------+-----------------+------------------+----------------+------------------+------+
|  count|             46694|               46694|              46694|            46694|            46694|             46694|           46694|             46694| 46694|
|   mean| 75.00057823274939|4.210653353369430...|1.234555184812824E8|14.50736711354778|6.516875829871076|           23347.5|            NULL|  51.0393821475995|2018.0|
| stddev|51.389074910957966| 2.560464160562429E7| 2561.2609101349194| 8.06630502251638|3.453507942126967|13479.541071564714|            NULL|28.783264046884966|

## There are 46694 records in this file. Looking at the dataframe with the describe() method reveals that each field has the same count. Thus, there are no null or missing values to consider. There are 115 branch codes with branch numbers ranging from 1 to 197, which was previously confirmed in the other dataframes (note that the distinct count below finds 114 branch codes in this file). The credit card numbers are within the min and max range, so they are all 16 digits long. Similarly, the Social Security numbers are within the min and max range and are all 9 digits long. All months of the year are represented, and 28 days of the month. All transactions occurred in 2018.

### An important step is to confirm that each credit card number has a unique identifier attached to it. If not, we can conclude that there are errors or that some customers have multiple credit cards, which can be a reasonable assumption.

In [9]:
# Distinct-value count for each identifier column, one table per column
for column_name in ("CREDIT_CARD_NO", "CUST_SSN", "BRANCH_CODE"):
    credit_card_df.select(funct.countDistinct(column_name)).show()

# One row per distinct (credit card number, SSN) pair, with its number of transactions
credit_card_ssn = credit_card_df.groupBy("CREDIT_CARD_NO", "CUST_SSN").count()

# Number of distinct (credit card number, SSN) pairs
credit_card_ssn.count()

+------------------------------+
|count(DISTINCT CREDIT_CARD_NO)|
+------------------------------+
|                           952|
+------------------------------+

+------------------------+
|count(DISTINCT CUST_SSN)|
+------------------------+
|                     952|
+------------------------+

+---------------------------+
|count(DISTINCT BRANCH_CODE)|
+---------------------------+
|                        114|
+---------------------------+



952

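### The grouping indicates that there are an equal number of credit card numbers and unique customer social security numbers. Because the number of distinct (credit card number, SSN) pairs is also 952, each credit card number is paired with exactly one SSN and vice versa.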

### It is also helpful to review whether the transactions typify what you would expect from the use of credit cards, although some transactions may reflect customers' personal preferences, which could be private.

In [10]:
# Number of transactions per transaction type, most frequent first
(
    credit_card_df
    .groupBy('TRANSACTION_TYPE')
    .count()
    .orderBy(funct.desc('count'))
    .show()
)

+----------------+-----+
|TRANSACTION_TYPE|count|
+----------------+-----+
|           Bills| 6861|
|      Healthcare| 6723|
|            Test| 6683|
|       Education| 6638|
|   Entertainment| 6635|
|             Gas| 6605|
|         Grocery| 6549|
+----------------+-----+



### These are all very generic categories. More variety was expected; however, these are all within expectations for transaction types. The Test category appears to reflect test transactions: credit cards are often tested with small dollar amounts to verify the validity of the card, or for certain transactions that require a hold.



---


# **TRANSFORMING THE DATA**

In [11]:
from pyspark.sql.functions import concat, col, expr, lpad

# Build the TIMEID column from YEAR, MONTH and DAY, then drop the three source
# columns. The guard keeps the cell safe to re-run: once YEAR/MONTH/DAY have
# been dropped, running the transformation again would fail on the missing
# columns.
if "TIMEID" not in credit_card_df.columns:
    credit_card_df = (
        credit_card_df
        # Pad single-digit months with a leading zero
        .withColumn("MONTH", lpad(col("MONTH"), 2, '0'))
        # Pad single-digit days with a leading zero
        .withColumn("DAY", lpad(col("DAY"), 2, '0'))
        # Concatenate year, padded month and padded day into one string
        # (TIMEID is kept as a string; it is not converted to a date type)
        .withColumn("TIMEID", funct.format_string("%s%s%s",
                    col("YEAR"), col("MONTH"), col("DAY")))
    )

    # Check the new column against the fields it was built from
    credit_card_df.select("DAY", "MONTH", "YEAR", "TIMEID").show(5)

    # Drop the original date columns
    credit_card_df = credit_card_df.drop("YEAR", "MONTH", "DAY")

# Show the first five rows of the transformed dataframe
credit_card_df.show(5)

+---+-----+----+--------+
|DAY|MONTH|YEAR|  TIMEID|
+---+-----+----+--------+
| 14|   02|2018|20180214|
| 20|   03|2018|20180320|
| 08|   07|2018|20180708|
| 19|   04|2018|20180419|
| 10|   10|2018|20181010|
+---+-----+----+--------+
only showing top 5 rows

+-----------+----------------+---------+--------------+----------------+-----------------+--------+
|BRANCH_CODE|  CREDIT_CARD_NO| CUST_SSN|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|  TIMEID|
+-----------+----------------+---------+--------------+----------------+-----------------+--------+
|        114|4210653349028689|123459988|             1|       Education|             78.9|20180214|
|         35|4210653349028689|123459988|             2|   Entertainment|            14.24|20180320|
|        160|4210653349028689|123459988|             3|         Grocery|             56.7|20180708|
|        114|4210653349028689|123459988|             4|   Entertainment|            59.73|20180419|
|         93|4210653349028689|123459988| 

### The Day, Month, and Year have been joined into a new column called TIMEID with the appropriate format of yyyyMMdd as per the mapping document.

In [12]:
# Reorder the dataframe columns as described in the mapping document
credit_card_df = credit_card_df.select([
    'TRANSACTION_ID',
    'CREDIT_CARD_NO',
    'TIMEID',
    'CUST_SSN',
    'BRANCH_CODE',
    'TRANSACTION_TYPE',
    'TRANSACTION_VALUE',
])


In [13]:
# Write the transformed dataframe to the MySQL table over JDBC, replacing any
# existing table contents; credentials come from the local `credentials` module
(
    credit_card_df.write
    .format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306/creditcard_capstone")
    .option("dbtable", "creditcard_capstone.CDW_SAPP_CREDIT_CARD")
    .option("user", cred.user)
    .option("password", cred.password)
    .mode("overwrite")
    .save()
)


In [14]:
# Convert the Spark dataframe to a pandas dataframe and preview its first rows
pandas_df = credit_card_df.toPandas()
pandas_df.head()

Unnamed: 0,TRANSACTION_ID,CREDIT_CARD_NO,TIMEID,CUST_SSN,BRANCH_CODE,TRANSACTION_TYPE,TRANSACTION_VALUE
0,1,4210653349028689,20180214,123459988,114,Education,78.9
1,2,4210653349028689,20180320,123459988,35,Entertainment,14.24
2,3,4210653349028689,20180708,123459988,160,Grocery,56.7
3,4,4210653349028689,20180419,123459988,114,Entertainment,59.73
4,5,4210653349028689,20181010,123459988,93,Gas,3.59


In [15]:
!pip list

Package                   Version
------------------------- -----------
asttokens                 2.4.1
attrs                     23.2.0
blinker                   1.8.2
branca                    0.7.2
certifi                   2024.2.2
cffi                      1.16.0
charset-normalizer        3.3.2
click                     8.1.7
colorama                  0.4.6
comm                      0.2.2
contourpy                 1.2.1
cryptography              42.0.8
cycler                    0.12.1
dash                      2.17.0
dash-core-components      2.0.0
dash-html-components      2.0.0
dash-table                5.0.0
debugpy                   1.8.1
decorator                 5.1.1
executing                 2.0.1
fastjsonschema            2.19.1
findspark                 2.0.1
Flask                     3.0.3
folium                    0.17.0
fonttools                 4.51.0
greenlet                  3.0.3
grpcio                    1.64.1
grpcio-tools              1.64.1
idna               