<a href="https://colab.research.google.com/github/DataEtnos/Apache_Spark/blob/main/SPARKML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Instalar a última versão do PySpark
!pip install pyspark #==3.3.1

# Instalar o NGROK
!wget -qnc https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip -n -q ngrok-stable-linux-amd64.zip


# Iniciar a sessão spark
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
      .config('spark.ui.port', '4050')
      .appName("SparkSQL")
      .getOrCreate()
)

# Autenticar a sessão do SparkUI com NGROK
!./ngrok authtoken 2pp0qW25AGqGYK6xftvt4Q4XXLv_JsvPyKtvYupn7z65UHQ5
get_ipython().system_raw('./ngrok http 4050 &')
!sleep 10
!curl -s http://localhost:4040/api/tunnels | grep -Po 'public_url":"(?=https)\K[^"]*'

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


# **MODELAGEM DE COLUNAS**

In [7]:
from pyspark.sql.types import *

schema_remetente_destinatario = StructType([
    StructField('nome', StringType()),
    StructField('banco', StringType()),
    StructField('tipo', StringType())
])

schema_base_pix = StructType([
    StructField('id_transacao', IntegerType()),
    StructField('valor', DoubleType()),
    StructField('remetente', schema_remetente_destinatario),
    StructField('destinatario', schema_remetente_destinatario),
    StructField('chave_pix', StringType()),
    StructField('categoria', StringType()),
    StructField('transaction_date', StringType()),
    StructField('fraude', IntegerType())
])

caminho_json = '/content/case_final.json'
df = spark.read.json(
    caminho_json,
    schema=schema_base_pix,
    timestampFormat="yyyy-MM-dd HH:mm:ss")

In [8]:
df.show()



+------------+------------------+--------------------+--------------------+---------+-------------+-------------------+------+
|id_transacao|             valor|           remetente|        destinatario|chave_pix|    categoria|   transaction_date|fraude|
+------------+------------------+--------------------+--------------------+---------+-------------+-------------------+------+
|        1000|            588.08|{Jonathan Gonsalv...|{Calebe Melo, Cai...|aleatoria|       outros|2021-07-16 05:00:55|     0|
|        1001|           80682.5|{Jonathan Gonsalv...|{Davi Lucas Perei...|  celular|transferencia|2022-04-20 12:34:01|     1|
|        1002|             549.9|{Jonathan Gonsalv...|{Sabrina Castro, ...|      cpf|        lazer|2022-07-10 16:51:34|     0|
|        1003|             90.83|{Jonathan Gonsalv...|{Francisco da Con...|aleatoria|   transporte|2022-10-20 10:57:36|     0|
|        1004|13272.619999999999|{Jonathan Gonsalv...|{Isabelly Ferreir...|    email|transferencia|2021-04-06 2

In [9]:
df.printSchema()

root
 |-- id_transacao: integer (nullable = true)
 |-- valor: double (nullable = true)
 |-- remetente: struct (nullable = true)
 |    |-- nome: string (nullable = true)
 |    |-- banco: string (nullable = true)
 |    |-- tipo: string (nullable = true)
 |-- destinatario: struct (nullable = true)
 |    |-- nome: string (nullable = true)
 |    |-- banco: string (nullable = true)
 |    |-- tipo: string (nullable = true)
 |-- chave_pix: string (nullable = true)
 |-- categoria: string (nullable = true)
 |-- transaction_date: string (nullable = true)
 |-- fraude: integer (nullable = true)



In [10]:
from pyspark.sql.functions import *

df_flatten = df.withColumns({
    'destinatario_nome': col('destinatario').getField('nome'), #retirando de bibliotec a para lista
    'destinatario_banco': col('destinatario').getField('banco'),
    'destinatario_tipo': col('destinatario').getField('tipo'),
}).drop('remetente', 'destinatario') # dropando remetente

In [14]:
df_flatten.show()

+------------+------------------+---------+-------------+-------------------+------+--------------------+------------------+-----------------+
|id_transacao|             valor|chave_pix|    categoria|   transaction_date|fraude|   destinatario_nome|destinatario_banco|destinatario_tipo|
+------------+------------------+---------+-------------+-------------------+------+--------------------+------------------+-----------------+
|        1000|            588.08|aleatoria|       outros|2021-07-16 05:00:55|     0|         Calebe Melo|             Caixa|               PF|
|        1001|           80682.5|  celular|transferencia|2022-04-20 12:34:01|     1|  Davi Lucas Pereira|             Caixa|               PJ|
|        1002|             549.9|      cpf|        lazer|2022-07-10 16:51:34|     0|      Sabrina Castro|            Nubank|               PF|
|        1003|             90.83|aleatoria|   transporte|2022-10-20 10:57:36|     0|Francisco da Conc...|            Nubank|               PJ|

In [11]:
df_flatten.printSchema()

root
 |-- id_transacao: integer (nullable = true)
 |-- valor: double (nullable = true)
 |-- chave_pix: string (nullable = true)
 |-- categoria: string (nullable = true)
 |-- transaction_date: string (nullable = true)
 |-- fraude: integer (nullable = true)
 |-- destinatario_nome: string (nullable = true)
 |-- destinatario_banco: string (nullable = true)
 |-- destinatario_tipo: string (nullable = true)



# **TRANSFORMAÇÃO DE  FEATURES**

In [15]:
from pyspark.ml.feature import StringIndexer
# Transformando colunas do tipo texto para numero
indexer = StringIndexer(
    inputCols=[
        'destinatario_nome',
        'destinatario_banco',
        'destinatario_tipo',
        'categoria',
        'chave_pix'
    ],
    outputCols=[
        'destinatario_nome_index',
        'destinatario_banco_index',
        'destinatario_tipo_index',
        'categoria_index',
        'chave_pix_index'
    ]
)

df_index = indexer.fit(df_flatten).transform(df_flatten)

In [17]:
df_index.printSchema()

root
 |-- id_transacao: integer (nullable = true)
 |-- valor: double (nullable = true)
 |-- chave_pix: string (nullable = true)
 |-- categoria: string (nullable = true)
 |-- transaction_date: string (nullable = true)
 |-- fraude: integer (nullable = true)
 |-- destinatario_nome: string (nullable = true)
 |-- destinatario_banco: string (nullable = true)
 |-- destinatario_tipo: string (nullable = true)
 |-- destinatario_nome_index: double (nullable = false)
 |-- destinatario_banco_index: double (nullable = false)
 |-- destinatario_tipo_index: double (nullable = false)
 |-- categoria_index: double (nullable = false)
 |-- chave_pix_index: double (nullable = false)



In [16]:
df_index.show()

+------------+------------------+---------+-------------+-------------------+------+--------------------+------------------+-----------------+-----------------------+------------------------+-----------------------+---------------+---------------+
|id_transacao|             valor|chave_pix|    categoria|   transaction_date|fraude|   destinatario_nome|destinatario_banco|destinatario_tipo|destinatario_nome_index|destinatario_banco_index|destinatario_tipo_index|categoria_index|chave_pix_index|
+------------+------------------+---------+-------------+-------------------+------+--------------------+------------------+-----------------+-----------------------+------------------------+-----------------------+---------------+---------------+
|        1000|            588.08|aleatoria|       outros|2021-07-16 05:00:55|     0|         Calebe Melo|             Caixa|               PF|                12045.0|                     4.0|                    1.0|            6.0|            3.0|
|       

In [19]:
#separando os dados em fraude e não fraude  para verificar o modelo  e não enviesar o modelo
is_fraud = df_index.filter("fraude == 1") # filter(col('fraude') == 1)
no_fraud = df_index.filter("fraude == 0")

In [21]:
#O código seleciona uma amostra de 1% dos dados sem fraude
no_fraud = no_fraud.sample(False, 0.01, seed=123)

In [22]:
#O código combina os dados de fraude (is_fraud) e não fraude (no_fraud) em um único
df_concat = no_fraud.union(is_fraud)
df = df_concat.sort("transaction_date")
df.count()

15371

In [23]:
#separa em treino e teste de forma aleatoria  sendo 70% treino e 30% para teste
train, test = df.randomSplit([0.7, 0.3], seed = 123)
print("train =", train.count(), " test =", test.count())

train = 10701  test = 4670


In [24]:
is_fraud = udf(lambda fraud: 1.0 if fraud > 0 else 0.0, DoubleType())
train = train.withColumn("is_fraud", is_fraud(train.fraude))

In [25]:
train.printSchema()

root
 |-- id_transacao: integer (nullable = true)
 |-- valor: double (nullable = true)
 |-- chave_pix: string (nullable = true)
 |-- categoria: string (nullable = true)
 |-- transaction_date: string (nullable = true)
 |-- fraude: integer (nullable = true)
 |-- destinatario_nome: string (nullable = true)
 |-- destinatario_banco: string (nullable = true)
 |-- destinatario_tipo: string (nullable = true)
 |-- destinatario_nome_index: double (nullable = false)
 |-- destinatario_banco_index: double (nullable = false)
 |-- destinatario_tipo_index: double (nullable = false)
 |-- categoria_index: double (nullable = false)
 |-- chave_pix_index: double (nullable = false)
 |-- is_fraud: double (nullable = true)



In [26]:
train.columns

['id_transacao',
 'valor',
 'chave_pix',
 'categoria',
 'transaction_date',
 'fraude',
 'destinatario_nome',
 'destinatario_banco',
 'destinatario_tipo',
 'destinatario_nome_index',
 'destinatario_banco_index',
 'destinatario_tipo_index',
 'categoria_index',
 'chave_pix_index',
 'is_fraud']

In [27]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

assembler = VectorAssembler(
    inputCols = [x for x in train.columns if x not in ['transaction_date', 'fraude', 'is_fraud', 'destinatario_nome', 'destinatario_banco', 'destinatario_tipo', 'chave_pix', 'categoria']],
    outputCol="features")

In [28]:
lr = LogisticRegression().setParams(
    maxIter=100000,
    labelCol = "is_fraud",
    predictionCol="prediction"
)

In [29]:
model = Pipeline(stages=[assembler, lr]).fit(train)

In [30]:
predicted = model.transform(test)

In [31]:
predicted.show()

+------------+------------------+---------+-------------+-------------------+------+--------------------+------------------+-----------------+-----------------------+------------------------+-----------------------+---------------+---------------+--------------------+--------------------+--------------------+----------+
|id_transacao|             valor|chave_pix|    categoria|   transaction_date|fraude|   destinatario_nome|destinatario_banco|destinatario_tipo|destinatario_nome_index|destinatario_banco_index|destinatario_tipo_index|categoria_index|chave_pix_index|            features|       rawPrediction|         probability|prediction|
+------------+------------------+---------+-------------+-------------------+------+--------------------+------------------+-----------------+-----------------------+------------------------+-----------------------+---------------+---------------+--------------------+--------------------+--------------------+----------+
|        1011|          21345.91| 

In [32]:
predicted = predicted.withColumn('is_fraud', is_fraud(predicted.fraude))
predicted.crosstab('is_fraud', 'prediction').show()

+-------------------+---+----+
|is_fraud_prediction|0.0| 1.0|
+-------------------+---+----+
|                1.0|  0|4669|
|                0.0|  1|   0|
+-------------------+---+----+



In [33]:
df_teste_cols = [
    'id_transacao',
    'valor',
    'transaction_date',
    'destinatario_nome_index',
    'destinatario_banco_index',
    'destinatario_tipo_index',
    'chave_pix_index',
    'categoria_index',
    'fraude'
]

df_teste_data = [
    (999,103.2, "2023-01-01 11:56:41", 328.0, 4.0, 1.0, 3.0, 5.0, 0),
    (998, 500000.0, "2023-01-01 11:56:41", 328.0, 2.0, 3.0, 2.0, 5.0, 1),
    (997, 19999.0, "2023-01-01 11:56:41", 328.0, 1.0, 2.0, 1.0, 5.0, 0),
]

df_teste = spark.createDataFrame(df_teste_data).toDF(*df_teste_cols)

In [34]:
new_prediction = model.transform(df_teste)

In [35]:
new_prediction.show()

+------------+--------+-------------------+-----------------------+------------------------+-----------------------+---------------+---------------+------+--------------------+--------------------+-----------+----------+
|id_transacao|   valor|   transaction_date|destinatario_nome_index|destinatario_banco_index|destinatario_tipo_index|chave_pix_index|categoria_index|fraude|            features|       rawPrediction|probability|prediction|
+------------+--------+-------------------+-----------------------+------------------------+-----------------------+---------------+---------------+------+--------------------+--------------------+-----------+----------+
|         999|   103.2|2023-01-01 11:56:41|                  328.0|                     4.0|                    1.0|            3.0|            5.0|     0|[999.0,103.2,328....|[78.3947369321129...|  [1.0,0.0]|       0.0|
|         998|500000.0|2023-01-01 11:56:41|                  328.0|                     2.0|                    3.0|