### Configurando ambiente

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from functools import reduce

# Start the SparkSession used by every cell below (getOrCreate reuses an
# already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("Criação de github users Table")
    .getOrCreate()
)


### Data Load

In [8]:
# Path of the JSON file with the raw user records.
get_json_file = "../jsons/users_microdata.json"

# multiLine makes Spark parse JSON records that span more than one line.
df = spark.read.option("multiLine", True).json(get_json_file)

### Data Transformation

In [9]:
# Clean the company name: remove every '@' character from the column.
company_without_at = f.regexp_replace(f.col('company'), '@', '')
df = df.withColumn('company', company_without_at)

In [10]:
# Reformat created_at: parse the text with the pattern below into epoch
# seconds, then render it back as a dd/MM/yyyy string.
created_at_pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'"
created_at_seconds = f.unix_timestamp(f.col('created_at'), created_at_pattern)
df = df.withColumn(
    'created_at',
    f.date_format(created_at_seconds.cast('timestamp'), 'dd/MM/yyyy'),
)

In [14]:
# Remove the ';' separator from the data, because the table is saved below as
# a ';'-delimited CSV.
# Only string-typed columns are processed: regexp_replace always returns a
# string, so applying it to every column would silently convert non-string
# columns into strings. Non-string columns are left untouched.
string_columns = [col_name for col_name, col_type in df.dtypes if col_type == 'string']
df = reduce(
    lambda temp_df, col_name: temp_df.withColumn(col_name, f.regexp_replace(f.col(col_name), ';', '')),
    string_columns,
    df
)

In [16]:
# String cleanup for bio, company and name: strip leading and trailing spaces.
for text_column in ['bio', 'company', 'name']:
    df = df.withColumn(text_column, f.trim(f.col(text_column)))

In [17]:
# Print the column names and types of the transformed DataFrame.
df.printSchema()

root
 |-- bio: string (nullable = true)
 |-- blog: string (nullable = true)
 |-- company: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- email: string (nullable = true)
 |-- followers: string (nullable = true)
 |-- following: string (nullable = true)
 |-- name: string (nullable = true)
 |-- public_repos: string (nullable = true)



In [18]:
# Preview the transformed DataFrame (show() prints the first 20 rows by default).
# NOTE(review): the rendered preview includes user e-mail addresses; consider
# clearing this output before sharing or committing the notebook.
df.show()

+-------------------------------------+--------------------+---------------+----------+--------------------+---------+---------+--------------------+------------+
|                                  bio|                blog|        company|created_at|               email|followers|following|                name|public_repos|
+-------------------------------------+--------------------+---------------+----------+--------------------+---------+---------+--------------------+------------+
|一切好与不好都始于第一步，勇敢地踏...|https://blog.csdn...|            DUT|25/10/2016|   2559820564@qq.com|      100|       19|            JosonLee|          37|
|                                 NULL| https://chrisxu.wtf|           Meta|31/01/2013|  github@chrisxu.wtf|      100|       47|            Chris Xu|          19|
|                                 NULL|                    |           NULL|22/02/2024|                NULL|      100|       83|                NULL|           0|
|                 Software Engineer...|

### Data Saving

In [23]:
# Save the table as a ';'-separated CSV with a header row, replacing any
# output already present at the destination path.
df.write.csv(
    '../data/githubUsersTable',
    mode='overwrite',
    sep=';',
    header=True,
)