In [0]:
from pyspark.sql.functions import when, lit, col, from_unixtime, regexp_replace, date_format, year, month, dayofmonth, regexp_extract
from pyspark.sql.types import StructType, StructField, LongType, StringType, TimestampType
 
# Location of the raw tweet export (CSV) that the Spark read below consumes.
inputPath = "/mnt/rawleonardo/rawzone/Twitter.csv"

# Regex alternation of text emoticons and emoji that mark a tweet as "Positivo".
# NOTE(review): the short alternatives (e.g. `:p`, `:s`, `:d`) carry no word boundary,
# so they also match inside ordinary text such as "user:segue".
ListaPos = r":\)|:\]|:P|:p|:s|:d|:D|:\}|;\)|;\]|;P|;p|;s|;d|;D|;\}|=\)|=\]|=P|=p|=d|=D|=\}|=S|:-\)|:-\]|:-P|:-p|:-s|:-d|:-D|:-\}|;-\)|;-\]|;-P|;-p|;-s|;-d|;-D|;-\}|=-\)|=-\]|=-P|=-p|=-d|=-D|=-\}|=-S|: \)|\(:|\( :|:- \)|😂|❤️|😍|🤣|😊|🙏|💕|😘|👍|😅|👏|😁|🔥|💖|😆|💪|😉|👌|🤗|😎|😇|🌹|🎉|💞|✌️|✨|😌|🌸|🙌|😋|😏|🙂|🤩|😄|😀|💯|🤭|❣️|😜|🙋|🤪|👊|💃|😚|😝|🙃|🍀|🌷|😻|✅|🌈|😈|🤘|✔️|💐|🎊|💘|🌺"

# Regex alternation of text emoticons and emoji that mark a tweet as "Negativo".
# NOTE(review): `:c` / `:C` likewise match inside plain words that follow a colon.
ListaNeg = r":\(|:\/|:c|:\\|:C|:\[|;\(|;\/|;c|;\\|;C|;\[|=\(|=\/|=c|=\\|=C|=\[|:-\(|:-\/|:-c|:-\\|:-C|:-\[|;-\(|;-\/|;-c|;-\\|;-C|;-\[|=-\(|=-\/|=-c|=-\\|=-C|=-\[|😭|😢|🤔|🙄|😔|🤦|😱|😒|😪|😑|😞|😩|😡|😥|😳|✋|😴|😬|😓|😣|🏃|☹️|😠|🥺|🤬"
 
# Regex matching http(s):// URLs and bare "www." URLs; used to strip links from tweet text.
# Declared as a raw string: the pattern contains regex escapes (\/, \., \s) that are not
# valid Python string escapes and raise invalid-escape warnings in a normal literal.
# The resulting pattern text is identical to the previous non-raw literal.
rm_links = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"

# Regex alternation of the name/nickname spellings a tweet must contain to be kept.
# Matching is case-sensitive, hence the explicit capitalised and lower-case variants.
nomes = "Bolsonaro|bolsonaro|bonoro|Bonoro|bozo|Bozo|Jair|jair|bozonaro|Bozonaro|jairbolsonaro|Jairbolsonaro"
 
# Explicit schema for the raw CSV: three columns, each left with StructField's defaults.
_schema_columns = (
    ("id", StringType()),
    ("text", StringType()),
    ("datetime", TimestampType()),
)
schema = StructType(
    [StructField(column_name, column_type) for column_name, column_type in _schema_columns]
)
 
# Load the raw tweets: semicolon-separated CSV parsed with the explicit schema.
df_raw = spark.read.csv(inputPath, schema=schema, sep=";")

# A single pattern that finds the first positive OR negative symbol in a tweet.
emoticon_pattern = f"{ListaPos}|{ListaNeg}"

df_b = (
    df_raw
    # "HH" is hour-of-day (00-23). The previous "hh" is the 12-hour clock and, with no
    # AM/PM marker in the pattern, made morning and afternoon timestamps indistinguishable.
    .withColumn("datetime", date_format(col("datetime"), "yyyy-MM-dd HH:mm:ss"))
    .withColumnRenamed("datetime", "tweet_data")
    # Strip URLs before looking for emoticons so that "://" in links is never read as one.
    .withColumn("text", regexp_replace(col("text"), rm_links, ""))
    # First matching symbol, or an empty string when the tweet contains none.
    .withColumn("Simbolo", regexp_extract(col("text"), emoticon_pattern, 0))
    # Classify by the extracted symbol; tweets with no symbol fall through to "Neutro".
    .withColumn(
        "Sentimento",
        when(col("Simbolo").rlike(ListaPos), "Positivo")
        .when(col("Simbolo").rlike(ListaNeg), "Negativo")
        .otherwise("Neutro"),
    )
    # Date parts used later as partition columns.
    .withColumn("Ano", year("tweet_data"))
    .withColumn("Mes", month("tweet_data"))
    .withColumn("Dia", dayofmonth("tweet_data"))
    # Keep only tweets whose (link-free) text mentions one of the tracked names.
    .filter(col("text").rlike(nomes))
)

display(df_b)

In [0]:
# Persist the processed DataFrame as Parquet, partitioned by "Ano"/"Mes"/"Dia"
# (year/month/day); any previous output at the target path is overwritten.
(
    df_b.write
    .partitionBy("Ano", "Mes", "Dia")
    .mode('overwrite')
    .format("parquet")
    .save("/mnt/rawleonardo/refzone/Twitter2018")
)

In [0]:
# Merge all partition files for 2018 into a single Parquet file, which is easier to
# load from Power BI.
# NOTE(review): reading from the "Ano=2018" sub-directory presumably means "Ano" is not
# restored as a column (partition discovery starts below it) — confirm this is intended.
df = spark.read.parquet("/mnt/rawleonardo/refzone/Twitter2018/Ano=2018")

# - The target path is absolute (leading "/"), like every other path in this notebook;
#   the previous relative "mnt/..." was resolved against the filesystem working directory.
# - mode("overwrite") lets the cell be re-run; the default mode fails if the path exists.
# - The write returns None, so its result is deliberately not assigned back to `df`.
df.coalesce(1).write.mode("overwrite").parquet("/mnt/rawleonardo/refzone/Twitter2018/Coalesce2018")