In [3]:
import re 
import os
import json
from bs4 import BeautifulSoup

from pyspark.sql import Row
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf,col,regexp_extract,regexp_replace


# Obtain the SparkContext through getOrCreate() so the cell can be re-run
# without trying to construct a second context.
sc = SparkContext.getOrCreate()
# SQL entry point wrapping the context; this is the object the notebook later
# calls `.read.json(...)` on to load the posts.
# NOTE(review): SQLContext is documented as deprecated since Spark 3.0 in
# favour of SparkSession — confirm the cluster's Spark version before migrating.
sqlsc = SQLContext(sc)



In [15]:
# French month name (lower-case) -> two-digit month number.
_FRENCH_MONTHS = {
    "janvier": "01",
    "février": "02",
    "mars": "03",
    "avril": "04",
    "mai": "05",
    "juin": "06",
    "juillet": "07",
    "août": "08",
    "septembre": "09",
    "octobre": "10",
    "novembre": "11",
    "décembre": "12",
}

# groups: 1=day, 2=month name, 3=year, 4/5/6=hours/minutes/seconds.
# `[^\W\d_]+` matches a run of (Unicode) letters, so accented month names and
# the three-letter "mai" are both accepted.
_TIMESTAMP_RE = re.compile(
    r"(\d{2})\s+([^\W\d_]+)\s+(\d{4})\D*?(\d{2}):(\d{2}):(\d{2})"
)


def clean_timestamp(timestamp):
    """Convert a French textual timestamp into ``DD-MM-YYYY-HHMMSS``.

    Parameters
    ----------
    timestamp : str
        Text containing a two-digit day, a French month name, a four-digit
        year and an ``HH:MM:SS`` time, e.g. ``"04 avril 2022 à 00:06:07"``.

    Returns
    -------
    str
        The normalised timestamp, e.g. ``"04-04-2022-000607"``.

    Raises
    ------
    ValueError
        If ``timestamp`` is not a string, does not contain the expected
        pattern, or names a month that is not a French month.
    """
    match = _TIMESTAMP_RE.search(timestamp) if isinstance(timestamp, str) else None
    if match is None:
        raise ValueError(f"Unrecognised timestamp: {timestamp!r}")

    day, month_name, year, hours, minutes, seconds = match.groups()

    # The previous version formatted an undefined `month` variable (NameError
    # on every call); the month number is now looked up from the month name.
    month = _FRENCH_MONTHS.get(month_name.lower())
    if month is None:
        raise ValueError(f"Unknown month {month_name!r} in timestamp: {timestamp!r}")

    time = f"{hours}{minutes}{seconds}"
    return f"{day}-{month}-{year}-{time}"
    
def clean_post_text(post_text):
    """Extract the author's own text from a post's HTML.

    Quoted messages (``<blockquote class="blockquote-jv">``) are emptied, then
    the run of top-level ``<p>`` elements at the *end* of the document is
    collected and returned as one string, in document order.

    Parameters
    ----------
    post_text : str or None
        HTML fragment of the post. ``None`` or an empty string (e.g. a null
        Spark column value) yields ``""``.

    Returns
    -------
    str
        Concatenated text of the trailing ``<p>`` elements; ``""`` when the
        document does not end with a ``<p>``.
    """
    if not post_text:
        return ""

    soup = BeautifulSoup(post_text, "html.parser")

    # Calling soup("x") is find_all("x"), which would look for a tag literally
    # named "blockquote.blockquote-jv" and never match. CSS selectors must go
    # through select().
    for quote in soup.select("blockquote.blockquote-jv"):
        quote.clear()

    # Walk the top-level nodes backwards and stop at the first non-<p> node,
    # so only the paragraphs after the last other element are kept.
    trailing_paragraphs = []
    for elem in reversed(list(soup)):
        if elem.name != "p":
            break
        trailing_paragraphs.append(elem.get_text())

    # The nodes were visited last-to-first; restore document order before
    # joining (the previous version returned the paragraphs reversed).
    return "".join(reversed(trailing_paragraphs))

In [24]:
# French month name -> two-digit month number, used to normalise the "month"
# column extracted from the textual timestamp.
months = {
    "janvier": "01",
    "février": "02",
    "mars": "03",
    "avril": "04",
    "mai": "05",
    "juin": "06",
    "juillet": "07",
    "août": "08",
    "septembre": "09",
    "octobre": "10",
    "novembre": "11",
    "décembre": "12",
}

# g1:day | g2:month | g3:year | g4:time
# The month group accepts 3 to 9 characters: "mai" has only three letters, so
# the former {4,9} bound made every May timestamp fail to match.
timestamp_regex = r"(\d{2})\s(\D{3,9})\s(\d{4})\s[à]\s(\d{2}:\d{2}:\d{2})"

udf_clean_text = udf(clean_post_text, StringType())
# .get() yields null for an unmatched or null month instead of raising
# KeyError inside the executor (regexp_extract returns "" when nothing matches).
months_udf = udf(lambda x: months.get(x), StringType())

# NOTE(review): `post_regex` is not defined in this cell or in the visible
# cells above; it has to exist in the kernel before this cell runs — confirm
# and define it explicitly so the notebook survives Restart & Run All.
posts_df = (
    sqlsc.read.json("./posts")
    .withColumn("post_text", udf_clean_text(col("post_text")))
    # Spark regex that removes the text left over after the HTML cleaning.
    .withColumn("post_text", regexp_replace("post_text", post_regex, ""))
    .withColumn("day", regexp_extract(col("timestamp"), timestamp_regex, 1))
    .withColumn("month", months_udf(regexp_extract(col("timestamp"), timestamp_regex, 2)))
    .withColumn("year", regexp_extract(col("timestamp"), timestamp_regex, 3))
    .withColumn("time", regexp_extract(col("timestamp"), timestamp_regex, 4))
)
posts_df.show(100)

+--------------------+---------------+----+----------+--------------------+--------------------+--------------------+--------+---+-----+----+--------+
|                 _id|         author|page|   post_id|           post_text|           text_hash|           timestamp|topic_id|day|month|year|    time|
+--------------------+---------------+----+----------+--------------------+--------------------+--------------------+--------+---+-----+----+--------+
|624ac9f383bc09a13...|      judgeDoom|  15|1161937822|Non mais il va pa...|PHA+TGUgICAgICAgI...|04 avril 2022 à 0...|69373406| 04|   04|2022|00:06:07|
|624ac9f383bc09a13...|     LatinDeter|  15|1161937902|Nul à chier sérieux.|PHA+TGUgICAgICAgI...|04 avril 2022 à 0...|69373406| 04|   04|2022|00:07:14|
|624ac9f383bc09a13...|    AnjouAstraI|  15|1161937918|         Ben voyons |PHA+TGUgICAgICAgI...|04 avril 2022 à 0...|69373406| 04|   04|2022|00:07:18|
|624ac9f383bc09a13...|      judgeDoom|  15|1161938046|Chez tous ceux qu...|PHA+TGUgICAgICAgI..