In [4]:
# Imports
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    Tokenizer,
    StopWordsRemover,
    NGram,
    HashingTF,
    VectorAssembler,
)

# -----------------------------------------------------------
# Fonction 1 : nettoyage du texte
# -----------------------------------------------------------

def clean_text_column(df, text_col="text", output_col="clean_text"):
    """
    Add a normalised version of a text column to a DataFrame.

    The column expression applies, in this order:
    - lower-casing of ``text_col``;
    - replacement of URLs (``http...`` or ``www....``) by a space;
    - replacement of every character that is neither ``a``-``z`` nor
      whitespace by a space (this also removes digits, punctuation and
      accented letters);
    - collapsing of each whitespace run into a single space;
    - ``F.trim`` on the result.

    Args:
        df: DataFrame that contains ``text_col``.
        text_col: Name of the column to read the raw text from.
        output_col: Name of the column that receives the cleaned text.

    Returns:
        The result of ``df.withColumn(output_col, <cleaned expression>)``.
    """
    # (pattern, replacement) pairs, applied top to bottom. The order matters:
    # URLs must be removed before punctuation is stripped (otherwise their
    # fragments would survive as words), and whitespace is collapsed last
    # because the two earlier steps introduce extra spaces.
    substitutions = (
        (r"http\S+|www\.\S+", " "),
        (r"[^a-z\s]", " "),
        (r"\s+", " "),
    )

    cleaned = F.lower(F.col(text_col))
    for pattern, replacement in substitutions:
        cleaned = F.regexp_replace(cleaned, pattern, replacement)

    return df.withColumn(output_col, F.trim(cleaned))

ModuleNotFoundError: No module named 'distutils'

In [6]:
def build_pipeline(num_features=262144, input_col="clean_text", output_col="features"):
    """
    Build the (unfitted) Spark ML feature-extraction pipeline.

    Stages, in order:
      1. Tokenizer:        ``input_col`` -> ``tokens``
      2. StopWordsRemover: ``tokens``    -> ``unigrams``
      3. NGram (n=2):      ``unigrams``  -> ``bigrams``
      4. HashingTF:        ``unigrams``  -> ``unigram_features``
      5. HashingTF:        ``bigrams``   -> ``bigram_features``
      6. VectorAssembler:  ``unigram_features`` + ``bigram_features``
                           -> ``output_col``

    Args:
        num_features: Value passed as ``numFeatures`` to each of the two
            HashingTF stages. The default, 262144, equals 2**18.
        input_col: Name of the column holding the cleaned text to tokenize.
            Defaults to ``"clean_text"``, the default ``output_col`` of
            ``clean_text_column``.
        output_col: Name of the assembled feature-vector column.
            Defaults to ``"features"``.

    Returns:
        A ``Pipeline`` containing the six stages above; it still has to be
        fitted by the caller.
    """
    tokenizer = Tokenizer(inputCol=input_col, outputCol="tokens")

    remover = StopWordsRemover(inputCol="tokens", outputCol="unigrams")

    # Bigrams are built from the stop-word-filtered tokens, so a bigram can
    # join two words that were separated by a stop word in the original text.
    bigrammer = NGram(n=2, inputCol="unigrams", outputCol="bigrams")

    # Unigrams and bigrams are hashed separately, each with its own
    # `num_features`-sized HashingTF stage.
    hashing_unigrams = HashingTF(
        numFeatures=num_features,
        inputCol="unigrams",
        outputCol="unigram_features",
    )
    hashing_bigrams = HashingTF(
        numFeatures=num_features,
        inputCol="bigrams",
        outputCol="bigram_features",
    )

    assembler = VectorAssembler(
        inputCols=["unigram_features", "bigram_features"],
        outputCol=output_col,
    )

    return Pipeline(stages=[
        tokenizer,
        remover,
        bigrammer,
        hashing_unigrams,
        hashing_bigrams,
        assembler,
    ])

In [None]:
# Create (or reuse) the SparkSession used by every later cell and print
# version / master information as a sanity check.
import pyspark
from pyspark.sql import SparkSession

print("Version de PySpark :", pyspark.__version__)

spark = (
    SparkSession.builder
    .master("local[*]")  # VERY IMPORTANT: force Spark to run in local mode
    .appName("Test_Spark_Minimal")
    .getOrCreate()
)

# Log level is set on the context of the session obtained above.
spark.sparkContext.setLogLevel("INFO")

print("SparkContext master :", spark.sparkContext.master)
print("Spark version :", spark.version)

Version de PySpark : 4.0.1


In [None]:
# Load the training CSV into `df` and display a preview plus its schema.
path_train = "./train.csv"   # adjust to your environment
# Output directory.
# NOTE(review): `path_output` is not used by any cell visible in this notebook.
path_output = "./train_parquet_notebook"   # output directory

df = (
    spark.read
    .option("header", True)       # CSV reader option "header" enabled
    .option("inferSchema", True)  # CSV reader option "inferSchema" enabled
    .csv(path_train)
)

df.show(5)
df.printSchema()

print("✓ CSV chargé")

In [None]:
# Keep only the two columns used downstream: `sentiment` (renamed to `label`)
# and `text` (cast to string), then drop rows where either one is null.
# NOTE(review): this rebinds `df`, so the cell is not idempotent. After one
# run `df` no longer has a `sentiment` column and re-running the cell fails
# until the CSV load cell is executed again.
df = df.select(
    F.col("sentiment").alias("label"),
    F.col("text").cast(StringType())
).dropna(subset=["label", "text"])

df.show(5)
print("Nombre de lignes :", df.count())

In [None]:
# Add the `clean_text` column computed from `text`, then show raw and cleaned
# text side by side (10 rows, untruncated) for a visual check.
df_clean = clean_text_column(df, text_col="text", output_col="clean_text")

df_clean.select("text", "clean_text").show(10, truncate=False)
print("✓ Texte nettoyé")

In [None]:
# Instantiate the feature pipeline; 262144 is also build_pipeline's default.
pipeline = build_pipeline(num_features=262144)
print("✓ Pipeline créé")

In [None]:
# Fit the pipeline on the cleaned data, apply the fitted model to the same
# data, and preview the resulting `label` / `features` columns.
print("⏳ Entraînement du pipeline...")
model = pipeline.fit(df_clean)

print("⏳ Transformation...")
df_final = model.transform(df_clean)

df_final.select("label", "features").show(5, truncate=False)

print("✓ Transformation terminée")

In [None]:
# NOTE(review): this cell repeats the fit / transform / preview steps on
# `df_clean` and rebinds `model` and `df_final`. The same sequence appears in
# an earlier cell of this notebook, so running both fits the pipeline twice.
# Consider deleting one copy. If the intent was to persist `df_final` (an
# output directory `path_output` is defined earlier but never used), that
# step is missing — TODO confirm with the author.
print("⏳ Entraînement du pipeline...")
model = pipeline.fit(df_clean)

print("⏳ Transformation...")
df_final = model.transform(df_clean)

df_final.select("label", "features").show(5, truncate=False)

print("✓ Transformation terminée")

In [None]:
# Stop the SparkSession created earlier in the notebook. Cells that use
# `spark` cannot be run after this without re-creating the session.
spark.stop()
print("✓ Spark arrêté")