# 01 word count

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, lower, col, count

# Initialize a Spark Session
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("WordCount") \
    .getOrCreate()

In [2]:
# data path
file_path = "../data/le_petit_prince.txt"
# Read text file into DataFrame
# Each line becomes a row in the DataFrame
df = spark.read.text(file_path)

In [3]:
# Process the text and count words
word_counts = df \
    .select(explode(split(lower(col("value")), "\\s+")).alias("word")) \
    .filter(col("word") != "") \
    .groupBy("word") \
    .count() \
    .orderBy("count", ascending=False)

In [4]:
# Show top 20 most frequent words
word_counts.show(50)  

+-------+-----+
|   word|count|
+-------+-----+
|     le|  454|
|      -|  434|
|     de|  428|
|     je|  316|
|     et|  283|
|     il|  260|
|    les|  249|
|     un|  230|
|     la|  219|
|  petit|  193|
|      à|  178|
|     ne|  169|
|    que|  154|
|    pas|  148|
|     tu|  136|
|    des|  131|
|  c'est|  126|
|    dit|  125|
|      ?|  125|
|      !|  123|
|   mais|  123|
|    une|  123|
|     me|  103|
|   pour|   98|
|    qui|   97|
|     ce|   94|
|   bien|   92|
|     se|   85|
|      :|   84|
|  comme|   80|
|     en|   79|
|    est|   77|
| prince|   74|
|     du|   71|
|     si|   67|
|     ça|   66|
|    sur|   66|
|   dans|   63|
|   j'ai|   62|
|    mon|   61|
|   elle|   58|
|   plus|   53|
|     au|   51|
|prince.|   50|
|   tout|   49|
|   très|   49|
|    lui|   47|
|    par|   47|
|    son|   46|
|     on|   46|
+-------+-----+
only showing top 50 rows



In [5]:
# Save results if needed
output_file_path = "../data/out/le_petit_prince_count"
word_counts.write.csv(output_file_path, header=True)

In [None]:
# Stop the Spark session
spark.stop()