In [None]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install pyspark


Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,717 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,245 kB]
Get:13 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8

In [None]:
# Installer Java
!apt-get install openjdk-11-jdk-headless -qq > /dev/null

# Télécharger Spark depuis l'archive Apache
!wget https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz

# Extraire Spark
!tar -xvzf spark-3.4.1-bin-hadoop3.tgz

# Installer PySpark
!pip install -q pyspark



--2025-05-07 19:37:22--  https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
Resolving archive.apache.org (archive.apache.org)... 65.108.204.189, 2a01:4f9:1a:a084::2
Connecting to archive.apache.org (archive.apache.org)|65.108.204.189|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 388341449 (370M) [application/x-gzip]
Saving to: ‘spark-3.4.1-bin-hadoop3.tgz’


2025-05-07 19:37:42 (19.3 MB/s) - ‘spark-3.4.1-bin-hadoop3.tgz’ saved [388341449/388341449]

spark-3.4.1-bin-hadoop3/
spark-3.4.1-bin-hadoop3/R/
spark-3.4.1-bin-hadoop3/R/lib/
spark-3.4.1-bin-hadoop3/R/lib/sparkr.zip
spark-3.4.1-bin-hadoop3/R/lib/SparkR/
spark-3.4.1-bin-hadoop3/R/lib/SparkR/html/
spark-3.4.1-bin-hadoop3/R/lib/SparkR/html/R.css
spark-3.4.1-bin-hadoop3/R/lib/SparkR/html/00Index.html
spark-3.4.1-bin-hadoop3/R/lib/SparkR/INDEX
spark-3.4.1-bin-hadoop3/R/lib/SparkR/help/
spark-3.4.1-bin-hadoop3/R/lib/SparkR/help/aliases.rds
spark-3.4.1-bin-hadoop3/R/lib/SparkR/help/A

In [None]:
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"


In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("WikipediaArticlesAnalysis") \
    .getOrCreate()


In [6]:
from google.colab import files
uploaded = files.upload()


Saving wiki.txt to wiki.txt


In [16]:
# Read the lines from wiki.txt
lines = spark.read.text("wiki.txt").rdd.map(lambda r: r[0])

# Properly split into url, title, and text
def split_line(line):
    parts = line.split('\t')
    url = parts[0] if len(parts) > 0 else ""
    title = parts[1] if len(parts) > 1 else ""
    text = "\t".join(parts[2:]) if len(parts) > 2 else ""
    return (url, title, text)

# Apply the correct splitting function
articles = lines.map(split_line)

# Create the DataFrame
articles_df = articles.toDF(["url", "title", "text"])

# Show the first 5 rows
articles_df.show(5, truncate=False)


+-------------------------------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
from pyspark.sql.functions import explode, split, length, col

# 1. Split the text into words
words_df = articles_df.select(
    explode(
        split(col("text"), r"\s+")
    ).alias("word")
)

# 2. Remove empty strings
words_df = words_df.filter(col("word") != "")

# 3. Calculate the length of each word and find the longest one
longest_word = words_df.withColumn(
    "word_length", length(col("word"))
).orderBy(
    col("word_length").desc()
).limit(1)

# 4. Show the longest word
longest_word.show(truncate=False)


+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|word                                                                                                                                                                                                                                                                                                                                                                                              |word_length|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [11]:
from pyspark.sql.functions import avg

# 1. Calculate the average word length
average_word_length = words_df.withColumn(
    "word_length", length(col("word"))
).select(
    avg("word_length").alias("average_word_length")
)

# 2. Show the result
average_word_length.show()


+-------------------+
|average_word_length|
+-------------------+
|   6.54196617125131|
+-------------------+



In [12]:
from pyspark.sql.functions import regexp_extract, lower, count

# 1. Keep only Latin words (letters A-Z or a-z)
latin_words_df = words_df.withColumn(
    "latin_word",
    regexp_extract(col("word"), r"\b([A-Za-z]+)\b", 1)
).filter(
    col("latin_word") != ""
)

# 2. Count frequency of each Latin word
latin_word_counts = latin_words_df.groupBy(
    lower(col("latin_word")).alias("word")
).agg(
    count("*").alias("count")
)

# 3. Find the most frequent Latin word
most_frequent_latin_word = latin_word_counts.orderBy(
    col("count").desc()
).limit(1)

# 4. Show the result
most_frequent_latin_word.show(truncate=False)


+----+-----+
|word|count|
+----+-----+
|c   |4439 |
+----+-----+



In [13]:
from pyspark.sql.functions import when, sum as _sum

# 1. Create a new column to check if the word starts with an uppercase letter
words_with_capital = words_df.withColumn(
    "starts_with_capital",
    when(
        col("word").rlike(r"^[A-ZА-Я]"),  # A-Z (anglais) + А-Я (russe)
        1
    ).otherwise(0)
)

# 2. Group by the lowercase version of the word
word_capital_counts = words_with_capital.groupBy(
    lower(col("word")).alias("word")
).agg(
    count("*").alias("total_count"),
    _sum("starts_with_capital").alias("capital_count")
)

# 3. Filter: more than 10 occurrences and more than 50% starting with capital
final_words = word_capital_counts.filter(
    (col("total_count") > 10) &
    (col("capital_count") / col("total_count") > 0.5)
)

# 4. Show the result
final_words.show(truncate=False)


+--------------+-----------+-------------+
|word          |total_count|capital_count|
+--------------+-----------+-------------+
|абвер         |14         |8            |
|сми           |245        |245          |
|киевской      |172        |153          |
|всесоюзном    |31         |21           |
|ленин,        |25         |25           |
|c.            |92         |76           |
|кубани        |22         |22           |
|каспийское    |43         |42           |
|мгб           |19         |19           |
|мпа           |27         |26           |
|ставропольский|17         |17           |
|мюнхенский    |12         |11           |
|альпы         |36         |36           |
|t.            |26         |21           |
|art           |13         |12           |
|ярославом     |11         |11           |
|карла,        |16         |16           |
|ганзейского   |18         |15           |
|антарктиды.   |12         |12           |
|тувалу        |99         |93           |
+----------

In [14]:
# 1. Find short abbreviations (2-3 letters + dot)
short_abbreviations = words_df.filter(
    col("word").rlike(r"^\w{2,3}\.$")
)

# 2. Count occurrences
short_abbreviations_counts = short_abbreviations.groupBy(
    col("word")
).agg(
    count("*").alias("count")
)

# 3. Show results
short_abbreviations_counts.orderBy(col("count").desc()).show(truncate=False)


+----+-----+
|word|count|
+----+-----+
|II. |390  |
|III.|165  |
|10. |72   |
|Inc.|68   |
|IV. |62   |
|11. |50   |
|000.|50   |
|12. |48   |
|16. |40   |
|100.|35   |
|17. |35   |
|15. |34   |
|20. |33   |
|13. |33   |
|IBM.|31   |
|14. |30   |
|18. |30   |
|19. |29   |
|VI. |28   |
|API.|27   |
+----+-----+
only showing top 20 rows



In [15]:
# 1. Find long abbreviations (at least one internal dot)
long_abbreviations = words_df.filter(
    col("word").rlike(r"^\w+\.\w+.*\.$")
)

# 2. Count occurrences
long_abbreviations_counts = long_abbreviations.groupBy(
    col("word")
).agg(
    count("*").alias("count")
)

# 3. Show results
long_abbreviations_counts.orderBy(col("count").desc()).show(truncate=False)


+-----------------+-----+
|word             |count|
+-----------------+-----+
|2.0.             |24   |
|1.0.             |17   |
|U.S.             |12   |
|D.S.             |12   |
|3.0.             |11   |
|1.1.             |10   |
|4.0.             |7    |
|5.0.             |7    |
|6.0.             |6    |
|3.1.             |5    |
|1.2.             |5    |
|OpenOffice.org.  |5    |
|M.D.S.           |4    |
|5.1.             |3    |
|1.5.             |3    |
|4.4BSD-Lite.     |3    |
|fedoralegacy.org.|3    |
|2.x.             |3    |
|J.P.             |3    |
|Kar.98k.         |3    |
+-----------------+-----+
only showing top 20 rows

