# **Préambule**

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [2]:
!wget -q https://downloads.apache.org/spark/spark-3.2.3/spark-3.2.3-bin-hadoop3.2.tgz

In [3]:
!tar xf spark-3.2.3-bin-hadoop3.2.tgz

In [4]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.3-bin-hadoop3.2"
import sys

In [5]:
!pip install pyspark==3.2.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark==3.2.3
  Downloading pyspark-3.2.3.tar.gz (281.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.5/281.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.3-py2.py3-none-any.whl size=281990673 sha256=68599db498e243914be733774b14083cc6a82678a5d28dd12bf8356bbbe6346a
  Stored in directory: /root/.cache/pip/wheels/9a/99/8c/e2d5ede0e1aefb33c64af344f2cd569354237f0bdd673bd243
Successfully built pyspark
Installing collected packages: py4j

In [6]:
#import findspark
#findspark.init()
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import *

In [7]:
# Check the pyspark version
import pyspark
print(pyspark.__version__)

3.2.3


In [8]:
spark = SparkSession.builder.master("local[*]").appName("MyFirstProgram").getOrCreate()
sc=spark.sparkContext

# Test the spark
df = spark.createDataFrame([{"hello": "world"} for x in range(1000)])
df.show(3, False)

+-----+
|hello|
+-----+
|world|
|world|
|world|
+-----+
only showing top 3 rows





---


# **Import**

In [9]:
# import for exercise 0

import zipfile
from google.colab import drive
import time

# import for exercise 1

from pyspark import SparkFiles
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import Row
import re
import string

# import for exercise 2

from pyspark.sql.types import *



---


# **Exercise n°0: Download and Prepare your Log File**

In [10]:
drive.mount('/content/drive')
!apt install unzip
!unzip -u "/content/drive/MyDrive/Colab_Notebooks/Big_Data/TP2/pagecounts-20160101-000000_parsed.out.zip" -d "/content/drive/MyDrive/Colab_Notebooks/Big_Data/TP2/"

Mounted at /content/drive
Reading package lists... Done
Building dependency tree       
Reading state information... Done
unzip is already the newest version (6.0-25ubuntu1.1).
The following package was automatically installed and is no longer required:
  libnvidia-common-510
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 21 not upgraded.
Archive:  /content/drive/MyDrive/Colab_Notebooks/Big_Data/TP2/pagecounts-20160101-000000_parsed.out.zip


In [11]:
wiki_url = "/content/drive/MyDrive/Colab_Notebooks/Big_Data/TP2/pagecounts-20160101-000000_parsed.out"
spark.sparkContext.addFile(wiki_url)
wiki_RDD_v1 = sc.textFile(SparkFiles.get("pagecounts-20160101-000000_parsed.out"))

In [12]:
wiki_RDD_v1.take(10)

['aa 271_a.C 1 4675',
 'aa Category:User_th 1 4770',
 'aa Chiron_Elias_Krase 1 4694',
 'aa Dassault_rafaele 2 9372',
 'aa E.Desv 1 4662',
 'aa File:Wiktionary-logo-en.png 1 10752',
 'aa Indonesian_Wikipedia 1 4679',
 'aa Main_Page 5 266946',
 'aa Requests_for_new_languages/Wikipedia_Banyumasan 1 4733',
 'aa Special:Contributions/203.144.160.245 1 5812']

In [13]:
def separe_elements(ligne):
    elements = ligne.split(" ")
    return (elements[0], elements[1], int(elements[2]), int(elements[3]))

In [14]:
wiki_RDD = wiki_RDD_v1.map(lambda word : separe_elements(word))

In [15]:
wiki_RDD.take(10)

[('aa', '271_a.C', 1, 4675),
 ('aa', 'Category:User_th', 1, 4770),
 ('aa', 'Chiron_Elias_Krase', 1, 4694),
 ('aa', 'Dassault_rafaele', 2, 9372),
 ('aa', 'E.Desv', 1, 4662),
 ('aa', 'File:Wiktionary-logo-en.png', 1, 10752),
 ('aa', 'Indonesian_Wikipedia', 1, 4679),
 ('aa', 'Main_Page', 5, 266946),
 ('aa', 'Requests_for_new_languages/Wikipedia_Banyumasan', 1, 4733),
 ('aa', 'Special:Contributions/203.144.160.245', 1, 5812)]

In [16]:
columns = ["Project", "Page_title","Page_hits","Page_size"]
wiki_DF = spark.createDataFrame(wiki_RDD,columns)  

In [17]:
wiki_DF.show()

+-------+--------------------+---------+---------+
|Project|          Page_title|Page_hits|Page_size|
+-------+--------------------+---------+---------+
|     aa|             271_a.C|        1|     4675|
|     aa|    Category:User_th|        1|     4770|
|     aa|  Chiron_Elias_Krase|        1|     4694|
|     aa|    Dassault_rafaele|        2|     9372|
|     aa|              E.Desv|        1|     4662|
|     aa|File:Wiktionary-l...|        1|    10752|
|     aa|Indonesian_Wikipedia|        1|     4679|
|     aa|           Main_Page|        5|   266946|
|     aa|Requests_for_new_...|        1|     4733|
|     aa|Special:Contribut...|        1|     5812|
|     aa|Special:Contribut...|        1|     5805|
|     aa|Special:Contribut...|        1|     5808|
|     aa|Special:Contribut...|        1|     5812|
|     aa|Special:ListFiles...|        1|     5035|
|     aa|Special:ListFiles...|        1|     5036|
|     aa|Special:ListFiles...|        1|     5032|
|     aa|Special:Log/Md._F...| 

---
# **Exercise n°1 : Explore Web Logs with Spark RDDs**


In [18]:
Log = StructType([
    StructField("project", StringType(), True),
    StructField("title", StringType(), True),
    StructField("hits", IntegerType(), True),
    StructField("size", IntegerType(), True)
])

In [19]:
def separe_elements(ligne):
    elements = ligne.split(" ")
    LogEx = Row(project=elements[0], title=elements[1], hits=int(elements[2]), size=int(elements[3]))
    return LogEx

In [21]:
start = time.time()

log_RDD = wiki_RDD_v1.map(lambda word : separe_elements(word))

log_RDD.take(10)

log_RDD_Thirdline = log_RDD.take(3)[-1]

print(log_RDD_Thirdline.project)
print(log_RDD_Thirdline.title)
print(log_RDD_Thirdline.hits)
print(log_RDD_Thirdline.size)

end = time.time()

print("The execution time for this question is " + str(end-start) + " seconds.")

aa
Chiron_Elias_Krase
1
4694
The execution time for this question is 0.8432097434997559 seconds.


# **Question n°1 :**

In [22]:
def print_record(rdd, number):
  myLine = rdd.take(number)[-1]
  print("Project code: " + myLine.project + "\t Page: " + myLine.title + "\t Page hits: " + str(myLine.hits) + "\t Page size: " + str(myLine.size))

In [23]:
start = time.time()

for i in range(1,21):
  print_record(log_RDD, i)

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")

Project code: aa	 Page: 271_a.C	 Page hits: 1	 Page size: 4675
Project code: aa	 Page: Category:User_th	 Page hits: 1	 Page size: 4770
Project code: aa	 Page: Chiron_Elias_Krase	 Page hits: 1	 Page size: 4694
Project code: aa	 Page: Dassault_rafaele	 Page hits: 2	 Page size: 9372
Project code: aa	 Page: E.Desv	 Page hits: 1	 Page size: 4662
Project code: aa	 Page: File:Wiktionary-logo-en.png	 Page hits: 1	 Page size: 10752
Project code: aa	 Page: Indonesian_Wikipedia	 Page hits: 1	 Page size: 4679
Project code: aa	 Page: Main_Page	 Page hits: 5	 Page size: 266946
Project code: aa	 Page: Requests_for_new_languages/Wikipedia_Banyumasan	 Page hits: 1	 Page size: 4733
Project code: aa	 Page: Special:Contributions/203.144.160.245	 Page hits: 1	 Page size: 5812
Project code: aa	 Page: Special:Contributions/5.232.61.79	 Page hits: 1	 Page size: 5805
Project code: aa	 Page: Special:Contributions/Ayarportugal	 Page hits: 1	 Page size: 5808
Project code: aa	 Page: Special:Contributions/Born2bgra

# **Question n°2**

In [24]:
start = time.time()

log_RDD_Count = log_RDD.count()
print("In this rdd, there is " + str(log_RDD_Count) + " records.")

end = time.time()

print("The execution time for this question is " + str(end-start) + " seconds.")

In this rdd, there is 3324129 records.
The execution time for this question is 12.679304122924805 seconds.


# **Question n°3**

In [25]:
start = time.time()

log_RDD_Max = log_RDD.map(lambda x : x[3]).max()
log_RDD_Min = log_RDD.map(lambda x : x[3]).min()
log_RDD_Mean = (log_RDD.map(lambda x : x[3]).sum())/log_RDD_Count
log_RDD_MaxHits = log_RDD.map(lambda x : x[2]).max()
print("About the page size, we know that :")
print("The max is : " + str(log_RDD_Max) + ", the min is : " + str(log_RDD_Min) + " and the mean is : " + str(log_RDD_Mean))

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")

About the page size, we know that :
The max is : 141180155987, the min is : 0 and the mean is : 132239.56957446598

The execution time for this question is 60.39345073699951 seconds.


# **Question n°4**

In [27]:
def keepMax(x):
  if x[3]==log_RDD_Max:
    return x

In [28]:
start = time.time()

log_RDD_MaxPages = log_RDD.filter(lambda x : keepMax(x))
log_RDD_MaxPages.take(10)

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")


The execution time for this question is 14.30329704284668 seconds.


**Bonus**

In [29]:
def keepMin(x):
  if x[3]==log_RDD_Min:
    return x

In [30]:
start = time.time()

log_RDD_MinPages = log_RDD.filter(lambda x : keepMin(x))
log_RDD_MinPages.take(10)

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")


The execution time for this question is 0.6598896980285645 seconds.


# **Question n°5**

In [31]:
def keepMaxHits(x):
  if x[2]==log_RDD_MaxHits:
    return x

In [32]:
start = time.time()

print("Number of hits of the biggest page : " + str(log_RDD_MaxPages.first().hits))

log_RDD_MaxHitsDone = log_RDD.filter(lambda x : keepMaxHits(x))
log_RDD_MaxHitsDone.take(10)

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")

Number of hits of the biggest page :5466346

The execution time for this question is 26.452588319778442 seconds.


# **Question n°6**

In [33]:
def keepSupAverage(x):
  if x[3]>log_RDD_Mean:
    return x

In [36]:
start = time.time()

log_RDD_MaxPages = log_RDD.filter(lambda x : keepSupAverage(x))
log_RDD_MaxPages.take(10)

end = time.time()

#print("\nThe execution time for this question is " + str(end-start) + " seconds.")

[Row(project='aa', title='Main_Page', hits=5, size=266946),
 Row(project='ace.mw', title='ace', hits=31, size=827168),
 Row(project='af', title='1859', hits=4, size=219540),
 Row(project='af', title='18_Oktober', hits=4, size=264724),
 Row(project='af', title='1941', hits=4, size=256344),
 Row(project='af', title='2016', hits=5, size=215498),
 Row(project='af', title='4_Januarie', hits=4, size=268828),
 Row(project='af', title='Afrika-unie', hits=1, size=172078),
 Row(project='af', title='Big_Ben', hits=13, size=136201),
 Row(project='af', title='Comrades-maraton', hits=1, size=155180)]

# **Question n°7**

In [37]:
start = time.time()

log_RDD_HitsPerProject = log_RDD.map(lambda x : (x[0], x[2])).reduceByKey(lambda x, y : x+y).sortBy(lambda row : row[1], ascending=False)

log_RDD_HitsPerProject.take(10)

log_RDD_HitsPerPage = log_RDD.map(lambda x : (x[1], x[2])).reduceByKey(lambda x, y : x+y).sortBy(lambda row : row[1], ascending=False)

log_RDD_HitsPerPage.take(5)

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")


The execution time for this question is 59.74118137359619 seconds.


# **Question n°8**

In [38]:
##### Create the cleaning function

def cleaningFunc(text):
  text = text.lower()
  translator = text.maketrans(string.punctuation,"_" * len(string.punctuation))
  text = text.translate(translator)
  text = text.split("_")
  return text

In [39]:
tst = "Requests_for.new:languages/Wikipedia-Banyumasan"
print(cleaningFunc(tst))

['requests', 'for', 'new', 'languages', 'wikipedia', 'banyumasan']


In [40]:
start = time.time()

log_RDD_PageName2 = log_RDD.flatMap(lambda x : cleaningFunc(x[1])) #unfold the table

log_RDD_PageNameWordOnly = log_RDD_PageName2.filter(lambda x: x.isalpha()).map(lambda x : (x, 1))

log_RDD_PageNameWithOccurences = log_RDD_PageNameWordOnly.reduceByKey(lambda x, y : x+y)

log_RDD_Page_UniqueWord = log_RDD_PageNameWithOccurences.filter(lambda x : x[1]==1).map(lambda x : x[0])

log_RDD_Page_UniqueWord.take(10)

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")


The execution time for this question is 40.317625284194946 seconds.


# **Question n°9**

In [41]:
start = time.time()

log_TitleName_Ordered = log_RDD_PageNameWithOccurences.sortBy(lambda row : row[1], ascending=False)

log_TitleName_Ordered.first()

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")


The execution time for this question is 5.237361669540405 seconds.


---
# **Exercise n°2: Query Web Logs with Spark SQL**





In [None]:
wiki_DF.show()

+-------+--------------------+---------+---------+
|Project|          Page_title|Page_hits|Page_size|
+-------+--------------------+---------+---------+
|     aa|             271_a.C|        1|     4675|
|     aa|    Category:User_th|        1|     4770|
|     aa|  Chiron_Elias_Krase|        1|     4694|
|     aa|    Dassault_rafaele|        2|     9372|
|     aa|              E.Desv|        1|     4662|
|     aa|File:Wiktionary-l...|        1|    10752|
|     aa|Indonesian_Wikipedia|        1|     4679|
|     aa|           Main_Page|        5|   266946|
|     aa|Requests_for_new_...|        1|     4733|
|     aa|Special:Contribut...|        1|     5812|
|     aa|Special:Contribut...|        1|     5805|
|     aa|Special:Contribut...|        1|     5808|
|     aa|Special:Contribut...|        1|     5812|
|     aa|Special:ListFiles...|        1|     5035|
|     aa|Special:ListFiles...|        1|     5036|
|     aa|Special:ListFiles...|        1|     5032|
|     aa|Special:Log/Md._F...| 

# **Question n°3 bis**

In [42]:
start = time.time()

wiki_DF.select(max("Page_size")).show()
wiki_DF.select(min("Page_size")).show()
wiki_DF.select(mean("Page_size")).show()

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")

+--------------+
|max(Page_size)|
+--------------+
|  141180155987|
+--------------+

+--------------+
|min(Page_size)|
+--------------+
|             0|
+--------------+

+------------------+
|    avg(Page_size)|
+------------------+
|132239.56957446598|
+------------------+


The execution time for this question is 44.020508766174316 seconds.


# **Question n°5 bis**

In [44]:
maxDF = wiki_DF.select(max("Page_size")).collect()[0][0]
minDF = wiki_DF.select(min("Page_size")).collect()[0][0]
meanDF = wiki_DF.select(mean("Page_size")).collect()[0][0]

In [45]:
start = time.time()

wiki_DF_MaxPages = wiki_DF.filter(wiki_DF.Page_size==maxDF)
wiki_DF_MaxPages.show()

maxHitsDF = wiki_DF.select(max("Page_hits")).collect()[0][0]

print(maxHitsDF)

wiki_DF_MaxHits = wiki_DF.filter(wiki_DF.Page_hits==maxHitsDF)

wiki_DF_MaxHits.show()

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")

+-------+----------+---------+------------+
|Project|Page_title|Page_hits|   Page_size|
+-------+----------+---------+------------+
|  en.mw|        en|  5466346|141180155987|
+-------+----------+---------+------------+

5466346
+-------+----------+---------+------------+
|Project|Page_title|Page_hits|   Page_size|
+-------+----------+---------+------------+
|  en.mw|        en|  5466346|141180155987|
+-------+----------+---------+------------+


The execution time for this question is 41.051405906677246 seconds.


**Bonus**

In [46]:
start = time.time()

wiki_DF_MinPages = wiki_DF.filter(wiki_DF.Page_size==minDF)

wiki_DF_MinPages.show()

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")

+-------+--------------------+---------+---------+
|Project|          Page_title|Page_hits|Page_size|
+-------+--------------------+---------+---------+
|     af|                1337|        1|        0|
|     af|                1433|        1|        0|
|     af|                1498|        1|        0|
|     af|                1577|        1|        0|
|     af|                1864|        1|        0|
|     af|                 689|        1|        0|
|     af|    Clifton-hangbrug|        1|        0|
|     af|      Die_Transvaler|        1|        0|
|     af|     Griekse_oergode|        1|        0|
|     af| Kategorie:22ste_eeu|        1|        0|
|     af|Kategorie:Geograf...|        2|        0|
|     af|Kategorie:Middeld...|        1|        0|
|     af|  Kategorie:Politiek|        2|        0|
|     af|             Lapland|        2|        0|
|     af|             Oktober|        1|        0|
|     af|Sjabloon:Susterpr...|        2|        0|
|     af|             Skrywer| 

# **Question n°6 bis**

In [47]:
start = time.time()

wiki_DF_AboveAverage = wiki_DF.filter(wiki_DF.Page_size>meanDF)
wiki_DF_AboveAverage.show(10)

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")

+-------+----------------+---------+---------+
|Project|      Page_title|Page_hits|Page_size|
+-------+----------------+---------+---------+
|     aa|       Main_Page|        5|   266946|
| ace.mw|             ace|       31|   827168|
|     af|            1859|        4|   219540|
|     af|      18_Oktober|        4|   264724|
|     af|            1941|        4|   256344|
|     af|            2016|        5|   215498|
|     af|      4_Januarie|        4|   268828|
|     af|     Afrika-unie|        1|   172078|
|     af|         Big_Ben|       13|   136201|
|     af|Comrades-maraton|        1|   155180|
+-------+----------------+---------+---------+
only showing top 10 rows


The execution time for this question is 4.191070556640625 seconds.


# **Question n°8 bis**

In [48]:
start = time.time()

# define with udf a function applicable on a dataframe
clean_udf = udf(cleaningFunc, ArrayType(StringType()))

# we apply our new cleaning function to the title_page column
cleaned_DF = wiki_DF.withColumn("words", clean_udf(wiki_DF["Page_title"]))

# explode: equivalent to flatMap but for dataframe
exploded_DF = cleaned_DF.select(explode(cleaned_DF["words"]).alias("word"))

# we filter to keep only lowercase words
filtered_DF = exploded_DF.filter(regexp_extract(exploded_DF["word"], r"[a-z]+", 0) != "")

# only words that appear once are kept
unique_words_DF = filtered_DF.select("word").distinct()

# on affiche
unique_words_DF.show()

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")

+----------+
|      word|
+----------+
|    trotus|
|     oscar|
|    harder|
|    morant|
|  cheyenne|
|    petrie|
|     pools|
|  guernsey|
|   serebro|
|     fijru|
|    toegra|
|     turks|
|    welkom|
|       art|
|  heinrich|
|      elsa|
|     monte|
|occidental|
|   familia|
|shevchenko|
+----------+
only showing top 20 rows


The execution time for this question is 68.49304986000061 seconds.


# **Bonus : Question n°9 bis**

In [49]:
start=time.time()

# count() creates a new "count" column with the number of repetitions of the words in the "word" column
count_DF = filtered_DF.groupBy("word").count()

# we order in descending order of the count and display the first line
most_frequent_word = count_DF.orderBy(desc("count")).first()

print(most_frequent_word)

end = time.time()

print("\nThe execution time for this question is " + str(end-start) + " seconds.")

Row(word='special', count=253531)

The execution time for this question is 66.9534637928009 seconds.
