In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 45 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 48.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=634075d5f611a5a48ff92f002b27ba37af2d226a807de652e8aef1c96c980876
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [2]:
import pyspark as ps
import random

In [3]:
sc = ps.SparkContext()
sc

In [4]:
# RDD lambda
%%time
flips = 10000000
coins = range(flips)

heads = sc.parallelize(coins) \
          .map(lambda i: random.random()) \
          .filter(lambda r: r<0.50) \
          .count()

print(heads) 

4997103
CPU times: user 93.3 ms, sys: 9.55 ms, total: 103 ms
Wall time: 9.91 s


In [5]:
# RDD Operations
pets = sc.parallelize([("cat",1),("dog",1),("cat",2)])

print(pets.reduceByKey(lambda x, y: x + y).take(10))

print(pets.groupByKey().take(10))

print(pets.sortByKey().take(10))

[('cat', 3), ('dog', 1)]
[('cat', <pyspark.resultiterable.ResultIterable object at 0x7fcf6b27b390>), ('dog', <pyspark.resultiterable.ResultIterable object at 0x7fcf6b281290>)]
[('cat', 1), ('cat', 2), ('dog', 1)]


In [None]:
# Download posts data
!wget -nc -P code/data/ https://archive.org/download/stackexchange/ai.stackexchange.com.7z/Posts.xml

File ‘code/data/Posts.xml’ already there; not retrieving.



In [None]:
# Load file into RDD
posts = sc.textFile("code/data/Posts.xml")
posts.take(3)

['<?xml version="1.0" encoding="utf-8"?>',
 '<posts>',
 '  <row Id="1" PostTypeId="1" AcceptedAnswerId="3" CreationDate="2016-08-02T15:39:14.947" Score="10" ViewCount="659" Body="&lt;p&gt;What does &quot;backprop&quot; mean? Is the &quot;backprop&quot; term basically the same as &quot;backpropagation&quot; or does it have a different meaning?&lt;/p&gt;&#xA;" OwnerUserId="8" LastEditorUserId="2444" LastEditDate="2019-11-16T17:56:22.093" LastActivityDate="2021-07-08T10:45:23.250" Title="What is &quot;backprop&quot;?" Tags="&lt;neural-networks&gt;&lt;backpropagation&gt;&lt;terminology&gt;&lt;definitions&gt;" AnswerCount="5" CommentCount="0" FavoriteCount="1" ContentLicense="CC BY-SA 4.0" />']

In [None]:
# Parse XML element
import xml.etree.ElementTree as etree
element = etree.fromstring(posts.take(3)[2])
element.items()

[('Id', '1'),
 ('PostTypeId', '1'),
 ('AcceptedAnswerId', '3'),
 ('CreationDate', '2016-08-02T15:39:14.947'),
 ('Score', '10'),
 ('ViewCount', '659'),
 ('Body',
  '<p>What does "backprop" mean? Is the "backprop" term basically the same as "backpropagation" or does it have a different meaning?</p>\n'),
 ('OwnerUserId', '8'),
 ('LastEditorUserId', '2444'),
 ('LastEditDate', '2019-11-16T17:56:22.093'),
 ('LastActivityDate', '2021-07-08T10:45:23.250'),
 ('Title', 'What is "backprop"?'),
 ('Tags', '<neural-networks><backpropagation><terminology><definitions>'),
 ('AnswerCount', '5'),
 ('CommentCount', '0'),
 ('FavoriteCount', '1'),
 ('ContentLicense', 'CC BY-SA 4.0')]

In [None]:
posts.count()

21604