In [1]:
# Initialize the Spark context

from pyspark import SparkContext, SparkConf

# Application name, master URL ("local[4]") and the memory / Python-worker
# timeout properties are all set on one SparkConf before the context is built.
conf = (
    SparkConf()
    .setAppName("L2_Apache_Spark")
    .setMaster("local[4]")
    .set("spark.executor.memory", "2g")
    .set("spark.driver.memory", "2g")
    .set("spark.python.worker.timeout", "12000")
)

sc = SparkContext(conf=conf)

Загрузка данных

In [13]:
import xml.etree.ElementTree as ET
from typing import NamedTuple
from datetime import datetime
import re

In [79]:
postsData = sc.textFile("posts_sample.xml")
postsCount = postsData.count()

# Pair every line with its index and keep only indices 2 .. postsCount - 2,
# i.e. drop the first two lines and the last line (presumably the XML
# declaration and the opening/closing root tags - verify against the file),
# then parse each remaining line as a standalone XML element.
postsXML = (
    postsData
    .zipWithIndex()
    .filter(lambda indexed: 1 < indexed[1] < postsCount - 1)
    .map(lambda indexed: ET.fromstring(indexed[0]))
)

def initPosts(posts):
    """Convert parsed XML row elements into lightweight ``Post`` records.

    Written as a generator so it can be passed to ``RDD.mapPartitions``: it
    consumes an iterable of elements and lazily yields one ``Post`` for every
    element whose attributes could be parsed. Elements that cannot be parsed
    are skipped (best effort).

    Args:
        posts: iterable of objects exposing ``.get(name)`` (for example
            ``xml.etree.ElementTree.Element``). ``CreationDate`` must match
            ``%Y-%m-%dT%H:%M:%S.%f``; ``Tags`` is optional and, when present,
            has the form ``<tag1><tag2>``.

    Yields:
        Post: named tuple with ``creationDate`` (``datetime``) and ``tags``
        (list of tag names, empty when the element has no ``Tags`` value).
    """
    class Post(NamedTuple):
        # Record type produced by this generator.
        creationDate: datetime
        tags: list

    tagPattern = re.compile(r'<([^>]+)>')
    dateFormat = '%Y-%m-%dT%H:%M:%S.%f'

    for post in posts:
        try:
            tagsString = post.get("Tags")
            tags = tagPattern.findall(tagsString) if tagsString else []
            creationDate = datetime.strptime(post.get("CreationDate"), dateFormat)
        except (AttributeError, TypeError, ValueError):
            # Missing or malformed attribute: skip this row instead of failing
            # the whole partition.
            continue
        # The yield is deliberately outside the ``try`` and the handler lists
        # concrete exception types: a bare ``except`` around ``yield`` would
        # also swallow GeneratorExit (raised when the consumer closes the
        # generator early), which leads to
        # "RuntimeError: generator ignored GeneratorExit".
        yield Post(creationDate=creationDate, tags=tags)

# Turn every parsed XML element into a Post record, partition by partition.
posts = postsXML.mapPartitions(initPosts)

# Preview 10 posts sampled without replacement; the fixed seed makes the
# preview reproducible when the notebook is re-run.
posts.takeSample(False, 10, seed=42)

[Post(creationDate=datetime.datetime(2015, 6, 5, 11, 10, 2, 437000), tags=[]),
 Post(creationDate=datetime.datetime(2016, 6, 7, 7, 4, 23, 757000), tags=[]),
 Post(creationDate=datetime.datetime(2017, 5, 31, 16, 26, 36, 330000), tags=['html', 'css', 'google-chrome']),
 Post(creationDate=datetime.datetime(2014, 10, 20, 16, 46, 10, 690000), tags=[]),
 Post(creationDate=datetime.datetime(2010, 6, 4, 22, 13, 37, 667000), tags=[]),
 Post(creationDate=datetime.datetime(2016, 9, 16, 7, 28, 45, 427000), tags=[]),
 Post(creationDate=datetime.datetime(2013, 9, 5, 15, 10, 30, 883000), tags=['android', 'xamarin.android']),
 Post(creationDate=datetime.datetime(2016, 6, 9, 18, 44, 27, 877000), tags=[]),
 Post(creationDate=datetime.datetime(2018, 9, 4, 22, 17, 6, 277000), tags=[]),
 Post(creationDate=datetime.datetime(2010, 7, 25, 16, 14, 57, 767000), tags=[])]

In [80]:
programmingLanguagesData = sc.textFile("programming-languages.csv")
programmingLanguagesHeader = programmingLanguagesData.first()

# Drop every row equal to the header line, keep the text before the first
# comma of each remaining row and lower-case it.
# The list of programming languages is small, so collecting it to the driver
# is affordable.
programmingLanguages = (
    programmingLanguagesData
    .filter(lambda line: line != programmingLanguagesHeader)
    .map(lambda line: line.partition(",")[0].lower())
    .collect()
)

programmingLanguages[:10]

['a# .net',
 'a# (axiom)',
 'a-0 system',
 'a+',
 'a++',
 'abap',
 'abc',
 'abc algol',
 'abset',
 'absys']

Объединение постов по годам

In [84]:
# Pair every post with the calendar year of its creation date: (year, post).
postsByYear = posts.keyBy(lambda p: p.creationDate.year)

Подсчет количества постов для каждого года для каждого языка

In [82]:
def seqFunc(pldict, post):
    """Fold one post into a per-language counter.

    Args:
        pldict: dict mapping a language name to the number of posts counted
            so far. It is not modified.
        post: object with a ``tags`` iterable of tag names.

    Returns:
        A copy of ``pldict`` in which the count of every tag of ``post`` that
        appears in the module-level ``programmingLanguages`` collection has
        been increased by one.
    """
    counts = pldict.copy()
    for name in post.tags:
        if name not in programmingLanguages:
            # Not a known programming language: ignore the tag.
            continue
        counts[name] = counts.get(name, 0) + 1
    return counts

def combFunc(lhs, rhs):
    """Merge two per-language counters by summing their counts key-wise.

    Args:
        lhs: dict mapping a language name to a count. Not modified.
        rhs: dict mapping a language name to a count. Not modified.

    Returns:
        A new dict holding every key of ``lhs`` and ``rhs`` with the sum of
        both counts (a missing key counts as 0). Keys keep insertion order:
        all keys of ``lhs`` first, then the keys only present in ``rhs``.
    """
    # Start from a copy of lhs and add rhs on top. Unlike iterating over a
    # set union of the keys, this gives a key order that does not depend on
    # string hashing, so later stable sorts see the same order on every run.
    result = dict(lhs)
    for key, count in rhs.items():
        result[key] = result.get(key, 0) + count
    return result

# For every year build a dict {language: number of posts}: seqFunc folds a
# single post into a counter, combFunc merges two counters; the starting
# value for each key is an empty dict.
programmingLanguagesCountedByYear = postsByYear.aggregateByKey(
    {},
    seqFunc,
    combFunc,
)

Создание сводки топ-10 по годам

In [85]:
# Rank each year's languages by post count (descending) and keep the names of
# the ten most frequent ones, then order the result by year.
# Ties in the count are broken alphabetically so that the ranking (and which
# language makes the cut) does not depend on dict iteration order.
programmingLanguagesTop = programmingLanguagesCountedByYear \
  .mapValues(lambda pldict: [lang for lang, _ in sorted(pldict.items(), key=lambda t: (-t[1], t[0]))[:10]]) \
  .sortByKey()

Создание файла с отчетом

In [86]:
from pyspark.sql.types import StructType, StructField, ArrayType, IntegerType, StringType
from pyspark.sql import SparkSession

# Session for the DataFrame API, using the same application name as the
# SparkContext configuration.
ss = (
    SparkSession.builder
    .appName("L2_Apache_Spark")
    .getOrCreate()
)

# Report layout: an integer year and an array of language names, both nullable.
schema = StructType([
    StructField('year', IntegerType(), nullable=True),
    StructField('top_languages', ArrayType(StringType()), nullable=True),
])

langTopDF = programmingLanguagesTop.toDF(schema)

# Write the report as Parquet, replacing any previous output at that path.
(
    langTopDF.write
    .mode("overwrite")
    .parquet("top_10_languages_by_year.parquet")
)

langTopDF.show(truncate=False)

+----+----------------------------------------------------------------------------+
|year|top_languages                                                               |
+----+----------------------------------------------------------------------------+
|2008|[java, ruby, javascript, c, groovy, x++, python, io, php]                   |
|2009|[java, python, php, javascript, ruby, delphi, objective-c, c, haskell, bash]|
|2010|[java, php, javascript, python, objective-c, c, ruby, delphi, bash, perl]   |
|2011|[php, java, javascript, python, objective-c, c, ruby, perl, delphi, bash]   |
|2012|[php, javascript, java, python, objective-c, ruby, c, bash, r, scala]       |
|2013|[javascript, php, java, python, objective-c, c, ruby, r, bash, scala]       |
|2014|[javascript, java, php, python, objective-c, c, r, ruby, bash, matlab]      |
|2015|[javascript, java, php, python, r, c, objective-c, ruby, matlab, scala]     |
|2016|[javascript, java, php, python, r, c, ruby, bash, scala, matlab]      

In [87]:
# Shut down the Spark context and the Spark session used by this notebook.
sc.stop()
ss.stop()