In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [61]:
import findspark

findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark.sql as sql
from pyspark.sql.types import DoubleType, IntegerType, ArrayType, StringType
from pyspark.sql.functions import udf, col, max, sum, countDistinct, explode, size
from math import sin, cos, sqrt, atan2, radians
import re

spark = SparkSession.builder.master("local[*]").config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.13.0").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark

In [57]:
pl_df = spark.read.csv('programming-languages.csv', header=True, inferSchema=True, sep=",")
pl_df.show(5)
pl_df.printSchema()

langs = [str(t['name']).lower() for t in pl_df.select('name').collect()]
print(langs)

+----------+--------------------+
|      name|       wikipedia_url|
+----------+--------------------+
|   A# .NET|https://en.wikipe...|
|A# (Axiom)|https://en.wikipe...|
|A-0 System|https://en.wikipe...|
|        A+|https://en.wikipe...|
|       A++|https://en.wikipe...|
+----------+--------------------+
only showing top 5 rows

root
 |-- name: string (nullable = true)
 |-- wikipedia_url: string (nullable = true)

['a# .net', 'a# (axiom)', 'a-0 system', 'a+', 'a++', 'abap', 'abc', 'abc algol', 'abset', 'absys', 'acc', 'accent', 'ace dasl', 'acl2', 'act-iii', 'action!', 'actionscript', 'ada', 'adenine', 'agda', 'agilent vee', 'agora', 'aimms', 'alef', 'alf', 'algol 58', 'algol 60', 'algol 68', 'algol w', 'alice', 'alma-0', 'ambienttalk', 'amiga e', 'amos', 'ampl', 'apex (salesforce.com)', 'apl', "app inventor for android's visual block language", 'applescript', 'arc', 'arexx', 'argus', 'aspectj', 'assembly language', 'ats', 'ateji px', 'autohotkey', 'autocoder', 'autoit', 'autolisp / vi

In [25]:
posts_df = spark.read.format('xml').options(rowTag="row").load('posts_sample.xml')
posts_df.show(5)
posts_df.printSchema()

+-----------------+------------+--------------------+-----------+-------------+--------------------+--------------------+--------------+---+--------------------+--------------------+----------------------+-----------------+-----------------+------------+---------+-----------+------+--------------------+--------------------+----------+
|_AcceptedAnswerId|_AnswerCount|               _Body|_ClosedDate|_CommentCount| _CommunityOwnedDate|       _CreationDate|_FavoriteCount|_Id|   _LastActivityDate|       _LastEditDate|_LastEditorDisplayName|_LastEditorUserId|_OwnerDisplayName|_OwnerUserId|_ParentId|_PostTypeId|_Score|               _Tags|              _Title|_ViewCount|
+-----------------+------------+--------------------+-----------+-------------+--------------------+--------------------+--------------+---+--------------------+--------------------+----------------------+-----------------+-----------------+------------+---------+-----------+------+--------------------+--------------------+-

# Задание
Сформировать отчёт с информацией о 10 наиболее популярных языках программирования по итогам года за период с 2010 по 2020 годы. Отчёт будет отражать динамику изменения популярности языков программирования и представлять собой набор таблиц "топ-10" для каждого года.

Получившийся отчёт сохранить в формате Apache Parquet.

Для выполнения задания вы можете использовать любую комбинацию Spark API: RDD API, Dataset API, SQL API.

In [62]:
def get_tags(tags_string):
    if not tags_string:
        return []
    pattern = r'<([^>]+)>'
    tags = re.findall(pattern, tags_string)
    res = []
    for tag in tags:
        if tag.lower() in langs:
            res.append(tag.lower())
    return res

def get_year(date_and_time):
    return date_and_time.year

get_tags_udf = udf(get_tags, ArrayType(StringType()))
get_year_udf = udf(get_year, IntegerType())

posts_data_simplified = posts_df \
    .withColumn("tags", get_tags_udf(posts_df["_Tags"])) \
    .withColumn("year", get_year_udf(posts_df["_LastActivityDate"])) \
    .select("tags", "year", col("_ViewCount").alias("views")) \
    .filter(size("tags") > 0)

first_rows = posts_data_simplified.collect()

for i, row in enumerate(first_rows):
    print(i + 1, row)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2819 Row(tags=['java', 'awk'], year=2013, views=1430)
2820 Row(tags=['r'], year=2013, views=149)
2821 Row(tags=['javascript', 'php'], year=2013, views=258)
2822 Row(tags=['powershell'], year=2013, views=1483)
2823 Row(tags=['java'], year=2013, views=554)
2824 Row(tags=['objective-c'], year=2014, views=19)
2825 Row(tags=['bash', 'perl'], year=2014, views=101)
2826 Row(tags=['php'], year=2014, views=38)
2827 Row(tags=['java'], year=2016, views=6562)
2828 Row(tags=['java'], year=2018, views=1857)
2829 Row(tags=['javascript'], year=2014, views=63)
2830 Row(tags=['php'], year=2017, views=34)
2831 Row(tags=['java'], year=2014, views=886)
2832 Row(tags=['php'], year=2015, views=48)
2833 Row(tags=['python'], year=2019, views=3633)
2834 Row(tags=['r'], year=2014, views=550)
2835 Row(tags=['php'], year=2014, views=678)
2836 Row(tags=['java'], year=2015, views=2265)
2837 Row(tags=['php'], year=2014, views=93)
2838 Row(tags=['java'],

In [79]:
results = {}

for i in range(2010, 2021):
    posts_data_sorted = posts_data_simplified \
    .select("year", explode("tags").alias("tag"), "views") \
    .filter(f"year = {i}") \
    .groupBy("year", "tag") \
    .agg(sum("views").alias("total_views")) \
    .orderBy(col('total_views').desc())
    results[i] = posts_data_sorted

In [80]:
for year in results.keys():
    results[year].write.format("parquet").save(f"top_{year}")

In [83]:
print(results[2010])
print(results[2019])

+----+------------+-----------+
|year|         tag|total_views|
+----+------------+-----------+
|2010|        java|      53333|
|2010|      matlab|      51865|
|2010| objective-c|      43878|
|2010|         php|      39730|
|2010|  javascript|      37059|
|2010|      python|      25930|
|2010|        ruby|      15864|
|2010|           c|      13810|
|2010|      delphi|       7680|
|2010|           r|       7499|
|2010|       xpath|       5122|
|2010|actionscript|       5104|
|2010|         sed|       4573|
|2010|        perl|       3515|
|2010|     haskell|       2601|
|2010|        bash|       2171|
|2010|       mouse|       1789|
|2010|      racket|       1539|
|2010|      scheme|       1539|
|2010| applescript|       1462|
+----+------------+-----------+
only showing top 20 rows

+----+-----------+-----------+
|year|        tag|total_views|
+----+-----------+-----------+
|2019|     python|    3408726|
|2019|        php|    1513426|
|2019| javascript|     877473|
|2019|       java|  