In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar xf spark-3.5.1-bin-hadoop3.tgz
!pip install -q findspark

In [None]:
import os
# Export the locations of the Java 8 JDK and of the unpacked Spark distribution
# as environment variables.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.5.1-bin-hadoop3",
})

In [None]:
import findspark
# findspark.init() runs before the pyspark import below; presumably it makes the
# pyspark package shipped under SPARK_HOME importable — confirm against the findspark docs.
findspark.init()
from pyspark.sql import SparkSession,functions,types,Window
# Get or create a Spark session with master "local[*]" and the spark-xml package
# requested through spark.jars.packages (the 'xml' format is used to load the posts).
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.14.0")
    .getOrCreate()
)
# A bare expression on the last line displays the session as the cell output.
spark

In [None]:
# Load the posts sample with the 'xml' data source; rowTag='row' names the XML
# element that is read as one record.
posts_data = (
    spark.read
    .format('xml')
    .option('rowTag', 'row')
    .load('posts_sample.xml')
)

In [None]:
# Keep only the posts whose creation year lies in the required range, 2010-2020 inclusive.
timed_posts = posts_data.where(
    functions.year(functions.col('_CreationDate')).between(2010, 2020)
)
timed_posts.show(5)

+-----------------+------------+--------------------+-----------+-------------+-------------------+--------------------+--------------+-------+--------------------+--------------------+----------------------+-----------------+-----------------+------------+---------+-----------+------+-----+------+----------+
|_AcceptedAnswerId|_AnswerCount|               _Body|_ClosedDate|_CommentCount|_CommunityOwnedDate|       _CreationDate|_FavoriteCount|    _Id|   _LastActivityDate|       _LastEditDate|_LastEditorDisplayName|_LastEditorUserId|_OwnerDisplayName|_OwnerUserId|_ParentId|_PostTypeId|_Score|_Tags|_Title|_ViewCount|
+-----------------+------------+--------------------+-----------+-------------+-------------------+--------------------+--------------+-------+--------------------+--------------------+----------------------+-----------------+-----------------+------------+---------+-----------+------+-----+------+----------+
|             NULL|        NULL|<p>No. (And more ...|       NULL|  

In [None]:
# Read the comma-separated list of programming languages; the first line is a header.
languages = (
    spark.read
    .option('header', True)
    .option('sep', ',')
    .csv('programming-languages.csv')
)
languages.show(5)

+----------+--------------------+
|      name|       wikipedia_url|
+----------+--------------------+
|   A# .NET|https://en.wikipe...|
|A# (Axiom)|https://en.wikipe...|
|A-0 System|https://en.wikipe...|
|        A+|https://en.wikipe...|
|       A++|https://en.wikipe...|
+----------+--------------------+
only showing top 5 rows



In [None]:
import re
import pyspark.sql.types as ptypes

def get_language_name(string):
  """Remove extra information from an entry of the language list.

  Everything from a space that is immediately followed by "(", ")" or an en dash
  up to the end of the line is dropped, e.g. "A# (Axiom)" -> "A#".

  Args:
    string: the raw language name.

  Returns:
    The name with the trailing qualifier removed; unchanged if there is none.
  """
  qualifier = re.compile(r' [\(\)–].*')
  return qualifier.sub('', string)
# Spark UDF that applies get_language_name to a string column.
unify_languages = functions.udf(get_language_name, ptypes.StringType())
# Collect the cleaned, lower-cased language names to the driver. They are kept in a
# frozenset because get_languages_from_tags tests membership once per tag of every post,
# and a hash lookup avoids rescanning the whole collection on each test.
language_names = frozenset(
    row['name'].lower()
    for row in languages.select(unify_languages(functions.lower("name")).alias("name")).collect()
)
def get_languages_from_tags(tag_string):
  """Extract the programming languages mentioned in a post's tag string.

  Args:
    tag_string: tags written as consecutive "<tag>" items; may be None or empty.

  Returns:
    A list of the tags, in order of appearance, that are present in
    language_names; an empty list when tag_string is None or empty.
  """
  if not tag_string:
    return []
  matched = []
  # Each tag is the text between a "<" and the next ">".
  for tag in re.findall(r'<([^<>]*?)>', tag_string):
    if tag in language_names:
      matched.append(tag)
  return matched

# Spark UDF wrapping get_languages_from_tags; it returns an array of strings.
languages_func = functions.udf(
    get_languages_from_tags,
    ptypes.ArrayType(ptypes.StringType()),
)
# For every post keep the creation year and the languages found in its tags, then
# emit one row per (year, language) mention and count the mentions.
rating_data = (
    timed_posts
    .select(
        functions.year('_CreationDate').alias('year'),
        languages_func(functions.col('_Tags')).alias('languages'),
    )
    .select('year', functions.explode('languages').alias('language'))
    .groupBy('year', 'language')
    .count()
)

# Popularity is measured by the number of posts about the language within a year.
# row_number() is only reproducible over a total order, so languages with equal counts
# are additionally ordered by name; otherwise their ranks (and which of them make the
# top 10) could change between runs.
ws = Window.partitionBy('year').orderBy(functions.desc('count'), functions.asc('language'))
top_result = (
    rating_data
    .withColumn('place', functions.row_number().over(ws))
    .filter(functions.col('place') <= 10)
)

top_result.show()

+----+-----------+-----+-----+
|year|   language|count|place|
+----+-----------+-----+-----+
|2010|         c#|   96|    1|
|2010|       java|   52|    2|
|2010|        php|   46|    3|
|2010| javascript|   44|    4|
|2010|        c++|   28|    5|
|2010|     python|   26|    6|
|2010|objective-c|   23|    7|
|2010|          c|   20|    8|
|2010|       ruby|   12|    9|
|2010|     delphi|    8|   10|
|2011|        php|  102|    1|
|2011|         c#|  100|    2|
|2011|       java|   93|    3|
|2011| javascript|   83|    4|
|2011|        c++|   42|    5|
|2011|     python|   37|    6|
|2011|objective-c|   34|    7|
|2011|          c|   24|    8|
|2011|       ruby|   20|    9|
|2011|       perl|    9|   10|
+----+-----------+-----+-----+
only showing top 20 rows



In [None]:
# Overwrite any previous output so the cell can be re-run; with the default save mode
# the write fails once 'languages_top10.parquet' already exists.
top_result.write.mode('overwrite').parquet('languages_top10.parquet')

In [None]:
# Read the saved ranking back and print one table per year.
read_parquet = spark.read.parquet("languages_top10.parquet")

for year in range(2010, 2021):
  # Sort by the stored rank so the printed order always agrees with the 'place' column,
  # including when several languages share the same count.
  read_parquet.filter(functions.col("year") == year).orderBy("place").show(truncate=False)

+----+-----------+-----+-----+
|year|language   |count|place|
+----+-----------+-----+-----+
|2010|c#         |96   |1    |
|2010|java       |52   |2    |
|2010|php        |46   |3    |
|2010|javascript |44   |4    |
|2010|c++        |28   |5    |
|2010|python     |26   |6    |
|2010|objective-c|23   |7    |
|2010|c          |20   |8    |
|2010|ruby       |12   |9    |
|2010|delphi     |8    |10   |
+----+-----------+-----+-----+

+----+-----------+-----+-----+
|year|language   |count|place|
+----+-----------+-----+-----+
|2011|php        |102  |1    |
|2011|c#         |100  |2    |
|2011|java       |93   |3    |
|2011|javascript |83   |4    |
|2011|c++        |42   |5    |
|2011|python     |37   |6    |
|2011|objective-c|34   |7    |
|2011|c          |24   |8    |
|2011|ruby       |20   |9    |
|2011|perl       |9    |10   |
+----+-----------+-----+-----+

+----+-----------+-----+-----+
|year|language   |count|place|
+----+-----------+-----+-----+
|2012|php        |154  |1    |
|2012|