In [8]:
import os
import sys
import pyspark.sql.functions as F
from pyspark.sql import Row
from pyspark.sql import SparkSession

In [9]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.12:0.17.0 pyspark-shell'
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark

In [10]:
postsData = spark.read.format('xml').option('rowTag', 'row').option("timestampFormat", 'y/M/d H:m:s').load('posts_sample.xml')

print(f"Elems: {postsData.count()}")
postsData.printSchema()
postsData.show()

dates = ("2010-01-01",  "2020-12-31")
posts_by_date = postsData.filter(F.col("_CreationDate").between(*dates))
posts_by_date.show()

Elems: 46006
root
 |-- _AcceptedAnswerId: long (nullable = true)
 |-- _AnswerCount: long (nullable = true)
 |-- _Body: string (nullable = true)
 |-- _ClosedDate: timestamp (nullable = true)
 |-- _CommentCount: long (nullable = true)
 |-- _CommunityOwnedDate: timestamp (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _FavoriteCount: long (nullable = true)
 |-- _Id: long (nullable = true)
 |-- _LastActivityDate: timestamp (nullable = true)
 |-- _LastEditDate: timestamp (nullable = true)
 |-- _LastEditorDisplayName: string (nullable = true)
 |-- _LastEditorUserId: long (nullable = true)
 |-- _OwnerDisplayName: string (nullable = true)
 |-- _OwnerUserId: long (nullable = true)
 |-- _ParentId: long (nullable = true)
 |-- _PostTypeId: long (nullable = true)
 |-- _Score: long (nullable = true)
 |-- _Tags: string (nullable = true)
 |-- _Title: string (nullable = true)
 |-- _ViewCount: long (nullable = true)

+-----------------+------------+--------------------+-----------

In [11]:
languagesData = spark.read.format("csv").option("header", True).option("inferSchema", True).load("programming-languages.csv").dropna(how="all")


In [12]:
print(f"\nLanguages_count: {languagesData.count()}")
print(f"\nFirst_lang:")
languagesData.show()
languagesData.printSchema()


Languages_count: 700

First_lang:
+------------+--------------------+
|        name|       wikipedia_url|
+------------+--------------------+
|     A# .NET|https://en.wikipe...|
|  A# (Axiom)|https://en.wikipe...|
|  A-0 System|https://en.wikipe...|
|          A+|https://en.wikipe...|
|         A++|https://en.wikipe...|
|        ABAP|https://en.wikipe...|
|         ABC|https://en.wikipe...|
|   ABC ALGOL|https://en.wikipe...|
|       ABSET|https://en.wikipe...|
|       ABSYS|https://en.wikipe...|
|         ACC|https://en.wikipe...|
|      Accent|https://en.wikipe...|
|    Ace DASL|https://en.wikipe...|
|        ACL2|https://en.wikipe...|
|     ACT-III|https://en.wikipe...|
|     Action!|https://en.wikipe...|
|ActionScript|https://en.wikipe...|
|         Ada|https://en.wikipe...|
|     Adenine|https://en.wikipe...|
|        Agda|https://en.wikipe...|
+------------+--------------------+
only showing top 20 rows

root
 |-- name: string (nullable = true)
 |-- wikipedia_url: string (nullab

Сформировать отчёт с информацией о 10 наиболее популярных языках программирования по итогам года за период с 2010 по 2020 годы. Отчёт будет отражать динамику изменения популярности языков программирования и представлять собой набор таблиц "топ-10" для каждого года.

In [18]:
def find_name(row, language_name):
    tag = next((name for name in language_name if name.lower() in str(row._Tags).lower()), 'None')
    return (row._CreationDate, tag)

languages_names=[str(n[0]) for n in languagesData.collect()]

languages_by_year = (posts_by_date.rdd.map(lambda x: find_name(x, languages_names))
                     .filter(lambda x: x[1] != 'None')
                     .keyBy(lambda row: (row[0].year, row[1]))
                     .aggregateByKey(0, lambda x, y: x + 1, lambda x1, x2: x1 + x2)
                     .sortBy(lambda x: x[1], ascending=False)
).collect()


list_by_years = []
for year in range(2010,2021):
    list_by_years.extend([(row[0][0], row[0][1], row[1]) for row in languages_by_year if row[0][0] == year][:10])


row_template = Row('Year', 'Language', 'Count')
df = spark.createDataFrame([row_template(*x) for x in list_by_years])
df.write.mode("overwrite").parquet("top_10_languages_by_years.parquet")
df.show()    

+----+--------+-----+
|Year|Language|Count|
+----+--------+-----+
|2010|       E| 1510|
|2010|       C|  296|
|2010|       B|  210|
|2010|       D|   65|
|2010|       L|   14|
|2010|     Arc|   10|
|2010|     ACC|    9|
|2010|       G|    9|
|2010|       J|    8|
|2010|     PHP|    7|
|2011|       E| 2497|
|2011|       C|  466|
|2011|       B|  336|
|2011|       D|  127|
|2011|       L|   29|
|2011|       G|   15|
|2011|       J|   14|
|2011|     PHP|   14|
|2011|     Arc|   10|
|2011|       F|    9|
+----+--------+-----+
only showing top 20 rows

