In [None]:
# устанавим pyspark и добавим необходимые библиотеки
!pip install pyspark



In [None]:
# Импортирую библиотеки
import pyspark
import os
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StructType, StructField, StringType, DateType

In [None]:
# Mount Google Drive so the data files under /content/drive are reachable
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Spark configuration.
# The submit arguments request the spark-xml package; they are set before the
# session is created (presumably they are only read when the JVM starts — confirm).
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.12:0.13.0 pyspark-shell'
# NOTE: despite its name, `sc` holds a SparkSession, not a SparkContext.
sc = SparkSession.builder.appName("L2").master("local[*]").getOrCreate()
# Display the session object to check the setup
sc

In [None]:
# Language names: the CSV is read without a header option, so its first row
# (the column title) arrives as a data row and is removed in the next cell.
programming_languages_list = sc.read.csv("/content/drive/MyDrive/BigData/My_work/LR2/Data/programming-languages.csv")
# Posts: XML file parsed with one record per <row> element.
posts_sample = sc.read.format("xml").options(rowTag="row").load('/content/drive/MyDrive/BigData/My_work/LR2/Data/posts_sample.xml')

In [None]:
# First column of every collected CSV row, converted to a plain string.
programming_languages = list(map(lambda row: str(row[0]), programming_languages_list.collect()))
# Drop the first element: it is the header value "name", not a language.
del programming_languages[0]
# Preview the first five names.
programming_languages[:5]

['A# .NET', 'A# (Axiom)', 'A-0 System', 'A+', 'A++']

In [None]:
def detectionProgLanguage(x):
    """Find the first known programming language among a post's tags.

    A language matches when its lower-cased name, wrapped in angle brackets
    (``<name>``), occurs in the lower-cased ``x._Tags`` string. Languages are
    tried in the order of the module-level ``programming_languages`` list.

    Args:
        x: A post record exposing ``_Id`` and ``_Tags`` attributes.

    Returns:
        ``(x._Id, language)`` with the language spelled as in
        ``programming_languages``, or ``None`` when nothing matches or the
        post has no tags.
    """
    if x._Tags is None:
        return None

    # Lower-case the tag string once instead of once per candidate language.
    tags = x._Tags.lower()
    for language in programming_languages:
        if "<" + language.lower() + ">" in tags:
            return (x._Id, language)
    return None

In [None]:
def is_year_date(x, year):
    """Tell whether a post was created during the given calendar year.

    Args:
        x: A post record whose ``_CreationDate`` is comparable with
            ``datetime`` objects.
        year: Calendar year to test against.

    Returns:
        ``True`` when ``x._CreationDate`` lies in
        ``[Jan 1 of year, Jan 1 of year + 1)``, otherwise ``False``.
    """
    start = datetime(year=year, month=1, day=1)
    # Exclusive upper bound: a closing bound of Dec 31 00:00 would drop every
    # post created later on the last day of the year.
    next_year_start = datetime(year=year + 1, month=1, day=1)
    return start <= x._CreationDate < next_year_start

In [None]:
# Top-10 programming languages per year: {year: DataFrame(PL, Mentioned_in_<year>)}.
pl_by_year = {}

for year in range(2010, 2020):
    count_column = f"Mentioned_in_{year}"

    # `year=year` freezes the current loop value inside the lambda; Spark
    # evaluates lazily, and a plain closure would read `year` at run time.
    language_counts = posts_sample.rdd \
        .filter(lambda x, year=year: x._Tags is not None and is_year_date(x, year)) \
        .map(detectionProgLanguage) \
        .filter(lambda x: x is not None) \
        .map(lambda x: (x[1], 1)) \
        .reduceByKey(lambda x, y: x + y)

    # Explicit schema: no inference from the first row, so a year without any
    # match produces an empty DataFrame rather than an error.
    # Sorting on the DataFrame right before `limit` makes the top-10 selection
    # independent of partition order.
    pl_by_year[year] = language_counts \
        .toDF(f"PL string, {count_column} bigint") \
        .orderBy(col(count_column).desc()) \
        .limit(10)

    pl_by_year[year].show()

+-----------+-----------------+
|         PL|Mentioned_in_2010|
+-----------+-----------------+
|       Java|               52|
| JavaScript|               44|
|        PHP|               42|
|     Python|               25|
|Objective-C|               22|
|          C|               20|
|       Ruby|               11|
|     Delphi|                7|
|          R|                3|
|       Bash|                3|
+-----------+-----------------+

+-----------+-----------------+
|         PL|Mentioned_in_2011|
+-----------+-----------------+
|        PHP|               97|
|       Java|               92|
| JavaScript|               82|
|     Python|               35|
|Objective-C|               33|
|          C|               24|
|       Ruby|               17|
|     Delphi|                8|
|       Perl|                8|
|       Bash|                7|
+-----------+-----------------+

+-----------+-----------------+
|         PL|Mentioned_in_2012|
+-----------+-----------------+
|     

In [None]:
# Persist each yearly top-10 table as parquet; paths that already exist are left alone.
for year, top_languages in pl_by_year.items():
    save_path = f"/content/drive/MyDrive/BigData/My_work/LR2/Data/the_best_in{year}"

    if not os.path.exists(save_path):
        top_languages.write.parquet(save_path)
        print(f"Сохранено в файл {save_path}.")
        continue

    print(f"Файл {save_path} уже существует.")

Сохранено в файл /content/drive/MyDrive/BigData/My_work/LR2/Data/the_best_in2010.
Сохранено в файл /content/drive/MyDrive/BigData/My_work/LR2/Data/the_best_in2011.
Сохранено в файл /content/drive/MyDrive/BigData/My_work/LR2/Data/the_best_in2012.
Сохранено в файл /content/drive/MyDrive/BigData/My_work/LR2/Data/the_best_in2013.
Сохранено в файл /content/drive/MyDrive/BigData/My_work/LR2/Data/the_best_in2014.
Сохранено в файл /content/drive/MyDrive/BigData/My_work/LR2/Data/the_best_in2015.
Сохранено в файл /content/drive/MyDrive/BigData/My_work/LR2/Data/the_best_in2016.
Сохранено в файл /content/drive/MyDrive/BigData/My_work/LR2/Data/the_best_in2017.
Сохранено в файл /content/drive/MyDrive/BigData/My_work/LR2/Data/the_best_in2018.
Сохранено в файл /content/drive/MyDrive/BigData/My_work/LR2/Data/the_best_in2019.
