In [1]:
import os
import sys
# Interpreter, Spark distribution and submit arguments used by this notebook.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 4 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark package and the py4j archive shipped under SPARK_HOME importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute Spark's shell bootstrap script in the notebook namespace (its banner reports
# that the session is then available as `spark`). The context manager closes the file
# handle deterministically, and compile() attaches the real filename to any traceback
# raised from inside the script.
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    exec(compile(shell_file.read(), shell_script, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [25]:
from pyspark import SparkConf
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StopWordsRemover
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark import Row
import json

# Default Spark configuration; no settings are overridden in this cell.
conf = SparkConf()

# getOrCreate() hands back an already-running session when one exists, otherwise it
# builds a new one from this configuration and application name.
spark = SparkSession.builder.config(conf=conf).appName("test_").getOrCreate()

In [146]:
# Bare expression: lets the notebook render the active SparkSession object.
spark

In [13]:
# Load the course catalogue from JSON into a DataFrame
# (columns shown by the preview cell: cat, desc, id, lang, name, provider).
dataset = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [9]:
# Preview the first two catalogue rows.
dataset.show(2)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 2 rows



In [3]:
# Target courses to produce recommendations for; each entry is [id, lang, name].
courses = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

In [4]:
# Ids of the target courses, in the order they are listed in `courses`.
ids = [course_id for course_id, _lang, _name in courses]
ids

[23126, 21617, 16627, 11556, 16704, 13702]

In [5]:
# Recommended courses must be in the same language as the target course.
languages = [lang for _course_id, lang, _name in courses]
languages

['en', 'en', 'es', 'es', 'ru', 'ru']

In [6]:
# Stage 1: split each course description ("desc") into a list of tokens ("words").
tokenizer = Tokenizer(inputCol="desc", outputCol="words")

In [106]:
# Sanity check: preview the tokenizer output on the first two rows.
tokenizer.transform(dataset).show(2)

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|               words|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
only showing top 2 rows



In [7]:
# Combined stop-word list for the three languages handled here, concatenated in the
# order Russian, English, Spanish.
stop_words = [
    word
    for language in ("russian", "english", "spanish")
    for word in StopWordsRemover.loadDefaultStopWords(language)
]

In [8]:
# Stage 2: drop stop words from the tokenizer output, using the combined word list.
swr = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="words_filtered",
    stopWords=stop_words,
)

In [9]:
# Stage 3: hash the filtered tokens into a 10000-dimensional term-frequency vector
# (binary=False keeps raw counts rather than presence flags).
hasher = HashingTF(
    numFeatures=10000,
    binary=False,
    inputCol=swr.getOutputCol(),
    outputCol="word_vector_freq",
)

In [11]:
# Stage 4: IDF re-weighting of the hashed term frequencies. The output column must be
# named "tfIdf" because the later join/similarity cells select "a.tfIdf" and "b.tfIdf".
# No visible cell defines `tfIdf`, so on a fresh kernel building the pipeline would
# raise NameError without this definition.
# NOTE(review): the original definition is not visible; default IDF parameters are
# assumed here - confirm.
tfIdf = IDF(inputCol=hasher.getOutputCol(), outputCol="tfIdf")

# Full feature pipeline: tokenize -> remove stop words -> hash to TF -> IDF.
pipeline = Pipeline(stages=[
    tokenizer,
    swr,
    hasher,
    tfIdf,
])

In [14]:
# Fit the pipeline on the whole catalogue.
model = pipeline.fit(dataset)

In [15]:
# Apply the fitted pipeline; the result carries the "tfIdf" vector column used below.
predictions = model.transform(dataset)

In [26]:
# Build all (target course, candidate course) pairs that share a language.
# Side "a" holds only the target courses listed in `ids`; side "b" is the full catalogue.
joined = (
    predictions
    .alias("a")
    # Keep only the target courses on the left side.
    .where(F.col("id").isin(ids))
    # Inner join on "lang": candidates must be in the same language as the target.
    .join(predictions.alias("b"), "lang", "inner")
    # A course must not be recommended to itself.
    .where("a.id != b.id")
    # Note: the result has two "id" and two "tfIdf" columns, told apart only by alias.
    .select("a.id", "b.id", "lang", "a.tfIdf", "b.tfIdf", "b.name")
    # Cached because the result is shown, counted and scored in the following cells.
    .cache()
)

In [27]:
# Preview the first ten candidate pairs.
joined.show(10)

+-----+---+----+--------------------+--------------------+--------------------+
|   id| id|lang|               tfIdf|               tfIdf|                name|
+-----+---+----+--------------------+--------------------+--------------------+
|21617|  4|  en|(10000,[213,360,5...|(10000,[36,42,63,...|Accounting Cycle:...|
|21617|  5|  en|(10000,[213,360,5...|(10000,[32,222,29...|American Counter ...|
|21617|  7|  en|(10000,[213,360,5...|(10000,[493,721,8...|Becoming a Dynami...|
|21617|  8|  en|(10000,[213,360,5...|(10000,[32,65,115...|           Bioethics|
|21617|  9|  en|(10000,[213,360,5...|(10000,[56,268,30...|College Foundatio...|
|21617| 10|  en|(10000,[213,360,5...|(10000,[1045,2044...|Digital Literacies I|
|21617| 11|  en|(10000,[213,360,5...|(10000,[87,157,15...|Digital Literacie...|
|21617| 12|  en|(10000,[213,360,5...|(10000,[161,164,8...|Digital Tools for...|
|21617| 13|  en|(10000,[213,360,5...|(10000,[26,1072,1...|Discover Your Val...|
|21617| 14|  en|(10000,[213,360,5...|(10

In [28]:
# Total number of (target, candidate) pairs.
joined.count()

54310

In [19]:
def cosineSimilarity(v, u):
    """Return the cosine similarity of vectors ``v`` and ``u`` as a Python float.

    Both arguments must provide ``dot(other)`` and ``norm(p)`` methods. The result is
    the dot product divided by the product of the two L2 norms. A zero-norm vector is
    not special-cased: whatever the division yields is passed to ``float()`` as-is.
    """
    dot_product = v.dot(u)
    norm_product = v.norm(2) * u.norm(2)
    return float(dot_product / norm_product)

In [20]:
# Wrap the Python function as a Spark UDF whose result column is FloatType.
cosineSimilarityUDF = F.udf(cosineSimilarity, FloatType())

In [29]:
# Score every (target, candidate) pair: "cos" is the cosine similarity of their TF-IDF vectors.
joined = joined.withColumn("cos", cosineSimilarityUDF(F.col("a.tfIdf"), F.col("b.tfIdf")))

In [30]:
# Preview the scored pairs.
joined.show(10)

+-----+---+----+--------------------+--------------------+--------------------+------------+
|   id| id|lang|               tfIdf|               tfIdf|                name|         cos|
+-----+---+----+--------------------+--------------------+--------------------+------------+
|21617|  4|  en|(10000,[213,360,5...|(10000,[36,42,63,...|Accounting Cycle:...| 0.050420523|
|21617|  5|  en|(10000,[213,360,5...|(10000,[32,222,29...|American Counter ...| 0.021780243|
|21617|  7|  en|(10000,[213,360,5...|(10000,[493,721,8...|Becoming a Dynami...| 0.005189441|
|21617|  8|  en|(10000,[213,360,5...|(10000,[32,65,115...|           Bioethics| 0.040991765|
|21617|  9|  en|(10000,[213,360,5...|(10000,[56,268,30...|College Foundatio...|  0.08992993|
|21617| 10|  en|(10000,[213,360,5...|(10000,[1045,2044...|Digital Literacies I| 0.016705962|
|21617| 11|  en|(10000,[213,360,5...|(10000,[87,157,15...|Digital Literacie...|  0.03036655|
|21617| 12|  en|(10000,[213,360,5...|(10000,[161,164,8...|Digital Tool

In [46]:
# Ranking inside each target course: by similarity (descending), then by candidate name
# (lexicographically ascending), then by candidate id (ascending).
ranking_window = Window.partitionBy("a.id").orderBy(F.desc("cos"), F.col("name"), F.col("b.id"))

# Top-10 recommendations per target course, collected to the driver as rows with
# fields "id" (target course) and "id_list" (recommended course ids, best first).
df_res = (
    joined
    # Drop pairs whose similarity is NaN.
    .where("cos != 'NaN'")
    .withColumn("rn", F.row_number().over(ranking_window))
    .where("rn <= 10")
    .groupBy("a.id")
    # collect_list does not guarantee element order after a shuffle, so the rank is
    # collected together with the id and the array is sorted by rank explicitly.
    .agg(
        F.sort_array(
            F.collect_list(F.struct(F.col("rn"), F.col("b.id").alias("rec_id")))
        ).alias("ranked")
    )
    # Extracting a field from an array of structs yields an array of that field.
    .select("id", F.col("ranked.rec_id").alias("id_list"))
    .collect()
)

In [48]:
# Result mapping: target course id -> list of recommended course ids.
json_data = {}

In [49]:
# Copy the collected rows into the result mapping (target id -> recommended ids).
json_data.update({record["id"]: record["id_list"] for record in df_res})

In [50]:
# Inspect the final recommendations before writing them out.
json_data

{23126: [13782, 13665, 24419, 20638, 2724, 25782, 2633, 2723, 13348, 15909],
 16627: [11431, 12660, 12247, 5687, 17964, 23515, 12863, 9598, 13550, 17961],
 13702: [864, 28074, 8300, 21079, 1110, 8313, 21025, 13057, 1144, 1041],
 16704: [1236, 1365, 8186, 1164, 18331, 796, 20308, 875, 8207, 8154],
 11556: [16488, 10447, 468, 22710, 19330, 13461, 10384, 23357, 13776, 21707],
 21617: [21609, 21608, 21616, 21492, 21624, 21623, 21630, 21628, 21508, 21703]}

In [51]:
# Persist the recommendations as JSON; the integer course ids used as dictionary keys
# are written as JSON strings.
with open("lab02.json", "w") as out_file:
    json.dump(json_data, out_file)

In [52]:
# Shut down the Spark session now that the results have been written.
spark.stop()