In [1]:
import os
import sys

# Point PySpark at the cluster's Python interpreter and Spark installation
# before anything from pyspark is loaded.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

# SPARK_HOME is assigned just above, so this guard only trips if that
# assignment is removed or emptied.
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark package and its bundled py4j importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run the PySpark shell bootstrap script in this namespace. The file is read
# inside a `with` block so the handle is closed deterministically, and it is
# compiled with its real path so tracebacks point at shell.py.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_path, 'exec'))

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

# getOrCreate() returns an already-running session if one exists.
spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("test")
         .getOrCreate())

spark

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


# Загрузка данных

In [2]:
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, ArrayType
from pyspark.ml.linalg import VectorUDT

In [3]:
# Every field of the course records is read as a plain string column.
_string_columns = ("id", "desc", "cat", "name", "lang", "provider")
schema = StructType(
    [StructField(column_name, StringType()) for column_name in _string_columns]
)

In [86]:
# Load the course records as JSON, applying the explicit schema instead of inferring one.
courses = spark.read.schema(schema).json('/labs/slaba02/DO_record_per_line.json')

In [87]:
# Sanity check: print the column names and types of the loaded catalogue.
courses.printSchema()

root
 |-- id: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- cat: string (nullable = true)
 |-- name: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- provider: string (nullable = true)



In [88]:
# Courses to build recommendations for, one row each: [course id, language, text].
# The text is stored in the `desc` column; it looks like a course title and is
# replaced later by the catalogue description.
target_courses = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

# Borrow the (id, lang, desc) fields from the catalogue schema so both frames
# use the same column types.
target_schema = courses.select('id', 'lang', 'desc').schema
target_df = spark.createDataFrame(target_courses, schema=target_schema)
target_df.show(truncate=False)

+-----+----+------------------------------------------------------------------------------+
|id   |lang|desc                                                                          |
+-----+----+------------------------------------------------------------------------------+
|23126|en  |Compass - powerful SASS library that makes your life easier                   |
|21617|en  |Preparing for the AP* Computer Science A Exam — Part 2                        |
|16627|es  |Aprende Excel: Nivel Intermedio by Alfonso Rinsche                            |
|11556|es  |Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo|
|16704|ru  |Программирование на Lazarus                                                   |
|13702|ru  |Математическая экономика                                                      |
+-----+----+------------------------------------------------------------------------------+



In [89]:
# Words dropped from the tokenised text before vectorisation.
# The original list contained 'a' twice; each word is now listed once.
stop_words = ['by', 'el', 'a', 'the', 'to', 'на', 'that', 'for']

# Импорт необходимых ML-библиотек

In [90]:
from pyspark.ml.linalg import VectorUDT, DenseVector
from pyspark.sql.types import StructType, StructField, DoubleType
from pyspark.sql import functions as f
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, CountVectorizer
import re

# Обработка текстовых описаний

In [91]:
# A token is a run of two or more word characters (letters, digits, underscore).
# Raw-string literal so that `\w` / `\d` are not treated as string escapes.
regex = re.compile(r'[\w\d]{2,}', re.U)


def word_tokenizer(text, stopwords=None):
    """Split ``text`` into lower-cased tokens with stop words removed.

    Args:
        text: Value to tokenise. It is converted with ``str()`` first;
            ``None`` (a null column value) yields an empty list instead of
            the literal token ``'none'``.
        stopwords: Optional collection of words to drop. Defaults to the
            notebook-level ``stop_words``.

    Returns:
        A list of tokens in their original order. Text without any token
        yields ``[]`` rather than a list holding one empty string.
    """
    if text is None:
        return []
    if stopwords is None:
        stopwords = stop_words
    tokens = regex.findall(str(text).lower())
    return [word for word in tokens if word not in stopwords]

# Spark UDF wrapper around word_tokenizer; it returns an array of strings per row.
udf_word_tokenizer = f.udf(word_tokenizer, returnType=ArrayType(StringType()))

In [92]:
# courses = courses.withColumn('desc', f.concat(courses.desc, courses.name))

In [197]:
# Replace the text column of both frames with its token list.
# NOTE: this cell overwrites `target_df` and `courses`; running it a second
# time would feed the already-tokenised arrays back into the tokenizer.
target_df = target_df.withColumn('desc', udf_word_tokenizer(f.col('desc')))
courses = courses.withColumn('desc', udf_word_tokenizer(f.col('desc')))

# Look each target course up in the catalogue by id and keep the catalogue's
# tokenised description in place of the text the target row started with.
courses_for_join = courses.select(
    f.col('id'),
    f.col('lang').alias('lang_2'),
    f.col('desc').alias('desc_2'),
)
target_df_joined = target_df.join(courses_for_join, on='id')
target_df = target_df_joined.select('id', 'lang', f.col('desc_2').alias('desc'))

In [198]:
# Term-frequency stage: hash the tokens in `desc` into a 10000-dimensional vector.
tf = HashingTF(
    inputCol='desc',
    outputCol='features_tf',
    numFeatures=10000,
)
# Inverse-document-frequency stage applied on top of the hashed term frequencies.
idf = IDF(
    inputCol='features_tf',
    outputCol='features_tfidf',
    minDocFreq=1,
)

In [199]:
# Hash the tokenised descriptions of the whole catalogue into term-frequency vectors.
tf_courses = tf.transform(courses)
# Fit the IDF weights on the catalogue; the fitted model is reused for the target courses.
idf_courses = idf.fit(tf_courses)
# Apply the fitted IDF model to obtain TF-IDF vectors for every catalogue course.
tfidf_courses = idf_courses.transform(tf_courses)
tfidf_courses.printSchema()

root
 |-- id: string (nullable = true)
 |-- desc: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- cat: string (nullable = true)
 |-- name: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- provider: string (nullable = true)
 |-- features_tf: vector (nullable = true)
 |-- features_tfidf: vector (nullable = true)



In [200]:
# Vectorise the target courses with the same HashingTF stage used for the catalogue.
tf_target = tf.transform(target_df)
# Weight them with the IDF model fitted on the catalogue (no refit on the targets).
tfidf_target = idf_courses.transform(tf_target)
tfidf_target.printSchema()

root
 |-- id: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- desc: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features_tf: vector (nullable = true)
 |-- features_tfidf: vector (nullable = true)



In [201]:
# Preview up to ten rows of the vectorised target courses.
tfidf_target.show(n=10)

+-----+----+--------------------+--------------------+--------------------+
|   id|lang|                desc|         features_tf|      features_tfidf|
+-----+----+--------------------+--------------------+--------------------+
|23126|  en|[improve, your, s...|(10000,[87,91,128...|(10000,[87,91,128...|
|21617|  en|[an, introduction...|(10000,[17,128,16...|(10000,[17,128,16...|
|16627|  es|[hazte, más, empl...|(10000,[55,76,192...|(10000,[55,76,192...|
|11556|  es|[la, transformaci...|(10000,[249,522,5...|(10000,[249,522,5...|
|16704|  ru|[курсе, рассматри...|(10000,[381,1144,...|(10000,[381,1144,...|
|13702|  ru|[математическая, ...|(10000,[310,942,2...|(10000,[310,942,2...|
+-----+----+--------------------+--------------------+--------------------+



In [202]:
def cos_sim(target, u):
    """Return the cosine similarity of two vectors as a Python float.

    Args:
        target: Vector exposing ``dot(other)`` and ``norm(p)``.
        u: Vector with the same interface as ``target``.

    Returns:
        ``dot(target, u) / (||target||_2 * ||u||_2)`` as ``float``, or
        ``None`` when either vector is missing or has zero length. In those
        cases the similarity is undefined, and returning ``None`` avoids a
        division by zero.
    """
    if target is None or u is None:
        return None
    denominator = target.norm(2) * u.norm(2)
    if denominator == 0:
        return None
    return float(target.dot(u) / denominator)

# Spark UDF wrapper around cos_sim; it yields one double per row.
udf_cos_sim = f.udf(cos_sim, returnType=DoubleType())

In [203]:
# Display the DataFrame repr (column names and types) of the vectorised targets.
tfidf_target

DataFrame[id: string, lang: string, desc: array<string>, features_tf: vector, features_tfidf: vector]

In [204]:
# Keep only what the comparison needs, renaming the target columns so they do
# not clash with the catalogue columns after the join.
# NOTE: this cell overwrites `tfidf_target` and `tfidf_courses`; after it has
# run, `tfidf_target` no longer has the `id` column selected here.
tfidf_target = tfidf_target.select(
    f.col('id').alias('target_id'),
    'lang',
    f.col('features_tfidf').alias('target_features'),
)
tfidf_courses = tfidf_courses.select('id', 'lang', 'features_tfidf', 'name')

# Pair every target course with every catalogue course in the same language.
joined = tfidf_target.join(tfidf_courses, on='lang', how='inner')

In [205]:
# Preview the first five target/catalogue pairs.
joined.show(n=5)

+----+---------+--------------------+-----+--------------------+--------------------+
|lang|target_id|     target_features|   id|      features_tfidf|                name|
+----+---------+--------------------+-----+--------------------+--------------------+
|  en|    23126|(10000,[87,91,128...|16308|(10000,[505,1387,...|Up and Running wi...|
|  en|    23126|(10000,[87,91,128...|16309|(10000,[128,201,9...|Up and Running wi...|
|  en|    23126|(10000,[87,91,128...|16310|(10000,[505,706,1...|Up and Running wi...|
|  en|    23126|(10000,[87,91,128...|16311|(10000,[281,1089,...|Up and Running wi...|
|  en|    23126|(10000,[87,91,128...|16312|(10000,[1239,1445...|Up and Running wi...|
+----+---------+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



In [206]:
# Cosine similarity between the target vector and each catalogue vector.
similarity_col = udf_cos_sim('target_features', "features_tfidf").alias('similarity')

# Score every pair, drop rows with a missing value, and cache the result
# because it is queried once per target course below.
result_df = (
    joined
    .select('target_id', 'id', 'lang', 'name', similarity_col)
    .dropna()
    .cache()
)

In [207]:
# Preview the first five scored pairs.
result_df.show(n=5)

+---------+-----+----+--------------------+--------------------+
|target_id|   id|lang|                name|          similarity|
+---------+-----+----+--------------------+--------------------+
|    23126|16308|  en|Up and Running wi...|0.010768667796949046|
|    23126|16309|  en|Up and Running wi...|0.004397617582649494|
|    23126|16310|  en|Up and Running wi...|0.018393138569793823|
|    23126|16311|  en|Up and Running wi...| 0.00758436221389505|
|    23126|16312|  en|Up and Running wi...|5.733964272051451E-4|
+---------+-----+----+--------------------+--------------------+
only showing top 5 rows



In [208]:
# Collect the distinct target course ids present in the scored pairs to the driver.
targets = (
    result_df
    .select('target_id')
    .distinct()
    .collect()
)

In [209]:
# For every target course, collect its ten most similar catalogue courses.
# The loop walks the collected rows directly instead of assuming there are
# exactly six targets, so a target missing from `result_df` cannot cause an
# IndexError.
result = dict()
for target_row in targets:
    target_id = target_row['target_id']
    is_same_target = result_df['target_id'] == target_id
    is_other_course = result_df['id'] != target_id  # never recommend the course itself
    result[target_id] = (
        result_df[is_same_target & is_other_course]
        .dropDuplicates(['name'])  # keep one row per course name
        .select('id', 'similarity')
        .sort(result_df.similarity.desc(), result_df.id.asc(), result_df.name.asc())
        .limit(10)
        .collect()
    )

In [211]:
# Reduce each target's recommendation rows to a plain list of course ids.
# Iterating over the rows themselves (each list is already capped at ten by the
# query that produced it) avoids an IndexError when a target has fewer than
# ten recommendations.
result_f = {
    key: [row['id'] for row in rows]
    for key, rows in result.items()
}

In [212]:
import json

In [213]:
# Persist the recommendations as JSON in the working directory.
with open('lab02.json', mode='w') as output_file:
    output_file.write(json.dumps(result_f))