In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--packages com.databricks:spark-csv_2.10:1.2.0 pyspark-shell'
os.environ["PYSPARK_PYTHON"]='python3'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

# Locate the Spark installation and make its bundled Python packages importable.
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))

# The py4j archive name embeds a version that changes between Spark releases,
# so discover whatever archive is shipped instead of pinning one version.
# Fall back to the previously hard-coded name when nothing is found.
py4j_lib_dir = os.path.join(spark_home, 'python', 'lib')
py4j_archives = sorted(
    name
    for name in (os.listdir(py4j_lib_dir) if os.path.isdir(py4j_lib_dir) else [])
    if name.startswith('py4j-') and name.endswith('-src.zip')
)
py4j_archive = py4j_archives[-1] if py4j_archives else 'py4j-0.10.4-src.zip'
sys.path.insert(0, os.path.join(py4j_lib_dir, py4j_archive))

# Execute PySpark's interactive bootstrap script in this namespace (the cell
# output reports that it makes a SparkSession available as `spark`).
# The file is opened in a `with` block so the handle is closed deterministically.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0
      /_/

Using Python version 3.6.4 (default, Jan 28 2018 00:00:00)
SparkSession available as 'spark'.


[The author's github-->](https://github.com/osboo)

# Imports

In [2]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf, desc, lit

from pyspark.sql.types import *

In [3]:
from math import sqrt

import math

In [4]:
import json
from json import encoder
#encoder.FLOAT_REPR = lambda o: format(o, '.4f')

In [5]:
import pandas as pd

# Input data

In [6]:
# Read the tab-separated ratings file; no schema is supplied, so Spark assigns
# positional column names.
u_data = spark.read.option('sep', '\t').csv('/labs/lab08data/u.data')

**u_data format**:

user id | item id | rating | timestamp

**u_item format**

movie id | movie title | release date | video release date |
IMDb URL | unknown | Action | Adventure | Animation |
Children's | Comedy | Crime | Documentary | Drama | Fantasy |
Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
Thriller | War | Western |

The last 19 fields are the genres, a 1 indicates the movie
is of that genre, a 0 indicates it is not; movies can be in
several genres at once.
The movie ids are the ones used in the u.data data set.


In [7]:
# Replace the positional CSV column names with descriptive ones.
rating_column_exprs = [
    "_c0 as user_id",
    "_c1 as item_id",
    "_c2 as rating",
    "_c3 as timestamp",
]
u_data = u_data.selectExpr(*rating_column_exprs)

In [8]:
u_data.show(n=5)

+-------+-------+------+---------+
|user_id|item_id|rating|timestamp|
+-------+-------+------+---------+
|    196|    242|     3|881250949|
|    186|    302|     3|891717742|
|     22|    377|     1|878887116|
|    244|     51|     2|880606923|
|    166|    346|     1|886397596|
+-------+-------+------+---------+
only showing top 5 rows



In [9]:
# Read the pipe-separated movie catalogue; columns get positional names.
u_item = spark.read.option('sep', '|').csv('/labs/lab08data/u.item')

In [10]:
# Keep only the first two catalogue columns, under descriptive names.
movie_column_exprs = [
    "_c0 as movie_id",
    "_c1 as movie_title",
]
u_item = u_item.selectExpr(*movie_column_exprs)

In [11]:
u_item.show(n=5)

+--------+-----------------+
|movie_id|      movie_title|
+--------+-----------------+
|       1| Toy Story (1995)|
|       2| GoldenEye (1995)|
|       3|Four Rooms (1995)|
|       4|Get Shorty (1995)|
|       5|   Copycat (1995)|
+--------+-----------------+
only showing top 5 rows



In [12]:
target_id = 450

# Задача

## Часть 1. Основные характеристики данных MovieLens:

Найдите количество всех пользователей и количество всех фильмов в данных (общее число пользователей и фильмов в датасете).

In [13]:
# Number of distinct users that appear in the ratings table.
distinct_user_ids = u_data.select('user_id').distinct()
all_users_number = distinct_user_ids.count()

In [14]:
all_users_number

943

In [15]:
# Number of distinct movies listed in the catalogue.
distinct_movie_ids = u_item.select('movie_id').distinct()
all_films_number = distinct_movie_ids.count()

In [16]:
all_films_number

1682

Сколько пользователь в среднем ставит рейтингов? Подсчитать количество рейтингов / количество пользователей. Поле average_user_ratings.

In [17]:
# Total number of rating rows in the dataset.
all_ratings_number = u_data.select('rating').count()

In [18]:
all_ratings_number

100000

In [19]:
# Mean number of ratings per user: ratings / users, as a float.
average_user_ratings = float(all_ratings_number) / all_users_number

In [20]:
average_user_ratings

106.04453870625663

Сколько фильм в среднем имеет рейтингов? Подсчитать количество рейтингов / количество фильмов. Поле average_film_ratings.

In [21]:
# Mean number of ratings per movie: ratings / movies, as a float.
average_film_ratings = float(all_ratings_number) / all_films_number

In [22]:
average_film_ratings

59.45303210463734

Найдите долю заполненных ячеек в данных (значение от 0 до 1, не в процентах): количество рейтингов / (количество пользователей * количество фильмов). Поле completeness.

In [23]:
# Share of filled cells in the user x movie rating matrix.
# The two divisions are kept in the original order so the float result is unchanged.
completeness = float(all_ratings_number) / all_users_number / all_films_number

In [24]:
completeness

0.06304669364224533

## Часть 2. User-user CF:

Для каждого пользователя найдите его средний рейтинг (сумма рейтингов пользователя/количество рейтингов пользователя). Здесь Ia — множество фильмов, по которым у пользователя есть рейтинги rui. Здесь и далее |Ia| обозначает количество элементов в множестве Ia.

<img src="https://camo.githubusercontent.com/51b4bdca1c1bfcd4f9f6228da4d63eb017a920a6/687474703a2f2f646174612e6e657770726f6c61622e636f6d2f7075626c69632d6e657770726f6c61622d636f6d2f6c61626130385f725f615f6176672e706e67" alt="Drawing" style="width: 100px;"/>

In [25]:
# Per-user mean rating over every movie that the user rated.
ratings_by_user = u_data.groupBy(u_data.user_id)
average_rating = ratings_by_user.agg(
    F.avg(u_data.rating).alias('avg_rating')
)

In [26]:
average_rating.show(n=5)

+-------+------------------+
|user_id|        avg_rating|
+-------+------------------+
|    296|4.1768707482993195|
|    467|3.6818181818181817|
|    691|           4.21875|
|    675|3.7058823529411766|
|    829|          3.546875|
+-------+------------------+
only showing top 5 rows



In [29]:
# Pull the target user's mean rating to the driver as a plain Python value.
target_average_rows = average_rating.filter(
    average_rating['user_id'] == target_id
).take(1)
average_rating_target_value = target_average_rows[0]['avg_rating']

In [30]:
average_rating_target_value

3.8648148148148147

Для выданного вам пользователя (ID в личном кабинете):

Посчитайте меру близости Пирсона выданного вам пользователя со всеми остальными пользователями. Обратите внимание, что корреляция Пирсона считается только на пересечении, то есть вклад вносят только фильмы, оцененные совместно (Ia, Iu — множества оцененных пользователями a и u фильмов). Корреляция с константой (ситуация, когда у пользователя все оценки одинаковые) равна нулю.

<img src="https://camo.githubusercontent.com/9cf99dc94ec99d4f501f07230404d22473ba6fea/687474703a2f2f646174612e6e657770726f6c61622e636f6d2f7075626c69632d6e657770726f6c61622d636f6d2f6c61626130385f70656172736f6e2e706e67" alt="Drawing" style="width: 400px;"/>

In [31]:
# Ratings given by the target user, with the rating column renamed so it can
# sit next to other users' ratings after a join.
target_user_ratings = (
    u_data
    .filter(u_data.user_id == target_id)
    .select(u_data.item_id, u_data.rating.alias('target_rating'))
)

In [32]:
target_user_ratings.show(n=5)

+-------+-------------+
|item_id|target_rating|
+-------+-------------+
|    470|            5|
|    783|            3|
|   1147|            4|
|    100|            4|
|     58|            3|
+-------+-------------+
only showing top 5 rows



In [33]:
target_user_ratings.select(target_user_ratings.item_id).distinct().count()

540

In [34]:
# Collect the ids of the movies rated by the target user into a driver-side list.
target_item_rows = target_user_ratings.select(target_user_ratings.item_id).distinct().collect()
target_items = [item_row.item_id for item_row in target_item_rows]

In [35]:
# Flag each rating row with 1 when its movie was also rated by the target user,
# 0 otherwise. Only the listed columns are kept.
is_in_target_flag = F.when(u_data.item_id.isin(target_items), 1).otherwise(0)
u_data = u_data.select(
    'user_id',
    'item_id',
    'rating',
    is_in_target_flag.alias('is_in_target'),
)

In [36]:
u_data.show(n=5)

+-------+-------+------+------------+
|user_id|item_id|rating|is_in_target|
+-------+-------+------+------------+
|    196|    242|     3|           0|
|    186|    302|     3|           1|
|     22|    377|     1|           0|
|    244|     51|     2|           1|
|    166|    346|     1|           0|
+-------+-------+------+------------+
only showing top 5 rows



In [37]:
# Keep only ratings of movies that the target user has rated as well.
u_data_common = u_data.where(u_data.is_in_target == 1)

In [38]:
# Attach the target user's rating of the same movie to every co-rated rating row.
# The result has one row per (user, co-rated movie) with both the user's rating
# and the target user's rating.
# NOTE(review): both sides appear to derive from the same ratings frame, so the
# column-object references below rely on how Spark resolves same-lineage
# columns in a join -- confirm before refactoring this expression.
u_data_common = u_data_common.join(target_user_ratings,
                      on=u_data_common.item_id == target_user_ratings.item_id,
                      how='inner').select(u_data_common.user_id,
                                          u_data_common.item_id,
                                          u_data_common.rating,
                                          target_user_ratings.target_rating)

In [39]:
u_data_common.show(n=5)

+-------+-------+------+-------------+
|user_id|item_id|rating|target_rating|
+-------+-------+------+-------------+
|    186|    302|     3|            5|
|    244|     51|     2|            4|
|    298|    474|     4|            5|
|    115|    265|     2|            5|
|    253|    465|     5|            4|
+-------+-------+------+-------------+
only showing top 5 rows



In [40]:
# Add each user's mean rating and the (constant) target user's mean rating
# to the co-rated rows; this is the input for the Pearson terms.
s = (
    u_data_common
    .join(average_rating, on='user_id')
    .withColumn('avg_rating_target', lit(average_rating_target_value))
)

In [41]:
s.show(n=5)

+-------+-------+------+-------------+------------------+------------------+
|user_id|item_id|rating|target_rating|        avg_rating| avg_rating_target|
+-------+-------+------+-------------+------------------+------------------+
|    186|    302|     3|            5|3.4130434782608696|3.8648148148148147|
|    244|     51|     2|            4|3.6512605042016806|3.8648148148148147|
|    298|    474|     4|            5| 4.031496062992126|3.8648148148148147|
|    115|    265|     2|            5|3.9347826086956523|3.8648148148148147|
|    253|    465|     5|            4| 3.979381443298969|3.8648148148148147|
+-------+-------+------+-------------+------------------+------------------+
only showing top 5 rows



In [42]:
# Numerator of the Pearson formula: sum of products of mean-centred ratings.
# This is an unevaluated aggregate expression; it is used inside groupBy().agg().
centred_target_rating = s['target_rating'] - s['avg_rating_target']
centred_user_rating = s['rating'] - s['avg_rating']
covariance = F.sum(centred_target_rating * centred_user_rating)

In [43]:
# Square root of the summed squared deviations of the target user's ratings
# (an aggregate expression, evaluated per group later).
target_rating_deviation = s['target_rating'] - s['avg_rating_target']
sigma_target = F.sqrt(F.sum(target_rating_deviation * target_rating_deviation))

In [44]:
# Square root of the summed squared deviations of the other user's ratings
# (an aggregate expression, evaluated per group later).
user_rating_deviation = s['rating'] - s['avg_rating']
sigma_user = F.sqrt(F.sum(user_rating_deviation * user_rating_deviation))

In [45]:
# Pearson similarity between the target user and every user, computed over
# the co-rated movies present in `s`.
pearson_ratio = covariance / sigma_target / sigma_user
pearson_df = s.groupBy('user_id').agg(pearson_ratio.alias('corr_with_target'))

In [46]:
pearson_df.show(n=5)

+-------+-------------------+
|user_id|   corr_with_target|
+-------+-------------------+
|    296|  0.321953212360758|
|    467| 0.3133498222995453|
|    691|0.45668991285499855|
|    675| 0.5375077311703701|
|    829|0.09159053662753726|
+-------+-------------------+
only showing top 5 rows



# The task defines the correlation with a constant rater as zero. For such a
# user the denominator is 0, and Spark SQL division by zero produces NULL
# (not NaN), so both NULL and NaN have to be mapped to 0.0 here.
corr_with_target = pearson_df['corr_with_target']
pearson_df = pearson_df.select(
    'user_id',
    F.when(corr_with_target.isNull() | F.isnan(corr_with_target), 0.0)
     .otherwise(corr_with_target)
     .alias('corr_with_target'))

In [47]:
pearson_df.sort(desc('corr_with_target')).show(n=5)

+-------+------------------+
|user_id|  corr_with_target|
+-------+------------------+
|    450|0.9999999999999998|
|    531|0.8150616162958754|
|    596|0.7839397071861639|
|    631|0.7657052985051976|
|    544| 0.734355793439825|
+-------+------------------+
only showing top 5 rows



Посчитайте поправочный коэффициент для корреляции Пирсона на нехватку данных:

<img src="https://camo.githubusercontent.com/3422e0424837cd1ca9b6ff3ccf556ae3fbdb4875/687474703a2f2f646174612e6e657770726f6c61622e636f6d2f7075626c69632d6e657770726f6c61622d636f6d2f6c61626130385f7265675f636f65662e706e67" alt="Drawing" style="width: 200px;"/>

In [48]:
# For every user, count how many of their rated movies the target user also
# rated (sum of the 0/1 `is_in_target` flag).
common_movie_count = F.sum('is_in_target').alias('intersection_power')
intersection_power = u_data.groupBy('user_id').agg(common_movie_count)

In [49]:
intersection_power.show(n=5)

+-------+------------------+
|user_id|intersection_power|
+-------+------------------+
|    296|                95|
|    467|                20|
|    691|                27|
|    675|                17|
|    829|                32|
+-------+------------------+
only showing top 5 rows



In [50]:
def calculate_missing_data_coeff(intersection_power, threshold=50):
    """Return the shrinkage coefficient that penalises small rating overlaps.

    The coefficient is ``min(intersection_power / threshold, 1.0)``.

    Args:
        intersection_power: Number of movies rated by both users. ``None``
            (a missing value) is treated as no overlap.
        threshold: Overlap size at which the coefficient saturates at 1.0.
            Defaults to 50, the value that was previously hard-coded.

    Returns:
        A float no greater than 1.0; 0.0 when the input is ``None`` or the
        ratio is NaN.
    """
    # A missing value would otherwise raise TypeError in the arithmetic below.
    if intersection_power is None:
        return 0.0
    result = min(intersection_power * 1.0 / threshold, 1.0)
    if math.isnan(result):
        return 0.0
    return result

In [51]:
# Wrap the Python helper as a Spark UDF that returns a double column.
calculate_missing_data_coeff_udf = udf(
    calculate_missing_data_coeff,
    returnType=DoubleType(),
)

In [52]:
# Per-user correction coefficient derived from the overlap size.
missing_data_coeff = calculate_missing_data_coeff_udf('intersection_power').alias('coeff')
data_miss_coeffs = intersection_power.select('user_id', missing_data_coeff)

In [53]:
data_miss_coeffs.show(n=5)

+-------+-----+
|user_id|coeff|
+-------+-----+
|    296|  1.0|
|    467|  0.4|
|    691| 0.54|
|    675| 0.34|
|    829| 0.64|
+-------+-----+
only showing top 5 rows



In [54]:
# Multiply each Pearson similarity by the user's overlap-based coefficient.
adjusted_corr_expr = (data_miss_coeffs.coeff * pearson_df.corr_with_target).alias('adjusted_corr')
pearson_adjusted = (
    pearson_df
    .join(data_miss_coeffs, on='user_id')
    .select('user_id', adjusted_corr_expr)
)

In [55]:
pearson_adjusted.show(n=5)

+-------+-------------------+
|user_id|      adjusted_corr|
+-------+-------------------+
|    296|  0.321953212360758|
|    467|0.12533992891981813|
|    691|0.24661255294169923|
|    675|0.18275262859792585|
|    829|0.05861794344162385|
+-------+-------------------+
only showing top 5 rows



Найдите 30 ближайших пользователей-соседей данного пользователя (pearson_neighbours), используя поправленную корреляцию Пирсона

<img src="https://camo.githubusercontent.com/3eb5eaf1fe854c9e9f3769d0693cd39925a4eee4/687474703a2f2f646174612e6e657770726f6c61622e636f6d2f7075626c69632d6e657770726f6c61622d636f6d2f6c61626130385f70656172736f6e5f7265675f636f65662e706e67" alt="Drawing" style="width: 600px;"/>

In [56]:
# The 30 users most similar to the target user by adjusted Pearson correlation.
# The target user is removed BEFORE ranking: taking the top 31 and dropping the
# target afterwards only yields 30 rows when the target happens to be among
# those 31, and would silently return 31 neighbours otherwise.
pearson_neighbours = (
    pearson_adjusted
    .filter(pearson_adjusted['user_id'] != target_id)
    .sort(desc('adjusted_corr'))
    .limit(30)
)

In [57]:
pearson_neighbours.show(n=30)

+-------+-------------------+
|user_id|      adjusted_corr|
+-------+-------------------+
|    676| 0.5821028120925269|
|    717| 0.4789577727155448|
|     37| 0.4657835535024892|
|    392| 0.4562689486068854|
|    913|0.45023721265337413|
|    223| 0.4489437695459474|
|    600| 0.4485246966206234|
|    721| 0.4444237073588581|
|    838|0.44249581559698503|
|    933| 0.4358564231732711|
|     22| 0.4320161621327069|
|    610| 0.4306770303685678|
|    654| 0.4265634199155747|
|    615| 0.4254387883213246|
|     82|0.41657913205322117|
|    922|0.41352743092168226|
|    323| 0.4066081544307002|
|    715|0.40444251565511524|
|    806| 0.4030703460284211|
|    826| 0.4027635095201348|
|    394| 0.4013196832641033|
|    773|0.40082392641382486|
|    663|  0.399404974853165|
|     26|0.39627067473094857|
|     70|0.39150914724756347|
|    445|0.38976260680486696|
|    106|0.38752111202261746|
|    464| 0.3847159354040692|
|    836| 0.3820278139930449|
|    830|0.37233666915504304|
+-------+-------------------+

Дальше мы будем использовать только этих соседей для прогноза. Заметим, что, строя предсказание для некоторого фильма, мы из фиксированных 30 ближайших пользователей-соседей выбираем только тех, у кого есть оценки для данного фильма, то есть используемых пользователей-соседей для прогноза может быть сильно меньше.

Для всех фильмов найдите прогноз оценки по формуле ниже. Здесь N(a) — множество пользователей-соседей, s(a,u) — мера близости пользователей из предыдущих пунктов, |s(a,u)| — модуль меры близости

<img src="https://camo.githubusercontent.com/eb7ac8c08051ade220eab5843a26f2a7717e01a9/687474703a2f2f646174612e6e657770726f6c61622e636f6d2f7075626c69632d6e657770726f6c61622d636f6d2f6c61626130385f757365725f757365725f63662e706e67" alt="Drawing" style="width: 300px;"/>

In [58]:
u_item.show(n=5)

+--------+-----------------+
|movie_id|      movie_title|
+--------+-----------------+
|       1| Toy Story (1995)|
|       2| GoldenEye (1995)|
|       3|Four Rooms (1995)|
|       4|Get Shorty (1995)|
|       5|   Copycat (1995)|
+--------+-----------------+
only showing top 5 rows



In [59]:
# Movies from the catalogue that the target user has not rated:
# flag every movie (0 = rated by target, 1 = not), keep the 1s, drop the flag.
is_unrated_flag = F.when(u_item.movie_id.isin(target_items), 0).otherwise(1)
unrated_films = (
    u_item
    .select('movie_id', 'movie_title', is_unrated_flag.alias('is_unrated'))
    .filter('is_unrated = 1')
    .select('movie_id', 'movie_title')
)

In [60]:
unrated_films.show(n=5)

+--------+--------------------+
|movie_id|         movie_title|
+--------+--------------------+
|       5|      Copycat (1995)|
|       6|Shanghai Triad (Y...|
|       8|         Babe (1995)|
|       9|Dead Man Walking ...|
|      14|  Postino, Il (1994)|
+--------+--------------------+
only showing top 5 rows



In [61]:
u_data.show(n=5)

+-------+-------+------+------------+
|user_id|item_id|rating|is_in_target|
+-------+-------+------+------------+
|    196|    242|     3|           0|
|    186|    302|     3|           1|
|     22|    377|     1|           0|
|    244|     51|     2|           1|
|    166|    346|     1|           0|
+-------+-------+------+------------+
only showing top 5 rows



In [62]:
average_rating.filter(average_rating['user_id'] == target_id).take(1)[0]

Row(user_id='450', avg_rating=3.8648148148148147)

In [63]:
# Build the input for the user-user prediction: one row per rating that a
# neighbour gave to a movie the target user has not rated, carrying the
# neighbour's rating, adjusted similarity and mean rating, plus the target
# user's mean rating as a constant column.
# NOTE(review): u_data, average_rating and pearson_neighbours all appear to
# derive from the same ratings frame, and the joins mix column-object
# conditions with the name-based on='user_id' -- the result depends on how
# Spark resolves these same-lineage columns, so confirm before reordering.
t = unrated_films.join(
    u_data, on=u_data['item_id'] == unrated_films['movie_id'] # keep only ratings of movies unrated by the target
).join(
    average_rating, on=average_rating['user_id'] == u_data['user_id'] # attach the rating author's mean rating
).join(
    pearson_neighbours, on='user_id' # inner join: keep only ratings made by the selected neighbours
).select('movie_id',
         'movie_title',
         pearson_neighbours['user_id'].alias('neighbour_id'),
         'rating',
         pearson_neighbours['adjusted_corr'].alias('corr'),
         'avg_rating'
).withColumn('avg_rating_target', lit(average_rating_target_value))

In [64]:
t.show(n=5)

+--------+--------------------+------------+------+-------------------+------------------+------------------+
|movie_id|         movie_title|neighbour_id|rating|               corr|        avg_rating| avg_rating_target|
+--------+--------------------+------------+------+-------------------+------------------+------------------+
|     377| Heavyweights (1994)|          22|     1| 0.4320161621327069|         3.3515625|3.8648148148148147|
|    1134|Get on the Bus (1...|          82|     2|0.41657913205322117|3.0714285714285716|3.8648148148148147|
|     930|Chain Reaction (1...|          26|     2|0.39627067473094857|  2.94392523364486|3.8648148148148147|
|     746|  Real Genius (1985)|          70|     3|0.39150914724756347|3.4656488549618323|3.8648148148148147|
|    1015|       Shiloh (1997)|          26|     3|0.39627067473094857|  2.94392523364486|3.8648148148148147|
+--------+--------------------+------------+------+-------------------+------------------+------------------+
only showing top 5 rows

In [65]:
# User-user CF prediction per movie:
#   target mean + sum(corr * (rating - neighbour mean)) / sum(|corr|)
# The normaliser uses the absolute similarity, as the task formula specifies
# |s(a,u)|; a plain sum would shrink or flip the denominator as soon as a
# neighbour with negative similarity is involved.
# movie_id is cast to integer afterwards so that it sorts numerically.
r = t.groupBy('movie_id').agg((
    F.first(t['avg_rating_target'])
    + F.sum(t['corr'] * (t['rating'] - t['avg_rating'])) / F.sum(F.abs(t['corr']))
).alias('rating_prediction')
).withColumn('movie_id', t['movie_id'].cast(IntegerType()))

In [66]:
r.show()

+--------+------------------+
|movie_id| rating_prediction|
+--------+------------------+
|     829|2.8796296296296298|
|     675| 4.400332586665641|
|     944|2.4815812818806835|
|     919|4.3257379118475825|
|    1265| 3.329100529100529|
|     666| 2.000737144911902|
|    1528|3.8796296296296298|
|     124|3.7613844867700865|
|     447|2.7346950909469507|
|     475| 4.096542080232443|
|     718| 4.746535244922342|
|     740| 2.793386243386243|
|     544| 4.155982123262637|
|     581| 2.329100529100529|
|     577|2.1162945835168916|
|     334|2.2453616454139285|
|     975| 2.702675788603072|
|     743|2.4815812818806835|
|    1008|3.4045702310771757|
|     886|3.3338083416820083|
+--------+------------------+
only showing top 20 rows



In [67]:
# Ten best predictions: highest predicted rating first, ties broken by the
# smaller movie id.
pearson_top10 = r.orderBy(
    col('rating_prediction').desc(),
    col('movie_id').asc(),
).limit(10)

In [68]:
user_user_advise = pearson_top10.toPandas()
user_user_advise

Unnamed: 0,movie_id,rating_prediction
0,1591,5.87963
1,1367,5.585312
2,1529,5.585312
3,19,5.342088
4,990,5.329101
5,1021,5.284317
6,916,5.280399
7,856,5.221958
8,888,5.155137
9,1137,5.155137


In [69]:
# Neighbour ids as plain Python ints (the column holds strings, as the Row
# repr of user_id='450' above shows).
neighbour_rows = pearson_neighbours.select('user_id').collect()
neighbours = [int(neighbour_row.user_id) for neighbour_row in neighbour_rows]

In [70]:
# Recommended movie ids, in ranking order.
top10_rows = pearson_top10.select('movie_id').collect()
advise = [top10_row.movie_id for top10_row in top10_rows]

In [71]:
# Assemble the answer payload that is written to the result file.
d = dict(
    average_film_ratings=average_film_ratings,
    average_user_ratings=average_user_ratings,
    completeness=completeness,
    pearson_neighbours=neighbours,
    pearson_top10=advise,
)

In [72]:
advise

[1591, 1367, 1529, 19, 990, 1021, 916, 856, 888, 1137]

In [73]:
# Serialise the answer payload to the result file one directory up.
with open('../lab08.json', 'w') as result_file:
    result_file.write(json.dumps(d))

## Часть 3. Базовые предикторы:

Глобальное среднее 𝞵 (average_rating) по всему датасету. Сумма всех оценок по всем фильмам / Количество всех оценок по всем фильмам.

In [74]:
# Global mean rating: sum of all ratings divided by the number of ratings.
global_mean_expr = (F.sum('rating') * 1.0 / all_ratings_number).alias('mu')
mu = u_data.select(global_mean_expr).collect()[0].mu

In [75]:
mu

3.52986

Базовый предиктор для каждого пользователя (суммирование по фильмам, оцененным данным пользователем). Здесь Ia — множество фильмов, по которым у пользователя есть рейтинги, а |Ia| — их количество.

<img src="https://camo.githubusercontent.com/850e16e503d0516733266579d1fdb8ed5cb13ad9/687474703a2f2f646174612e6e657770726f6c61622e636f6d2f7075626c69632d6e657770726f6c61622d636f6d2f6c6162613038735f626173655f752e706e67" alt="Drawing" style="width: 300px;"/>

In [76]:
# Regularised per-user baseline: sum(rating - mu) / (number of ratings + 10).
# The expression is built in the original operand order to keep the
# floating-point result unchanged.
user_rating_count = F.count('item_id')
user_residual_sum = F.sum(u_data['rating'] - mu)
bu = u_data.groupBy('user_id').agg(
    (1.0 / (user_rating_count + 10) * user_residual_sum).alias('bu')
)

In [77]:
bu.show(n=5)

+-------+--------------------+
|user_id|                  bu|
+-------+--------------------+
|    296|  0.6057998726114655|
|    467| 0.12381777777777758|
|    691|  0.5248685714285717|
|    675|  0.1360172727272726|
|    829|0.014715675675675487|
+-------+--------------------+
only showing top 5 rows



Базовый предиктор для каждого фильма (суммирование по пользователям, поставившим оценку данному фильму). Здесь Ui — множество пользователей, которые оценили данный фильм, а |Ui| — их количество.

<img src="https://camo.githubusercontent.com/1b01851726c5f7c5e20b373f8a26e858193643e3/687474703a2f2f646174612e6e657770726f6c61622e636f6d2f7075626c69632d6e657770726f6c61622d636f6d2f6c6162613038735f626173655f692e706e67" alt="Drawing" style="width: 300px;"/>

In [78]:
# Regularised per-movie baseline:
#   sum(rating - bu - mu) / (number of ratings + 25),
# where bu is the baseline of the user who gave each rating.
ratings_with_user_bias = u_data.join(bu, on='user_id')
item_rating_count = F.count('user_id')
item_residual_sum = F.sum(u_data['rating'] - bu['bu'] - mu)
bi = ratings_with_user_bias.groupBy('item_id').agg(
    (1.0 / (item_rating_count + 25) * item_residual_sum).alias('bi')
)

In [79]:
bi.show(n=5)

+-------+--------------------+
|item_id|                  bi|
+-------+--------------------+
|    829| -0.4270513400199922|
|   1436|-0.04295878652633463|
|    467| 0.16690701840272296|
|    691| 0.05546248651815944|
|   1090| -0.5930556738185061|
+-------+--------------------+
only showing top 5 rows



Базовый предиктор для каждого пользователя и каждого фильма:

<img src="https://camo.githubusercontent.com/2feb726fa37d1bddbb5cfd55d02bb104cc514840/687474703a2f2f646174612e6e657770726f6c61622e636f6d2f7075626c69632d6e657770726f6c61622d636f6d2f6c6162613038735f626173655f75692e706e67" alt="Drawing" style="width: 300px;"/>

In [80]:
%%time
# Baseline prediction for every existing rating: mu + bu + bi.
ratings_with_biases = u_data.join(bu, on='user_id').join(bi, on='item_id')
base_predictors = (
    ratings_with_biases
    .withColumn('mu', lit(mu))
    .withColumn('base_predictor', col('mu') + col('bu') + col('bi'))
)

CPU times: user 988 µs, sys: 1.67 ms, total: 2.66 ms
Wall time: 78.5 ms


In [81]:
%%time
base_predictors.show(n=5)

+-------+-------+------+------------+--------------------+-------------------+-------+------------------+
|item_id|user_id|rating|is_in_target|                  bu|                 bi|     mu|    base_predictor|
+-------+-------+------+------------+--------------------+-------------------+-------+------------------+
|   1090|     56|     3|           0| 0.10617350253807144|-0.5930556738185061|3.52986|3.0429778287195655|
|   1090|    303|     1|           0|-0.16083449392712493|-0.5930556738185061|3.52986|2.7759698322543693|
|   1090|    268|     2|           0| -0.5703966863905363|-0.5930556738185061|3.52986| 2.366407639790958|
|   1090|    145|     2|           0|-0.18845325153374187|-0.5930556738185061|3.52986|2.7483510746477524|
|   1090|    320|     3|           0|  0.1792778048780491|-0.5930556738185061|3.52986|3.1160821310595432|
+-------+-------+------+------------+--------------------+-------------------+-------+------------------+
only showing top 5 rows

CPU times: user 0 ns,

In [82]:
%%time
base_predictors[base_predictors.user_id == target_id].sort(col('item_id').cast(IntegerType())).show()

+-------+-------+------+------------+------------------+--------------------+-------+------------------+
|item_id|user_id|rating|is_in_target|                bu|                  bi|     mu|    base_predictor|
+-------+-------+------+------------+------------------+--------------------+-------+------------------+
|      1|    450|     4|           1|0.3288647272727238| 0.29292488601631045|3.52986| 4.151649613289035|
|      2|    450|     4|           1|0.3288647272727238|-0.21303169608613212|3.52986|3.6456930311865916|
|      3|    450|     4|           1|0.3288647272727238| -0.3202843059213391|3.52986| 3.538440421351385|
|      4|    450|     3|           1|0.3288647272727238|-0.01991182388493691|3.52986| 3.838812903387787|
|      7|    450|     4|           1|0.3288647272727238|  0.2330837545391526|3.52986| 4.091808481811876|
|     10|    450|     4|           1|0.3288647272727238| 0.20804857270047783|3.52986| 4.066773299973201|
|     11|    450|     5|           1|0.3288647272727238

## Часть 4. Item-item CF:

In [83]:
target_id

450

Вычесть из всех рейтингов rui базовый предиктор bui из пункта 4, часть 3 (для всей таблицы рейтингов). Если рейтинга нет, то можно поставить 0.

In [84]:
# Residual rating R = rating - baseline prediction, keeping only the columns
# needed for the item-item similarity.
residual_rating = col('rating') - col('base_predictor')
unbiased_ratings = (
    base_predictors
    .withColumn('R', residual_rating)
    .select('item_id', 'user_id', 'rating', 'R')
)

In [85]:
unbiased_ratings.show(n=5)

+-------+-------+------+--------------------+
|item_id|user_id|rating|                   R|
+-------+-------+------+--------------------+
|   1090|     56|     3|-0.04297782871956546|
|   1090|    303|     1| -1.7759698322543693|
|   1090|    268|     2|-0.36640763979095814|
|   1090|    145|     2| -0.7483510746477524|
|   1090|    320|     3|-0.11608213105954324|
+-------+-------+------+--------------------+
only showing top 5 rows



Найдите попарные меры близости (косинус) для всех фильмов, используя очищенные оценки из пункта 1, часть 4. Суммирование идет по всем пользователям.

<img src="https://camo.githubusercontent.com/c54dcc01e0cb1f2f1b314f18b71ccc58477d0eca/687474703a2f2f646174612e6e657770726f6c61622e636f6d2f7075626c69632d6e657770726f6c61622d636f6d2f6c6162613038735f636f73696e655f6974656d732e706e67" alt="Drawing" style="width: 300px;"/>

In [86]:
%%time
# Euclidean norm of each movie's residual-rating vector (over all its raters).
squared_residual = unbiased_ratings['R'] * unbiased_ratings['R']
item_norms = unbiased_ratings.groupBy('item_id').agg(
    F.sqrt(F.sum(squared_residual)).alias('R_norm')
)

CPU times: user 292 µs, sys: 4.68 ms, total: 4.97 ms
Wall time: 56.9 ms


In [87]:
%%time
item_norms.show(n=5)

+-------+------------------+
|item_id|            R_norm|
+-------+------------------+
|   1090| 6.819856286480186|
|   1159| 3.759215459245026|
|   1436|0.8965977998605799|
|   1512|3.5276459597877206|
|   1572| 0.824192410668307|
+-------+------------------+
only showing top 5 rows

CPU times: user 3.12 ms, sys: 0 ns, total: 3.12 ms
Wall time: 1.99 s


In [88]:
%%time
# Attach each movie's residual norm to its rating rows.
# The cell overwrites its own input, so it is guarded: without the check,
# running it a second time would append another R_norm column and break the
# five-name toDF(...) calls that follow.
if 'R_norm' not in unbiased_ratings.columns:
    unbiased_ratings = unbiased_ratings.join(item_norms, on='item_id')

CPU times: user 594 µs, sys: 0 ns, total: 594 µs
Wall time: 88.8 ms


In [89]:
%%time
unbiased_ratings.show(n=5)

+-------+-------+------+--------------------+-----------------+
|item_id|user_id|rating|                   R|           R_norm|
+-------+-------+------+--------------------+-----------------+
|   1090|     56|     3|-0.04297782871956546|6.819856286480186|
|   1090|    303|     1| -1.7759698322543693|6.819856286480186|
|   1090|    268|     2|-0.36640763979095814|6.819856286480186|
|   1090|    145|     2| -0.7483510746477524|6.819856286480186|
|   1090|    320|     3|-0.11608213105954324|6.819856286480186|
+-------+-------+------+--------------------+-----------------+
only showing top 5 rows

CPU times: user 0 ns, sys: 1.99 ms, total: 1.99 ms
Wall time: 2.19 s


In [90]:
%%time
unbiased_ratings.select('item_id').distinct().count()

CPU times: user 0 ns, sys: 1.26 ms, total: 1.26 ms
Wall time: 4.05 s


1682

In [91]:
%%time
# Two renamed copies of the residual table, one per side of the item pair.
def _as_pair_side(side):
    """Return unbiased_ratings with item-specific columns suffixed by `side`."""
    return unbiased_ratings.toDF(side, 'user_id', 'rating', 'R_' + side, 'R_norm_' + side)

i = _as_pair_side('i')
j = _as_pair_side('j')

CPU times: user 0 ns, sys: 3.1 ms, total: 3.1 ms
Wall time: 7.45 ms


In [92]:
%%time
# All pairs of movies rated by the same user, each unordered pair kept once
# (numeric id of i strictly below numeric id of j).
rated_by_same_user = (i.user_id == j.user_id)
i_before_j = (i.i.cast(IntegerType()) < j.j.cast(IntegerType()))
s = i.join(j, on=rated_by_same_user).filter(i_before_j)

CPU times: user 782 µs, sys: 2.39 ms, total: 3.17 ms
Wall time: 97.2 ms


In [93]:
%%time
s.limit(10).toPandas()

CPU times: user 11.4 ms, sys: 106 µs, total: 11.5 ms
Wall time: 16.9 s


Unnamed: 0,i,user_id,rating,R_i,R_norm_i,j,user_id.1,rating.1,R_j,R_norm_j
0,15,168,5,1.449816,15.69754,1278,168,3,-0.395918,4.258124
1,15,168,5,1.449816,15.69754,300,168,5,1.537886,19.619104
2,15,168,5,1.449816,15.69754,1016,168,5,1.693435,11.237004
3,15,168,5,1.449816,15.69754,619,168,3,-0.156058,6.63603
4,15,168,5,1.449816,15.69754,819,168,4,1.101109,5.966964
5,15,168,5,1.449816,15.69754,259,168,2,-0.702001,14.554239
6,15,168,5,1.449816,15.69754,930,168,3,0.20378,8.148895
7,15,168,5,1.449816,15.69754,222,168,5,1.538985,17.211553
8,15,168,5,1.449816,15.69754,225,168,5,2.082636,10.212448
9,15,168,5,1.449816,15.69754,235,168,2,-0.761611,16.432934


In [94]:
%%time
# Cosine similarity per movie pair: dot product of residuals over common
# raters, divided by each movie's residual norm.
residual_dot_product = F.sum(s['R_i'] * s['R_j'])
cosine_expr = residual_dot_product / F.first(s['R_norm_i']) / F.first(s['R_norm_j'])
s2 = s.groupBy(['i', 'j']).agg(cosine_expr.alias('cos'))

CPU times: user 0 ns, sys: 2.93 ms, total: 2.93 ms
Wall time: 259 ms


In [95]:
%%time
# Mirror of s2 with the pair order swapped, so that every movie can be looked
# up on the `i` side.
s2_swap = s2.select(
    s2['j'].alias('i'),
    s2['i'].alias('j'),
    s2['cos'],
)

CPU times: user 804 µs, sys: 1.69 ms, total: 2.49 ms
Wall time: 7.92 ms


In [96]:
%%time
# Symmetric similarity table: both (i, j) and (j, i) rows, duplicates kept.
cos_m = s2.union(s2_swap)

CPU times: user 0 ns, sys: 737 µs, total: 737 µs
Wall time: 14.9 ms


In [97]:
%%time
cos_m.show(n=5)

+---+---+--------------------+
|  i|  j|                 cos|
+---+---+--------------------+
|168|429| 0.03112394874557793|
| 56|151|0.027668171110027763|
|323|682| 0.07774651863192829|
|319|905|0.014361342031837762|
|319|752|-0.02975612881029736|
+---+---+--------------------+
only showing top 5 rows

CPU times: user 2.24 ms, sys: 232 µs, total: 2.47 ms
Wall time: 18.4 s


In [98]:
%%time
cos_m.select('i').distinct().count()

CPU times: user 3.61 ms, sys: 0 ns, total: 3.61 ms
Wall time: 26 s


1682

In [99]:
%%time
cos_m.select('j').distinct().count()

CPU times: user 3.25 ms, sys: 0 ns, total: 3.25 ms
Wall time: 12.5 s


1682

Для каждого фильма, по которому у данного пользователя не стоит рейтинг, найдите:

[a] 30 ближайших фильмов-соседей для этого фильма (среди всех фильмов, а не фильмов, оценённых пользователем).

In [100]:
unrated_films.count()

1142

In [101]:
unrated_films.show(n=5)

+--------+--------------------+
|movie_id|         movie_title|
+--------+--------------------+
|       5|      Copycat (1995)|
|       6|Shanghai Triad (Y...|
|       8|         Babe (1995)|
|       9|Dead Man Walking ...|
|      14|  Postino, Il (1994)|
+--------+--------------------+
only showing top 5 rows



In [102]:
%%time
# Similarities from every movie the target user has not rated to other movies.
is_unrated_movie = (unrated_films['movie_id'] == cos_m['i'])
unrated_items_with_dist = unrated_films.join(cos_m, on=is_unrated_movie)

CPU times: user 451 µs, sys: 941 µs, total: 1.39 ms
Wall time: 167 ms


In [103]:
%%time
unrated_items_with_dist.groupby('i').count().count()

CPU times: user 1.1 ms, sys: 1.84 ms, total: 2.93 ms
Wall time: 12.4 s


1142

In [104]:
from pyspark.sql.window import Window

In [105]:
from pyspark.sql.functions import rank, row_number

In [106]:
# Window specification: one partition per 'i' item, rows ordered by similarity
# from highest to lowest; equal similarities are ordered by 'j' compared as an
# integer, ascending.
window = (
    Window
    .partitionBy(unrated_items_with_dist.i)
    .orderBy(
        unrated_items_with_dist['cos'].desc(),
        unrated_items_with_dist.j.cast('int').asc(),
    )
)

In [107]:
%%time
# Number the rows inside each window partition (column '#') and keep only the
# first 30 of every partition.
neighbour_items = (
    unrated_items_with_dist
    .select('movie_title', 'i', 'j', 'cos', F.row_number().over(window).alias('#'))
    .where(col('#') <= 30)
)

CPU times: user 831 µs, sys: 1.72 ms, total: 2.56 ms
Wall time: 32.3 ms


[b] Прогноз оценки пользователя по формуле (базовый предиктор из пункта 4, часть 3). Здесь S(i) — множество фильмов-соседей для фильма i, по которым у данного пользователя есть оценка.

<img src="https://camo.githubusercontent.com/246d2acb23de63e3d19ef415eb3e1e9fb5717c8e/687474703a2f2f646174612e6e657770726f6c61622e636f6d2f7075626c69632d6e657770726f6c61622d636f6d2f6c6162613038735f6974656d5f6974656d5f63662e706e67" alt="Drawing" style="width: 300px;"/>

In [108]:
# Restrict the bu table to the rows belonging to the target user.
base_predictors_target = bu.where(bu['user_id'] == target_id)

In [109]:
# Bring the first matching row to the driver and read its 'bu' field as a
# plain Python value; the bare name below displays it as the cell output.
bu_target = base_predictors_target.limit(1).collect()[0]['bu']
bu_target

0.3288647272727238

In [110]:
# Copy of bi with its columns renamed positionally to ('id', 'bi').
# toDF() requires exactly as many names as bi has columns.
bi2 = bi.toDF('id', 'bi')

In [111]:
# Preview the first five rows of the renamed table.
bi2.show(5)

+----+--------------------+
|  id|                  bi|
+----+--------------------+
| 829| -0.4270513400199922|
|1436|-0.04295878652633463|
| 467| 0.16690701840272296|
| 691| 0.05546248651815944|
|1090| -0.5930556738185061|
+----+--------------------+
only showing top 5 rows



In [112]:
# Keep only the target user's rows of unbiased_ratings.
unbiased_ratings_target = unbiased_ratings.where(unbiased_ratings.user_id == target_id)

In [113]:
%%time
# Preview the first five rows of the target user's ratings.
unbiased_ratings_target.show(5)

+-------+-------+------+--------------------+------------------+
|item_id|user_id|rating|                   R|            R_norm|
+-------+-------+------+--------------------+------------------+
|    467|    450|     4|-0.02563174567544646| 5.623730725521247|
|    125|    450|     4|  0.1266474717889623|14.235703723304308|
|    451|    450|     4| 0.30270955408295785|14.735546959488957|
|    926|    450|     4|  0.7143827033904318| 9.496854770596856|
|     51|    450|     4| 0.19425981326141084| 9.633276880528523|
+-------+-------+------+--------------------+------------------+
only showing top 5 rows

CPU times: user 760 µs, sys: 1.37 ms, total: 2.13 ms
Wall time: 1.57 s


In [114]:
# Preview the first five ranked neighbour rows.
neighbour_items.show(5)

+-------------+----+----+------------------+---+
|  movie_title|   i|   j|               cos|  #|
+-------------+----+----+------------------+---+
|Sliver (1993)|1090|1292|0.3268461799143295|  1|
|Sliver (1993)|1090|1239|0.3079501997849308|  2|
|Sliver (1993)|1090| 759|0.2801750499768897|  3|
|Sliver (1993)|1090| 725|0.2749508927803329|  4|
|Sliver (1993)|1090| 397|0.2621255648643679|  5|
+-------------+----+----+------------------+---+
only showing top 5 rows



In [115]:
%%time
# Build the per-neighbour inputs of the item-item predictor:
#   1. the inner join with the target user's ratings keeps only neighbours j
#      the user actually rated and attaches that rating's R value;
#   2. the inner join with bi2 attaches the bias of the movie i being predicted;
#   3. 'bui' is bi + mu + bu_target, computed per row.
predict_item_base = (
    neighbour_items
    .join(unbiased_ratings_target, neighbour_items.j == unbiased_ratings_target.item_id, 'inner')
    .join(bi2, neighbour_items.i == bi2.id, 'inner')
    .select('i', 'j', 'cos', 'R', 'movie_title', (bi2['bi'] + mu + bu_target).alias('bui'))
)

CPU times: user 1.56 ms, sys: 3.18 ms, total: 4.74 ms
Wall time: 447 ms


In [116]:
# Pull at most ten rows to the driver as a pandas DataFrame for display.
predict_item_base.limit(10).toPandas()

Unnamed: 0,i,j,cos,R,movie_title,bui
0,1090,783,0.179511,-0.423888,Sliver (1993),3.265669
1,1090,734,0.240344,-1.416972,Sliver (1993),3.265669
2,1090,801,0.252078,0.388498,Sliver (1993),3.265669
3,1090,583,0.185779,0.407883,Sliver (1993),3.265669
4,1090,43,0.234634,0.433374,Sliver (1993),3.265669
5,1090,725,0.274951,-0.64857,Sliver (1993),3.265669
6,1436,1490,0.601438,-0.821264,Mr. Jones (1993),3.815766
7,1436,1479,0.309764,-0.729831,Mr. Jones (1993),3.815766
8,1436,1053,0.367971,-0.815396,Mr. Jones (1993),3.815766
9,1436,794,0.324642,1.103522,Mr. Jones (1993),3.815766


In [117]:
%%time
# Per movie i: prediction = bui + sum(cos * R) / sum(|cos|) over its rows.
# 'bui' is built from values that depend only on i, so first() just picks
# that shared value. The result is then inner-joined to u_item on the movie id.
predict_item = (
    predict_item_base
    .groupBy('i')
    .agg(
        (
            F.first('bui')
            + F.sum(col('cos') * col('R')) / F.sum(F.abs(col('cos')))
        ).alias('predict')
    )
    .join(u_item, predict_item_base.i == u_item.movie_id, 'inner')
)

CPU times: user 6.07 ms, sys: 0 ns, total: 6.07 ms
Wall time: 620 ms


In [118]:
%%time
# Narrow the joined frame to (i, predict, title).
# NOTE: this rebinds predict_item, so re-running this cell alone fails —
# 'movie_title' no longer exists after the first run.
predict_item = predict_item.selectExpr('i', 'predict', 'movie_title as title')

CPU times: user 620 µs, sys: 456 µs, total: 1.08 ms
Wall time: 7.06 ms


In [119]:
%%time
# Preview the first five predictions.
predict_item.show(5)

+----+------------------+--------------------+
|   i|           predict|               title|
+----+------------------+--------------------+
|1090| 3.031933298995327|       Sliver (1993)|
|1436|3.4031201931052593|    Mr. Jones (1993)|
|1512| 4.762531114770054|World of Apu, The...|
| 675|3.6797664565015613|Nosferatu (Nosfer...|
| 691| 4.749241034266128|    Dark City (1998)|
+----+------------------+--------------------+
only showing top 5 rows

CPU times: user 775 µs, sys: 1.56 ms, total: 2.33 ms
Wall time: 14 s


Рекомендуйте пользователю 10 фильмов (predicators_top10) с самыми высокими оценками из пункта 3, часть 4.

In [120]:
%%time
# Ten rows with the highest predicted rating.
predicators_top10 = predict_item.orderBy(F.desc('predict')).limit(10)

CPU times: user 3.39 ms, sys: 0 ns, total: 3.39 ms
Wall time: 6.11 ms


In [121]:
%%time
# Materialise the top-10 rows on the driver as a pandas DataFrame;
# the bare name below displays it as the cell output.
item_item_prediction = predicators_top10.toPandas()
item_item_prediction

CPU times: user 5.26 ms, sys: 1.41 ms, total: 6.66 ms
Wall time: 13.3 s


In [122]:
# Display user_user_advise (computed earlier in the notebook) for comparison
# with the item-item result above.
user_user_advise

Unnamed: 0,movie_id,rating_prediction
0,1591,5.87963
1,1367,5.585312
2,1529,5.585312
3,19,5.342088
4,990,5.329101
5,1021,5.284317
6,916,5.280399
7,856,5.221958
8,888,5.155137
9,1137,5.155137


При подсчете прогноза по формуле из пункта 3, часть 4 отфильтруйте всех соседей с отрицательной близостью.

<img src="https://camo.githubusercontent.com/246d2acb23de63e3d19ef415eb3e1e9fb5717c8e/687474703a2f2f646174612e6e657770726f6c61622e636f6d2f7075626c69632d6e657770726f6c61622d636f6d2f6c6162613038735f6974656d5f6974656d5f63662e706e67" alt="Drawing" style="width: 300px;"/>

In [123]:
# Add 'adj_cos': the similarity with negative values replaced by 0.0
# (all other values are passed through unchanged).
temp = predict_item_base.withColumn(
    'adj_cos',
    F.when(col('cos') < 0.0, 0.0).otherwise(col('cos')),
)

In [124]:
# Item-item prediction using only neighbours with positive similarity.
#
# The task asks to filter out neighbours with negative similarity before
# applying  bui + sum(s_ij * R_uj) / sum(|s_ij|).  The previous version
# aggregated the raw 'cos' column (and abs(cos) in the denominator), so the
# clamped 'adj_cos' column was never used and negative neighbours still
# contributed — the result was the same as the unfiltered predictor.
#
# Rows with adj_cos == 0 (negative or zero similarity) are dropped up front:
# they add nothing to either sum, and removing them keeps movies without any
# positive neighbour from producing a 0/0 (NULL) prediction. After the filter
# every adj_cos is positive, so no abs() is needed in the denominator.
predict_item_positive = temp.filter(col('adj_cos') > 0.0).groupby('i').agg(
    (F.first('bui') + F.sum(col('adj_cos') * col('R')) / F.sum(col('adj_cos'))).alias('predict')
).join(u_item, on=predict_item_base.i == u_item.movie_id)

In [125]:
# Preview the first five positive-neighbour predictions.
predict_item_positive.show(5)

+----+------------------+--------+--------------------+
|   i|           predict|movie_id|         movie_title|
+----+------------------+--------+--------------------+
|1090| 3.031933298995327|    1090|       Sliver (1993)|
|1436|3.4031201931052597|    1436|    Mr. Jones (1993)|
|1512| 4.762531114770054|    1512|World of Apu, The...|
| 675|3.6797664565015613|     675|Nosferatu (Nosfer...|
| 691| 4.749241034266128|     691|    Dark City (1998)|
+----+------------------+--------+--------------------+
only showing top 5 rows



Рекомендуйте пользователю 10 фильмов (predicators_positive_top10) с самыми высокими оценками из пункта 5, часть 4.

In [126]:
# Ten rows with the highest predicted rating.
predicators_positive_top10 = predict_item_positive.orderBy(F.desc('predict')).limit(10)

In [127]:
# Materialise the top-10 rows on the driver as a pandas DataFrame;
# the bare name below displays it as the cell output.
item_item_prediction_pos = predicators_positive_top10.toPandas()
item_item_prediction_pos

Unnamed: 0,i,predict,movie_id,movie_title
0,1194,5.252189,1194,Once Were Warriors (1994)
1,694,5.075911,694,Persuasion (1995)
2,1167,5.069466,1167,"Sum of Us, The (1994)"
3,1368,5.060906,1368,Mina Tannenbaum (1994)
4,48,5.048244,48,Hoop Dreams (1994)
5,361,5.029601,361,Incognito (1997)
6,915,5.024281,915,Primary Colors (1998)
7,408,4.991821,408,"Close Shave, A (1995)"
8,889,4.974996,889,"Tango Lesson, The (1997)"
9,793,4.96865,793,Crooklyn (1994)


In [128]:
# Attach movie titles to pearson_top10 (inner join on the shared 'movie_id'
# column) and display the result as a pandas DataFrame.
pearson_top10.join(u_item, 'movie_id', 'inner').toPandas()

Unnamed: 0,movie_id,rating_prediction,movie_title
0,19,5.342088,Antonia's Line (1995)
1,856,5.221958,Night on Earth (1991)
2,888,5.155137,One Night Stand (1997)
3,916,5.280399,Lost in Space (1998)
4,990,5.329101,Anna Karenina (1997)
5,1021,5.284317,8 1/2 (1963)
6,1137,5.155137,Beautiful Thing (1996)
7,1367,5.585312,Faust (1994)
8,1529,5.585312,Underground (1995)
9,1591,5.87963,Duoluo tianshi (1995)


In [129]:
# Assemble the answer payload: the value of mu plus the two top-10 movie-id
# lists, with every id converted to a plain Python int so it serialises to JSON.
ds = dict(
    average_rating=mu,
    predicators_positive_top10=list(map(int, item_item_prediction_pos.movie_id.tolist())),
    predicators_top10=list(map(int, item_item_prediction.i.tolist())),
)

In [130]:
# Serialise the answer dict and write it to the result file
# (the file is created or overwritten).
with open('../lab08s.json', 'w') as out_file:
    out_file.write(json.dumps(ds))