In [1]:
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import when
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
from IPython.display import display

import matrix_factorization
import importlib
importlib.reload(matrix_factorization)
from matrix_factorization import *

In [None]:
# Build (or reuse, via getOrCreate) the Spark session shared by every cell below.
# Disabled option kept for reference:
#   .config("spark.executor.memoryOverhead", "64g")
# NOTE(review): driver.maxResultSize (64g) is larger than driver.memory (32g) — confirm intended.
spark = (
    SparkSession.builder
    .appName("matrix-factorization")
    .config("spark.executor.heartbeatInterval", "60s")
    .config("spark.executor.memory", "32g")
    .config("spark.driver.memory", "32g")
    .config("spark.driver.maxResultSize", "64g")
    .config("spark.sql.crossJoin.enabled", True)
    .getOrCreate()
)
# SparkContext backing the session; passed to matrix_factorization later on.
sc = spark.sparkContext

In [None]:
def to_pd(df, take=0):
    """Pull rows of a Spark DataFrame onto the driver as a pandas DataFrame.

    Args:
        df: object exposing ``take(n)``, ``collect()`` and ``columns``
            (a Spark DataFrame in this notebook).
        take: when greater than zero, fetch only that many rows via
            ``df.take``; otherwise (zero or negative) fetch everything
            via ``df.collect``.

    Returns:
        A pandas DataFrame whose columns are ``df.columns``.
    """
    # Choose the fetch strategy once, then build the frame in a single place.
    rows = df.take(take) if take > 0 else df.collect()
    return pd.DataFrame(rows, columns=df.columns)

In [None]:
# Instantiate the project's matrix_factorization object with the Spark session and context.
mf = matrix_factorization(spark, sc)
# Statement order matters here: the full training file is read first and the
# result is stashed under mf.full_train before read_train is called again.
# Presumably each read_train call (re)assigns mf.train — TODO confirm in matrix_factorization.
mf.read_train("test_indexed/full_train_mf_indexed.tsv", is_preprocessed=True)
mf.full_train = mf.train
# Second read_train call, this time on the (non-"full") training file.
mf.read_train("test_indexed/train_mf_indexed.tsv", is_preprocessed=True)
# Read the test file; mf.test is what the next cell scores.
mf.read_test("test_indexed/test_mf_indexed.tsv", is_preprocessed=True)

In [10]:
# Load a previously saved ALS model from disk. The directory name suggests the
# "reply" target trained with rank 75 — TODO confirm against the training run.
model = ALSModel.load("test_indexed/model_reply_rank_75")
# Score the test set; the output below shows an added "prediction" column.
predictions = model.transform(mf.test)
# Print the leading rows as a quick sanity check.
predictions.show()

+----+-----+-------+--------------------+--------------+----------------------+----------+
|like|reply|retweet|retweet_with_comment|tweet_id_index|engaging_user_id_index|prediction|
+----+-----+-------+--------------------+--------------+----------------------+----------+
|   0|    0|      0|                   0|          1580|              10325608|       0.0|
|   0|    0|      0|                   0|          2142|              13721555|       0.0|
|   0|    0|      0|                   0|          2142|               7785837|       0.0|
|   0|    0|      0|                   0|          3997|              18849816|       0.0|
|   0|    0|      0|                   0|          4935|               4009461|       0.0|
|   0|    0|      0|                   0|          7880|              12412775|       0.0|
|   0|    0|      0|                   0|         10817|              12684839|       0.0|
|   0|    0|      0|                   0|         11033|              20281415|       0.0|

In [None]:
# Recommendation queries against the loaded ALS model.
# NOTE: variable names still say "movie" (inherited from the Spark docs example)
# and are kept so any later cell that references them keeps working; judging by
# the prediction output the items here are tweets (tweet_id_index) — TODO confirm.

# Generate the top 10 item recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate the top 10 user recommendations for each item
movieRecs = model.recommendForAllItems(10)

# The original cell used `df` and `als`, neither of which is defined in this
# notebook (the model is loaded from disk, there is no ALS estimator), so it
# raised NameError. Read the user/item column names from the loaded model's
# params instead, and sample ids from the test set already scored above.
user_col = model.getOrDefault("userCol")
item_col = model.getOrDefault("itemCol")

# Generate the top 10 item recommendations for a specified set of users
users = mf.test.select(user_col).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Generate the top 10 user recommendations for a specified set of items
movies = mf.test.select(item_col).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)