In [48]:
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import when
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from IPython.display import display

import train_test_split
import importlib
importlib.reload(train_test_split)
from train_test_split import *

In [2]:
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("matrix-factorization")
    .config("spark.executor.memoryOverhead", "64g")
    .config("spark.executor.heartbeatInterval", "60s")
    .config("spark.executor.instances", 50)
    .config("spark.executor.cores", 4)
    # "executor-memory" is only the spark-submit command-line flag; as a config
    # key it is not a recognised property, so the executor heap stayed at the
    # default. The property name is spark.executor.memory.
    .config("spark.executor.memory", "8g")
    .config("spark.driver.maxResultSize", "64g")
    .config("spark.sql.crossJoin.enabled", True)
    .getOrCreate()
)
# Handle on the underlying SparkContext; passed to train_test_split further down.
sc = spark.sparkContext

In [3]:
def to_pd(df, take=0):
    """Convert a Spark DataFrame into a pandas DataFrame on the driver.

    Args:
        df: object exposing ``take(n)``, ``collect()`` and ``columns``
            (a Spark DataFrame in this notebook).
        take: when greater than 0, only the first ``take`` rows are fetched;
            otherwise every row is collected.

    Returns:
        pandas.DataFrame holding the fetched rows, with ``df.columns`` as
        its column labels.
    """
    fetched_rows = df.take(take) if take > 0 else df.collect()
    return pd.DataFrame(fetched_rows, columns=df.columns)

In [49]:
# Create the project's train/test split helper, bound to the active Spark session and context.
split = train_test_split(spark, sc)
# Load the pre-indexed interaction file; full_df is read from the helper right after,
# so this call presumably populates it — verify against train_test_split.
split.read_preprocessed("training_mf_indexed_1000.tsv")
df = split.full_df
# Preview the first 5 rows as a pandas frame (last expression, so the notebook renders it).
to_pd(df, 5)

Unnamed: 0,engaging_user_id_index,tweet_id_index,engaged_with_user_id,retweet_timestamp,reply_timestamp,retweet_with_comment_timestamp,like_timestamp,engaging_user_id,tweet_id,engaged_with_user_id_index,like,reply,retweet,retweet_with_comment
0,29,29,,,,,,,,,0,0,0,0
1,29,26,,,,,,,,,0,0,0,0
2,29,474,,,,,,,,,0,0,0,0
3,29,964,,,,,,,,,0,0,0,0
4,29,191,,,,,,,,,0,0,0,0


In [50]:
# Run the helper's split step on the full frame; split.train / split.test are read
# immediately afterwards, so this presumably fills them — verify against train_test_split.
split.get_all_train_test(split.full_df)

# Engagement type to model; it is both the key into the split dicts and the label column.
engagement = "retweet"
training = split.train[engagement]
test = split.test[engagement]

In [51]:
# Row counts of the two splits, displayed as a (training, test) tuple.
# NOTE(review): the recorded output is 471836 training rows against only 30 test rows —
# confirm this split ratio is intended before trusting any metric computed on `test`.
training.count(), test.count()

(471836, 30)

In [52]:
# Build the recommendation model using ALS on the training data.
# coldStartStrategy="drop" discards test rows that would otherwise get a NaN
# prediction, so the evaluation metric is not NaN.
# An explicit seed fixes the factor initialisation so a re-run on a fresh kernel
# reproduces the same model; without it the fit can differ between runs.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="engaging_user_id_index",
    itemCol="tweet_id_index",
    ratingCol=engagement,
    coldStartStrategy="drop",
    implicitPrefs=True,
    seed=42,
)
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data.
# NOTE(review): with implicitPrefs=True the predictions are preference scores rather
# than rating estimates, so RMSE against the 0/1 label may not be the right metric.
# The RMSE recorded in this notebook (0.8165) equals sqrt(20/30), which is what an
# all-zero prediction column would give for 20 positive / 10 negative labels —
# inspect `predictions` before relying on this number.
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol=engagement, predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.816496580927726


In [53]:
# Label distribution among the rows that received a prediction.
# The dict form of agg() names the aggregate by string, so this cell no longer needs
# an `F` alias (no visible import binds one) and follows `engagement` instead of a
# hard-coded column name. The result column is still named count(<engagement>).
predictions.select(engagement).groupBy(engagement).agg({engagement: "count"}).show()

+-------+--------------+
|retweet|count(retweet)|
+-------+--------------+
|      1|            20|
|      0|            10|
+-------+--------------+



In [None]:
# NOTE: the item column of this model is "tweet_id_index", so the "movie*" variables
# below hold tweets; the names are left unchanged in case other cells use them.

# Generate top 10 tweet recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each tweet
movieRecs = model.recommendForAllItems(10)

# Generate top 10 tweet recommendations for a specified set of users
# (three distinct user indices taken from the full frame)
users = df.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Generate top 10 user recommendations for a specified set of tweets
# (three distinct tweet indices taken from the full frame)
movies = df.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)