# Matrix Factorization

In [1]:
import findspark
findspark.init("/usr/spark-2.4.1")
import pyspark
from pyspark import SQLContext

# Request 14 GB per executor. The property is set before the SparkContext is
# constructed on the next line; the PySpark docs state that setSystemProperty
# must be invoked before instantiating the context.
pyspark.SparkContext.setSystemProperty('spark.executor.memory', '14g')
# Entry point to the cluster, created with default arguments.
sc = pyspark.SparkContext()
# SQL/DataFrame entry point built on top of the context; used below as `sql.read`.
sql = SQLContext(sc)

In [2]:
datafile = "data/training_sample.tsv"

# Column names applied positionally to the loaded file (it is read without a
# header row, so the order here must match the order of fields in the file).
column_names = [
    # tweet-level fields
    "text_tokens", "hashtags", "tweet_id", "present_media", "present_links",
    "present_domains", "tweet_type", "language", "tweet_timestamp",
    # engaged_with_user_* fields
    "engaged_with_user_id", "engaged_with_user_follower_count",
    "engaged_with_user_following_count", "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    # engaging_user_* fields
    "engaging_user_id", "engaging_user_follower_count",
    "engaging_user_following_count", "engaging_user_is_verified",
    "engaging_user_account_creation",
    "engaged_follows_engaging",
    # engagement timestamps
    "reply_timestamp", "retweet_timestamp", "retweet_with_comment_timestamp",
    "like_timestamp",
]

# Read as CSV with the \x01 control character as the field separator and let
# Spark infer the column types.
raw_reader = (sql.read
    .format("csv")
    .option("header", "false")
    .option("sep", "\x01"))
df = raw_reader.load(datafile, inferSchema="true").toDF(*column_names)

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer


# Map the raw id columns to numeric index columns named "user" and "tweet";
# these are the columns the ALS model is later configured to read.
user_indexer = StringIndexer(
    inputCol="engaging_user_id",
    outputCol="user",
)
tweet_indexer = StringIndexer(
    inputCol="tweet_id",
    outputCol="tweet",
)

# Fit both indexers in one pass, then append the index columns to df.
pipeline = Pipeline(stages=[user_indexer, tweet_indexer])
fitted_indexers = pipeline.fit(df)
df = fitted_indexers.transform(df)

In [18]:
from pyspark.sql.functions import  when, col

def encode_response(x):
    """Build a 0/1 indicator Column for whether column ``x`` holds a value.

    Args:
        x: Name of the column to test.

    Returns:
        A Column expression that is 0.0 where ``x`` is null and 1.0 otherwise.
    """
    is_missing = col(x).isNull()
    return when(is_missing, 0.0).otherwise(1.0)

def implicit_feedback(creation_time, interaction_time):
    """Build a Column holding the gap between an interaction and a creation time.

    Args:
        creation_time: Name of the column subtracted from the interaction value.
        interaction_time: Name of the column holding the interaction value;
            may be null.

    Returns:
        A Column expression that is 0.0 where ``interaction_time`` is null and
        ``interaction_time - creation_time`` otherwise.

    NOTE(review): not called by any cell visible in this notebook.
    """
    interaction = col(interaction_time)
    elapsed = interaction - col(creation_time)
    return when(interaction.isNull(), 0.0).otherwise(elapsed)

# Add the binary "like" label: 1.0 when like_timestamp is present, else 0.0.
df = df.withColumn(
    "like",
    encode_response("like_timestamp"),
)

# Keep only the three columns the recommender consumes.
data = df.select(["user", "tweet", "like"])

In [19]:
# Hold out roughly 20% of the rows for evaluation. The split is seeded so that
# re-running the notebook on the same input does not reshuffle the rows between
# training and test (which would change every metric reported below).
(training, test) = data.randomSplit([0.8, 0.2], seed=42)

In [20]:
# Peek at the first 15 training rows to sanity-check the (user, tweet, like) triples.
training.head(15)

[Row(user=0.0, tweet=894.0, like=1.0),
 Row(user=0.0, tweet=12672.0, like=1.0),
 Row(user=0.0, tweet=21419.0, like=1.0),
 Row(user=0.0, tweet=28349.0, like=1.0),
 Row(user=0.0, tweet=63325.0, like=1.0),
 Row(user=0.0, tweet=71604.0, like=1.0),
 Row(user=3.0, tweet=5885.0, like=0.0),
 Row(user=3.0, tweet=9583.0, like=0.0),
 Row(user=3.0, tweet=32968.0, like=0.0),
 Row(user=3.0, tweet=35854.0, like=0.0),
 Row(user=3.0, tweet=70495.0, like=0.0),
 Row(user=4.0, tweet=4665.0, like=1.0),
 Row(user=4.0, tweet=37107.0, like=1.0),
 Row(user=4.0, tweet=46505.0, like=1.0),
 Row(user=4.0, tweet=59001.0, like=1.0)]

In [65]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# Fit an ALS matrix-factorization model on the (user, tweet, like) triples.
#   implicitPrefs=True       -> "like" is treated as implicit feedback, so the
#                               predictions are preference scores, not ratings.
#   coldStartStrategy="drop" -> test rows whose user or tweet was not seen in
#                               training are dropped from the predictions
#                               instead of being returned as NaN.
#   seed=42                  -> fixes the random factor initialisation so the
#                               fitted model is reproducible across runs.
als = ALS(maxIter=10, regParam=0.01, rank=20,
          userCol="user", itemCol="tweet", ratingCol="like",
          coldStartStrategy="drop", implicitPrefs=True,
          seed=42)
model = als.fit(training)

# Score the held-out rows; this appends a "prediction" column. No RMSE is
# computed in this cell; the evaluation happens in the cells that follow.
predictions = model.transform(test)

In [66]:
# Inspect the first 10 scored rows (raw ALS prediction next to the true label).
predictions.take(10)

[Row(user=869.0, tweet=1088.0, like=1.0, prediction=-2.731183030846296e-06),
 Row(user=483.0, tweet=243.0, like=1.0, prediction=-3.378973142953434e-11),
 Row(user=4047.0, tweet=392.0, like=0.0, prediction=5.73888048996389e-17),
 Row(user=94.0, tweet=737.0, like=1.0, prediction=-0.0006277129286900163),
 Row(user=36.0, tweet=897.0, like=1.0, prediction=0.0),
 Row(user=1238.0, tweet=31.0, like=1.0, prediction=3.450740848620626e-07),
 Row(user=2048.0, tweet=516.0, like=0.0, prediction=0.0),
 Row(user=504.0, tweet=1139.0, like=1.0, prediction=0.0),
 Row(user=6155.0, tweet=85.0, like=1.0, prediction=0.0),
 Row(user=598.0, tweet=85.0, like=1.0, prediction=5.024946929908791e-13)]

In [69]:
# Pair each score with its true label as a plain (prediction, like) tuple,
# which is the input handed to BinaryClassificationMetrics below.
predictionAndLabels = (
    predictions
    .select("prediction", "like")
    .rdd
    .map(lambda row: (row["prediction"], row["like"]))
)

In [70]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Wrap the (prediction, like) pairs in the MLlib binary-classification evaluator.
metrics = BinaryClassificationMetrics(predictionAndLabels)

# Report the area under the precision-recall curve.
area_under_pr = metrics.areaUnderPR
print("Area under PR = %s" % area_under_pr)

Area under PR = 0.745748752782995


In [67]:
def convert_to_class(x, threshold):
    """Build a Column that binarises column ``x`` against ``threshold``.

    Args:
        x: Name of the column holding the score.
        threshold: Cut-off; values less than or equal to it map to class 0.

    Returns:
        A Column expression that is 0.0 where ``x <= threshold`` and 1.0
        otherwise.
    """
    at_or_below = col(x) <= threshold
    return when(at_or_below, 0.0).otherwise(1.0)

# Turn the raw score into a hard label: strictly positive predictions become
# class 1.0, everything at or below zero becomes class 0.0.
predictions = predictions.withColumn(
    "prediction_class",
    convert_to_class("prediction", 0),
)

In [68]:
# Inspect the first 10 rows again, now including the thresholded prediction_class column.
predictions.take(10)

[Row(user=869.0, tweet=1088.0, like=1.0, prediction=-2.731183030846296e-06, prediction_class=0.0),
 Row(user=483.0, tweet=243.0, like=1.0, prediction=-3.378973142953434e-11, prediction_class=0.0),
 Row(user=4047.0, tweet=392.0, like=0.0, prediction=5.73888048996389e-17, prediction_class=1.0),
 Row(user=94.0, tweet=737.0, like=1.0, prediction=-0.0006277129286900163, prediction_class=0.0),
 Row(user=36.0, tweet=897.0, like=1.0, prediction=0.0, prediction_class=0.0),
 Row(user=1238.0, tweet=31.0, like=1.0, prediction=3.450740848620626e-07, prediction_class=1.0),
 Row(user=2048.0, tweet=516.0, like=0.0, prediction=0.0, prediction_class=0.0),
 Row(user=504.0, tweet=1139.0, like=1.0, prediction=0.0, prediction_class=0.0),
 Row(user=6155.0, tweet=85.0, like=1.0, prediction=0.0, prediction_class=0.0),
 Row(user=598.0, tweet=85.0, like=1.0, prediction=5.024946929908791e-13, prediction_class=1.0)]