In [1]:
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import when
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from IPython.display import display

import twitter_preproc
import importlib
importlib.reload(twitter_preproc)
from twitter_preproc import *

In [2]:
conf = SparkConf().setAll([
    ("num-executors", 4), 
    ("total-executor-cores", 16), 
    ("executor-memory", "8g"),
    ("spark.yarn.executor.memoryOverhead", "64g")])
sc = SparkContext(appName = "spark-cl", conf=conf)
spark = SparkSession.builder.getOrCreate()

In [54]:
preproc = twitter_preproc(spark, sc, "///tmp/traintweet_10k.tsv", MF=True)
df = preproc.getDF()
pd.DataFrame(df.take(5), columns=df.columns)

Unnamed: 0,tweet_id,engaging_user_id,engaged_with_user_id,retweet_timestamp,reply_timestamp,retweet_with_comment_timestamp,like_timestamp
0,E7D6C5094767223F6F8789A87A1937AB,00000776B07587ECA9717BFC301F2D6E,D557B03872EF8986F7F4426AE094B2FE,,,,
1,129F4A868712BA2B98D31AF98C3066E4,00000B85AAF7DE172876FD96718C4469,424822AC982CE0E8965506C63B44EC12,1581498000.0,,,1581498000.0
2,04C6C2175852CDBBC23B2446C7E7C22D,00000E0C9B364891CDE89ECFC54771DE,1EC14E26417AA926095530AC591BA9CE,,,,1581061000.0
3,168157826315514C120494D4DF8E6216,00000F04EEDBCF3E1FB9A1948BF353B6,9B9595B6FEB8948BDDF0D222F27E0118,,,,1581329000.0
4,B3E3673782A69D9D8A45D3B222F0B073,000010088197DA00D659853E06935B3E,525DC99B7CB8F1AC4AD3E66C53FA38E0,,,,1580958000.0


In [55]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(df) for column in ["tweet_id", "engaging_user_id", "engaged_with_user_id"]]
pipeline = Pipeline(stages=indexers)
df = pipeline.fit(df).transform(df)
pd.DataFrame(df.take(5), columns=df.columns)

Unnamed: 0,tweet_id,engaging_user_id,engaged_with_user_id,retweet_timestamp,reply_timestamp,retweet_with_comment_timestamp,like_timestamp,tweet_id_index,engaging_user_id_index,engaged_with_user_id_index
0,E7D6C5094767223F6F8789A87A1937AB,00000776B07587ECA9717BFC301F2D6E,D557B03872EF8986F7F4426AE094B2FE,,,,,8141.0,5571.0,4453.0
1,129F4A868712BA2B98D31AF98C3066E4,00000B85AAF7DE172876FD96718C4469,424822AC982CE0E8965506C63B44EC12,1581498000.0,,,1581498000.0,7797.0,2663.0,7403.0
2,04C6C2175852CDBBC23B2446C7E7C22D,00000E0C9B364891CDE89ECFC54771DE,1EC14E26417AA926095530AC591BA9CE,,,,1581061000.0,5188.0,1639.0,5435.0
3,168157826315514C120494D4DF8E6216,00000F04EEDBCF3E1FB9A1948BF353B6,9B9595B6FEB8948BDDF0D222F27E0118,,,,1581329000.0,7121.0,5109.0,5812.0
4,B3E3673782A69D9D8A45D3B222F0B073,000010088197DA00D659853E06935B3E,525DC99B7CB8F1AC4AD3E66C53FA38E0,,,,1580958000.0,9308.0,1602.0,2904.0


In [56]:
df = df.withColumn("retweet", when(df["retweet_timestamp"].isNotNull(), 1))\
    .withColumn("reply", when(df["reply_timestamp"].isNotNull(), 1))\
    .withColumn("retweet_with_comment", when(df["retweet_with_comment_timestamp"].isNotNull(), 1))\
    .withColumn("like", when(df["like_timestamp"].isNotNull(), 1))
df = df.fillna(0, subset=["retweet", "reply", "retweet_with_comment", "like"])
pd.DataFrame(df.take(5), columns=df.columns)

Unnamed: 0,tweet_id,engaging_user_id,engaged_with_user_id,retweet_timestamp,reply_timestamp,retweet_with_comment_timestamp,like_timestamp,tweet_id_index,engaging_user_id_index,engaged_with_user_id_index,retweet,reply,retweet_with_comment,like
0,E7D6C5094767223F6F8789A87A1937AB,00000776B07587ECA9717BFC301F2D6E,D557B03872EF8986F7F4426AE094B2FE,,,,,8141.0,5571.0,4453.0,0,0,0,0
1,129F4A868712BA2B98D31AF98C3066E4,00000B85AAF7DE172876FD96718C4469,424822AC982CE0E8965506C63B44EC12,1581498000.0,,,1581498000.0,7797.0,2663.0,7403.0,1,0,0,1
2,04C6C2175852CDBBC23B2446C7E7C22D,00000E0C9B364891CDE89ECFC54771DE,1EC14E26417AA926095530AC591BA9CE,,,,1581061000.0,5188.0,1639.0,5435.0,0,0,0,1
3,168157826315514C120494D4DF8E6216,00000F04EEDBCF3E1FB9A1948BF353B6,9B9595B6FEB8948BDDF0D222F27E0118,,,,1581329000.0,7121.0,5109.0,5812.0,0,0,0,1
4,B3E3673782A69D9D8A45D3B222F0B073,000010088197DA00D659853E06935B3E,525DC99B7CB8F1AC4AD3E66C53FA38E0,,,,1580958000.0,9308.0,1602.0,2904.0,0,0,0,1


In [57]:
user_count = df.select("engaged_with_user_id_index")\
    .groupBy("engaged_with_user_id_index")\
    .agg({'engaged_with_user_id_index': 'count'})
user_count.filter(user_count["count(engaged_with_user_id_index)"] > 1).show()

+--------------------------+---------------------------------+
|engaged_with_user_id_index|count(engaged_with_user_id_index)|
+--------------------------+---------------------------------+
|                     305.0|                                2|
|                     496.0|                                2|
|                     299.0|                                2|
|                     147.0|                                2|
|                     170.0|                                2|
|                     184.0|                                2|
|                     169.0|                                2|
|                     160.0|                                2|
|                     379.0|                                2|
|                       8.0|                                7|
|                      67.0|                                3|
|                      70.0|                                3|
|                     486.0|                           

In [58]:
tweet_count = df.select("tweet_id_index")\
    .groupBy("tweet_id_index")\
    .agg({'tweet_id_index': 'count'})
tweet_count.filter(tweet_count["count(tweet_id_index)"] > 1).show()

+--------------+---------------------+
|tweet_id_index|count(tweet_id_index)|
+--------------+---------------------+
|           8.0|                    2|
|           0.0|                    6|
|           7.0|                    2|
|          18.0|                    2|
|           1.0|                    4|
|          25.0|                    2|
|           4.0|                    2|
|          23.0|                    2|
|          11.0|                    2|
|          21.0|                    2|
|          14.0|                    2|
|          22.0|                    2|
|          19.0|                    2|
|           3.0|                    2|
|           2.0|                    3|
|          17.0|                    2|
|          10.0|                    2|
|          13.0|                    2|
|           6.0|                    2|
|          20.0|                    2|
+--------------+---------------------+
only showing top 20 rows



In [31]:
(training, test) = df.randomSplit([0.8, 0.2])

In [32]:
pd.DataFrame(test.take(1), columns=test.columns)

Unnamed: 0,tweet_id,engaging_user_id,engaged_with_user_id,retweet_timestamp,reply_timestamp,retweet_with_comment_timestamp,like_timestamp,tweet_id_index,engaging_user_id_index,engaged_with_user_id_index,retweet,reply,retweet_with_comment,like
0,012F5BF74EE06D38FF1F728867070975,00016E8D8F1CF22A191FBF4056AC21C3,A457C46D206293BFFBD5C6B3F16A253A,,,,1581211912,313.0,1.0,617.0,0,0,0,1


In [34]:
rating_column = "retweet"
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="engaging_user_id_index", itemCol="tweet_id_index", ratingCol=rating_column,
          coldStartStrategy="nan", implicitPrefs=True)
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol=rating_column,
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = nan


In [None]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

# Generate top 10 movie recommendations for a specified set of users
users = df.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Generate top 10 user recommendations for a specified set of movies
movies = df.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)

In [None]:
pd.DataFrame(predictions.collect(), columns=predictions.columns)

In [20]:
pd.DataFrame(test.collect(), columns=test.columns)

Unnamed: 0,tweet_id,engaging_user_id,engaged_with_user_id,retweet_timestamp,reply_timestamp,retweet_with_comment_timestamp,like_timestamp,tweet_id_index,engaging_user_id_index,engaged_with_user_id_index
0,006A0A3DD3A1E4D5048439A746AB5D02,0005C541C84A20B49712BCB3BDAA8D15,36C732D507715E9423896CEAE2A1B218,0,0,0,0,464.0,729.0,439.0
1,012F5BF74EE06D38FF1F728867070975,00016E8D8F1CF22A191FBF4056AC21C3,A457C46D206293BFFBD5C6B3F16A253A,0,0,0,1581211912,313.0,1.0,617.0
2,022171134EBCA7337E5C2240CA9D385F,0007B37A7EBA8E4CF515CDA4BAF8D4C3,1F3E6C0DAFFAD3A98A644370A4C7951E,1581155037,0,0,1581155037,43.0,14.0,69.0
3,03DA2EBAD316B6ED489E54E6C1356F5C,0005BA0CC63217600DD9835FA036DC36,A8CBBFC9C15EC3ADCC4FA92B79013A43,0,0,0,1581125060,932.0,437.0,190.0
4,061FBB4B8DD7E16CF244165F14097020,000AECE2A829D40437422F4C852E6928,1F41012156691CEE20A0513A55077EF0,0,0,0,1581431216,975.0,93.0,216.0
5,06DBA610BC2439CABF813C63B7008C6C,000AC4DA9C84C09C4F837F652AEA27FB,0E9B8963EB691F819BCD3546FD45F32E,0,0,0,0,461.0,633.0,566.0
6,07D0F44E555DFF017A080B5BEAB66BE1,0007EDDCA88ECB52E9F6B23852BF37D9,FBB188A3C1E05C41587AAAC00B5B1879,0,0,0,0,93.0,739.0,120.0
7,08DC6F9BEC1296F056061D6C30D6304F,00070B6278D5B5657631898619492F9E,75CF426B74180D90DA07167CF04225FE,0,0,0,1580971528,855.0,65.0,138.0
8,09014F79993C0461C16B602F755E1A46,000304C82A3763234449875C63F567B2,2A4EAB2A611968C1178E08C98A00F6F8,0,0,0,0,971.0,280.0,242.0
9,0A6DDFCB1973BD7C30D6B425B915050A,0009A8084DEE3FA12A7F61584943090A,AE7F3DA35832DD67B71D74467CC355C4,0,0,0,1581434250,25.0,90.0,761.0
