# Content based recommender on the twitter dataset

We used rocchios algorithm to define the preferences of a user (profile). The user profile is a single feature vector, this vector will be compared with a new observation to determine the cosine similarity. We normalize the cosine similarity with (sim +1)/2 to map it into a range of 0..1. The result is technically not the cosine similarity, but still an indicator if a user likes or dislikes the item.
The resulting metric ranges from total negativity(0) to total positivity (1) with 0.5 indicating total independence.



In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import twitter_preproc

conf = SparkConf().setAll([
    ("num-executors", 4), 
    ("total-executor-cores", 32), 
    ("executor-memory", "8g"),
    ("spark.yarn.executor.memoryOverhead", "64g")])
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Relevance Feedback with Rocchios method 

https://en.wikipedia.org/wiki/Rocchio_algorithm

1. Get every tweet with which a user has interacted
2. Split these positive/negativ ones (based on the user interaction).
3. Aggregate and average the pos/neg vectors and normalize them 1/N+-
5. Multiply each with weights
6. Substract the negative feedback from the positive feedback to get a single vector representing the preference of a user

This happens in class content_based, in the method generate_user_profile

In [5]:
from pyspark.sql.types import *
from pyspark.sql.functions import when
from pyspark.ml.feature import RegexTokenizer,NGram,CountVectorizer,IDF,StringIndexer,Normalizer
from pyspark.ml import Pipeline
from pyspark.ml.linalg import VectorUDT
from pyspark.ml.linalg import DenseVector
from pyspark.ml.linalg import SparseVector
from pyspark.sql import Row
from pyspark.sql.functions import udf
import scipy.sparse as sps
from scipy.spatial.distance import cosine
import pandas as pd


class content_based:
    def __init__(self, spark: SparkSession, sc: SparkContext):
        self.spark = spark
        self.sc = sc

    
    '''
    This is a helper method to read a csv file and preprocess using a spark pipeline.
    The main purpose in this method is to transform the column text_tokens into an ID-IDF feature vector
    This method gets called to process the training set, and the test set in an extra step
    '''
    def get_processed_data(self,datapath):        
        SCHEMA = StructType([
            StructField("text_tokens", StringType()),
            StructField("hashtags", StringType()),
            StructField("tweet_id", StringType()),
            StructField("present_media", StringType()),
            StructField("present_links", StringType()),
            StructField("present_domains", StringType()),
            StructField("tweet_type", StringType()),
            StructField("language", StringType()),
            StructField("tweet_timestamp", LongType()),
            StructField("engaged_with_user_id", StringType()),
            StructField("engaged_with_user_follower_count", LongType()),
            StructField("engaged_with_user_following_count", LongType()),
            StructField("engaged_with_user_is_verified", BooleanType()),
            StructField("engaged_with_user_account_creation", LongType()),
            StructField("engaging_user_id", StringType()),
            StructField("engaging_user_follower_count", LongType()),
            StructField("engaging_user_following_count", LongType()),
            StructField("engaging_user_is_verified", BooleanType()),
            StructField("engaging_user_account_creation", LongType()),
            StructField("engaged_follows_engaging", BooleanType()),
            StructField("reply_timestamp", LongType()),
            StructField("retweet_timestamp", LongType()),
            StructField("retweet_with_comment_timestamp", LongType()),
            StructField("like_timestamp", LongType())       
        ])
        
        raw = spark.read.csv(path=datapath, sep="\x01", header=False, schema=SCHEMA)
        
        df = raw.select(["tweet_id","engaging_user_id",
                                            "retweet_timestamp","reply_timestamp",
                                            "retweet_with_comment_timestamp","like_timestamp","text_tokens"])

        for engagement in ENGAGEMENTS:
                    df = df.withColumn(engagement, when(df[engagement + "_timestamp"].isNotNull(), 1).cast(ByteType()))\
                        .drop(engagement + "_timestamp")

        df = df.fillna(0, subset=ENGAGEMENTS)

        #stringIndexer = StringIndexer(inputCol="engaging_user_id", outputCol="engaging_user_id_idx")
        regexTokenizer = RegexTokenizer(inputCol="text_tokens", outputCol="terms", pattern="\t")
        cv = CountVectorizer(inputCol="terms", outputCol="vector")
        idf = IDF(inputCol="vector", outputCol="features")
        normalizer=Normalizer(inputCol="features",outputCol="normed_features")
        pipeline = Pipeline(stages=[regexTokenizer, cv,idf,normalizer])

        model = pipeline.fit(df)
        data = model.transform(df)
        
        data = data.select("normed_features","tweet_id","engaging_user_id","like","reply","retweet","retweet_with_comment")

        return data
    
    '''
    call this method outside to set the two private variables for training and test
    '''
    def set_train_test_val(self,trainpath,testpath,valpath):
        self.data = self.get_processed_data(trainpath)
        self.test = self.get_processed_data(testpath)
        self.val = self.get_processed_data(valpath)
        
    
    # 
    # 
    '''
    https://en.wikipedia.org/wiki/Rocchio_algorithm
    This method is only called in the class method get_predictions
    Returns a single feature vector representing the users preferences for a single engagement
    Gets eventually called 4 times, for each engagement once.
    '''
    def generate_user_profile(self,engagement):
        # transform PySpark Sparse Vectors into scipy csr matrices and generate a paired RDD in form: key:(user_id,0|1 for engagement) value(csr_matrices)
        tf = self.data.rdd.map(lambda row: ((row.engaging_user_id,row[engagement]),sps.csr_matrix(row.normed_features)))
        
        # we saved the key as (user_id,0|1), so we perform a mapreduce on it to generate a linear combination of the positive/negative interaction
        # the result from this operation is a RDD, in which each user is 2-times in the set, one row for the linear combination of positives and one row for the negatives
        
        # How to aggreagate and average 
        # https://stackoverflow.com/questions/29930110/calculating-the-averages-for-each-key-in-a-pairwise-k-v-rdd-in-spark-with-pyth
        aTuple = (0,0)
        aggregated = tf.aggregateByKey(aTuple,
                                lambda a,b: (a[0] + b,    a[1] + 1),
                                lambda a,b: (a[0] + b[0], a[1] + b[1]))
        user_vectors = aggregated.mapValues(lambda v: (v[1],v[0]/v[1]))
        
        # Here we perform the substraction Positive - Negative feedback. The if else lambda is to multiply the negative feedback vector with -1
        # with negative values in the negative feedback we can use the associative + operator to perform a substraction
        
        #tup[0][0] is user_id
        #tup[0][1] is the engagement
        #tup[1][0] is the count
        #tup[1][1] is the feature vectors
        
        user_profiles = user_vectors.map(lambda tup:(tup[0][0],tup[1][1].multiply(-1)) if tup[0][1] == 0 else (tup[0][0],tup[1][1])).reduceByKey(lambda accumulator,value: accumulator + value)
        
        
        # First we transform the paired RDD into an Row RDD, than to a DataFrame, which is the final output of this method
        # the dataframe holds distinct user values, each user holds a single feature vector representing the preferences for a single engagement
        
        user_profiles_df = user_profiles.map(lambda tup: Row(user_id=tup[0],user_profile = SparseVector(tup[1].shape[1],tup[1].indices,tup[1].data))).toDF()
        return user_profiles_df
    
    '''
    Get an RDD in the form user_id,tweet_id,probability for an engagement
    '''
    def get_predictions(self,engagement):       
        user_profile = self.generate_user_profile(engagement)
        joined = self.test.join(user_profile,self.test.engaging_user_id==user_profile.user_id)
        # We normalize the cosine similarity to map into a range of 0 and 1, (sim + 1) / 2
        predictions = joined.rdd.map(lambda row: Row(tweet_id=row.tweet_id,user_id=row.engaging_user_id,probability = ((1.0 - cosine(row.user_profile,row.normed_features).item()) + 1) / 2))
        
        return predictions.toPandas()
            

In [6]:
cb = content_based(spark,sc)

datapath = "///tmp/traintweet_1000.tsv"
ENGAGEMENTS = ["like", "reply", "retweet", "retweet_with_comment"]

train = "///tmp/supersecret_train40k.tsv"
testfile = "///tmp/supersecret_test5k.tsv"
valfile = "///tmp/supersecret_ensembletrain5k.tsv"

cb.set_train_test_val(train,testfile,valfile)

In [None]:
like = cb.get_predictions("like")
like.head()

In [63]:
like = cb.get_predictions("like")
reply = cb.get_predictions("reply")
retweet = cb.get_predictions("retweet")
retweet_with_comment = cb.get_predictions("retweet_with_comment")

In [75]:
n = 10
like.take(n)

[]

In [64]:
like.count()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: 
Aborting TaskSet 259.0 because task 2 (partition 2)
cannot run anywhere due to node and executor blacklist.
Most recent failure:
Lost task 2.1 in stage 259.0 (TID 579, c106.local, executor 19): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in main
    process()
  File "/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 367, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/anaconda3/lib/python3.6/site-packages/pyspark/rdd.py", line 2499, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/anaconda3/lib/python3.6/site-packages/pyspark/rdd.py", line 2499, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/anaconda3/lib/python3.6/site-packages/pyspark/rdd.py", line 2499, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/anaconda3/lib/python3.6/site-packages/pyspark/rdd.py", line 352, in func
    return f(iterator)
  File "/home/anaconda3/lib/python3.6/site-packages/pyspark/rdd.py", line 1055, in <lambda>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/home/anaconda3/lib/python3.6/site-packages/pyspark/rdd.py", line 1055, in <genexpr>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-59-e4abc41acf6f>", line 135, in <lambda>
  File "/home/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py", line 744, in cosine
    return correlation(u, v, w=w, centered=False)
  File "/home/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py", line 695, in correlation
    uv = np.average(u * v, weights=w)
TypeError: unsupported operand type(s) for *: 'NoneType' and 'float'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2121)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2121)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$11.apply(Executor.scala:407)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1408)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:413)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)


Blacklisting behavior can be configured via spark.blacklist.*.

	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1890)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2111)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2049)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:740)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2081)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2102)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2121)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2146)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [51]:
reply.take(n)

[Row(cosine_similarity=0.1412570523122424, tweet_id='8CE7533028B0EAEF85EBC4AE37868F91', user_id='00007F67532643E55C33DA326FE0FB5C'),
 Row(cosine_similarity=0.14125705231224228, tweet_id='B3237445D357A16D4637EE178FC7E68C', user_id='00007F67532643E55C33DA326FE0FB5C'),
 Row(cosine_similarity=0.0, tweet_id='2705449E2CB2E919540B3CCA84286ABE', user_id='00027B86A4118D3249F9352272D53FC1'),
 Row(cosine_similarity=0.1444346754025927, tweet_id='CD1906D4F3BEC63ECC9547EB1A5523FA', user_id='0002A3D7A02C1D269FD136500342C274'),
 Row(cosine_similarity=0.1444346754025927, tweet_id='F10E40C008613460549839B51703BCC0', user_id='0002A3D7A02C1D269FD136500342C274'),
 Row(cosine_similarity=0.20699139018985058, tweet_id='1386298FAE581E8291C62725AF1C0288', user_id='0002B7B662BC907411052C8884FB347C'),
 Row(cosine_similarity=0.20247374243819183, tweet_id='37C1FFDF075AD37F81AF15119B446DF9', user_id='0002B7B662BC907411052C8884FB347C'),
 Row(cosine_similarity=0.19322785637243367, tweet_id='3D6D0A8601D965D38E54BD4B742

In [52]:
retweet.take(n)

[Row(cosine_similarity=0.1412570523122424, tweet_id='8CE7533028B0EAEF85EBC4AE37868F91', user_id='00007F67532643E55C33DA326FE0FB5C'),
 Row(cosine_similarity=0.14125705231224228, tweet_id='B3237445D357A16D4637EE178FC7E68C', user_id='00007F67532643E55C33DA326FE0FB5C'),
 Row(cosine_similarity=0.0, tweet_id='2705449E2CB2E919540B3CCA84286ABE', user_id='00027B86A4118D3249F9352272D53FC1'),
 Row(cosine_similarity=0.14847005825400816, tweet_id='CD1906D4F3BEC63ECC9547EB1A5523FA', user_id='0002A3D7A02C1D269FD136500342C274'),
 Row(cosine_similarity=0.8515299417459918, tweet_id='F10E40C008613460549839B51703BCC0', user_id='0002A3D7A02C1D269FD136500342C274'),
 Row(cosine_similarity=0.20699139018985058, tweet_id='1386298FAE581E8291C62725AF1C0288', user_id='0002B7B662BC907411052C8884FB347C'),
 Row(cosine_similarity=0.20247374243819183, tweet_id='37C1FFDF075AD37F81AF15119B446DF9', user_id='0002B7B662BC907411052C8884FB347C'),
 Row(cosine_similarity=0.19322785637243367, tweet_id='3D6D0A8601D965D38E54BD4B74

In [53]:
retweet_with_comment.take(n)

[Row(cosine_similarity=0.1412570523122424, tweet_id='8CE7533028B0EAEF85EBC4AE37868F91', user_id='00007F67532643E55C33DA326FE0FB5C'),
 Row(cosine_similarity=0.14125705231224228, tweet_id='B3237445D357A16D4637EE178FC7E68C', user_id='00007F67532643E55C33DA326FE0FB5C'),
 Row(cosine_similarity=0.0, tweet_id='2705449E2CB2E919540B3CCA84286ABE', user_id='00027B86A4118D3249F9352272D53FC1'),
 Row(cosine_similarity=0.1444346754025927, tweet_id='CD1906D4F3BEC63ECC9547EB1A5523FA', user_id='0002A3D7A02C1D269FD136500342C274'),
 Row(cosine_similarity=0.1444346754025927, tweet_id='F10E40C008613460549839B51703BCC0', user_id='0002A3D7A02C1D269FD136500342C274'),
 Row(cosine_similarity=0.20699139018985058, tweet_id='1386298FAE581E8291C62725AF1C0288', user_id='0002B7B662BC907411052C8884FB347C'),
 Row(cosine_similarity=0.20247374243819183, tweet_id='37C1FFDF075AD37F81AF15119B446DF9', user_id='0002B7B662BC907411052C8884FB347C'),
 Row(cosine_similarity=0.19322785637243367, tweet_id='3D6D0A8601D965D38E54BD4B742

In [None]:
def toCSVLine(data):
    return ','.join(str(d)for d in data)

lines = like.map(toCSVLine)
lines.saveAsTextFile("like.csv")