### Binary Classifier on a single label using text_tokens as content

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import twitter_preproc

# Executor sizing for this session.
# SparkConf only acts on real Spark property names. The spark-submit flag names
# used previously ("num-executors", "total-executor-cores", "executor-memory")
# are not properties and had no effect, so they are replaced by the documented
# property each flag maps to.
conf = SparkConf().setAll([
    ("spark.executor.instances", 4),     # equivalent of --num-executors
    # equivalent of --total-executor-cores; only honoured by standalone/Mesos masters.
    # NOTE(review): on YARN use spark.executor.cores instead - confirm the cluster manager.
    ("spark.cores.max", 16),
    ("spark.executor.memory", "8g"),     # equivalent of --executor-memory
    # NOTE(review): 64g overhead on top of an 8g executor heap looks unusually large - confirm.
    ("spark.yarn.executor.memoryOverhead", "64g")])
sc = SparkContext(conf=conf)
# getOrCreate() picks up the SparkContext created on the previous line.
spark = SparkSession.builder.getOrCreate()

In [2]:
# Input TSV handed to twitter_preproc (presumably a 1000-row training sample, judging by the file name).
datapath = "///tmp/traintweet_1000.tsv"
# Engagement types; each has a matching "{name}_timestamp" column that is
# converted into a 0/1 label column of the same name further down.
ENGAGEMENTS = ["like", "reply", "retweet", "retweet_with_comment"]

In [3]:
import importlib


# Re-import twitter_preproc so edits to that module are picked up without restarting the kernel.
importlib.reload(twitter_preproc)
# NOTE(review): the meaning of MF=True is defined inside twitter_preproc and is not visible here - confirm.
preproc = twitter_preproc.twitter_preproc(spark, sc, datapath, MF=True)
# DataFrame produced by the preprocessing helper; the cells below expect it to
# contain text_tokens, tweet_id, engaging_user_id and the "{engagement}_timestamp" columns.
df = preproc.getDF()

In [4]:
from pyspark.sql.types import *
from pyspark.sql.functions import when

# Replace every "{engagement}_timestamp" column by a byte flag named after the
# engagement: 1 where a timestamp is present, NULL otherwise (when() has no
# otherwise() branch, so the missing case is filled in a later step).
for engagement in ENGAGEMENTS:
    timestamp_col = engagement + "_timestamp"
    engaged_flag = when(df[timestamp_col].isNotNull(), 1).cast(ByteType())
    df = df.withColumn(engagement, engaged_flag).drop(timestamp_col)

In [5]:
# The label columns were built with when() without otherwise(), so rows without
# an engagement hold NULL; turn those into an explicit 0.
df = df.fillna(0, subset=ENGAGEMENTS)

In [6]:
# Keep only the token text, the tweet/user identifiers and the engagement labels.
df = df.select("text_tokens", "tweet_id", "engaging_user_id", *ENGAGEMENTS)

### Handle BERT tokens like words (TODO: maybe find deeper meaning in the tokens, e.g. via embeddings)

In [7]:
from pyspark.ml.feature import RegexTokenizer,NGram,CountVectorizer,IDF,StringIndexer,Normalizer
from pyspark.ml import Pipeline

# Feature pipeline: index the user ids, split the tab-separated token string
# into terms, then count -> IDF-weight -> normalise the resulting term vectors.
feature_stages = [
    StringIndexer(inputCol="engaging_user_id", outputCol="engaging_user_id_idx"),
    RegexTokenizer(inputCol="text_tokens", outputCol="terms", pattern="\t"),
    CountVectorizer(inputCol="terms", outputCol="vector"),
    IDF(inputCol="vector", outputCol="features"),
    Normalizer(inputCol="features", outputCol="normed_features"),
]
pipeline = Pipeline(stages=feature_stages)

# Fitting learns the user index, the vocabulary and the IDF weights from df.
model = pipeline.fit(df)


In [8]:
data = model.transform(df)


In [9]:
data.printSchema()

root
 |-- text_tokens: string (nullable = true)
 |-- tweet_id: string (nullable = true)
 |-- engaging_user_id: string (nullable = true)
 |-- like: byte (nullable = true)
 |-- reply: byte (nullable = true)
 |-- retweet: byte (nullable = true)
 |-- retweet_with_comment: byte (nullable = true)
 |-- engaging_user_id_idx: double (nullable = false)
 |-- terms: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- vector: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- normed_features: vector (nullable = true)



# Relevance Feedback with Rocchio's method
7, S.36

For now with a single user and a single selected tweet.

In [10]:
import scipy.sparse as sps

# Rocchio relevance feedback: a user's profile is the weighted difference between
# the centroid of the tweets they liked and the centroid of the tweets they did not.
def _feedback_centroid(feedback_df):
    """Return the mean `normed_features` vector of `feedback_df`, or None if it has no rows.

    Each row's vector is converted to a scipy sparse matrix and the matrices are
    summed on the cluster; only the final sum is brought back to the driver.
    """
    vectors = feedback_df.select("normed_features").rdd
    count = vectors.count()
    if count == 0:
        return None
    total = (
        vectors
        .map(lambda row: sps.csc_matrix(row.normed_features))
        .reduce(lambda acc, vec: acc + vec)
    )
    return (1 / count) * total


def build_user_profile(df, user_id):
    """Build the Rocchio profile vector of one user from their "like" feedback.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns `engaging_user_id_idx`, `like` (1 = liked,
        0 = not liked) and `normed_features`.
    user_id : float
        Value of `engaging_user_id_idx` identifying the user.

    Returns
    -------
    scipy sparse matrix
        `positive_weight * mean(liked) - negative_weight * mean(not liked)`.
        If the user has only one kind of feedback, only that term is used
        (negated for the not-liked centroid).

    Raises
    ------
    ValueError
        If the user has no rows in `df`, so no profile can be computed.
    """
    positive_weight = 1
    negative_weight = 1

    user_vectors = df.where(df["engaging_user_id_idx"] == user_id)
    pos_centroid = _feedback_centroid(user_vectors.where(user_vectors["like"] == 1))
    neg_centroid = _feedback_centroid(user_vectors.where(user_vectors["like"] == 0))

    # The previous `pos_count > 0 & neg_count > 0` was parsed as the chained
    # comparison `pos_count > (0 & neg_count) > 0`, which is always False, so the
    # negative feedback was never subtracted. The cases are now spelled out.
    if pos_centroid is None and neg_centroid is None:
        raise ValueError("no feedback found for user %r; cannot build a profile" % (user_id,))
    if neg_centroid is None:
        return positive_weight * pos_centroid
    if pos_centroid is None:
        return -negative_weight * neg_centroid
    return positive_weight * pos_centroid - negative_weight * neg_centroid

In [11]:
u = build_user_profile(data,1.0)

In [12]:
# Pair every tweet id with its normalised feature vector as a scipy sparse matrix.
tweets = data.select("tweet_id", "normed_features").rdd
tweet_features = tweets.map(
    lambda tweet: (tweet.tweet_id, sps.csc_matrix(tweet.normed_features))
)

In [35]:
from scipy.spatial.distance import cosine

# scipy's cosine() is a *distance* (1 - cosine similarity): 0 = same direction, 1 = orthogonal.
# It expects 1-D vectors, so the 1xN matrices are flattened first. The user profile
# is densified once here instead of once per tweet inside the lambda.
user_profile_dense = u.toarray().ravel()
t = tweet_features.map(
    lambda pair: (pair[0], cosine(pair[1].toarray().ravel(), user_profile_dense))
)

In [36]:
t.take(5)

[('E7D6C5094767223F6F8789A87A1937AB', 1.0),
 ('129F4A868712BA2B98D31AF98C3066E4', 0.9931761850625865),
 ('04C6C2175852CDBBC23B2446C7E7C22D', 0.9840771707836296),
 ('168157826315514C120494D4DF8E6216', 0.9871904330427985),
 ('B3E3673782A69D9D8A45D3B222F0B073', 0.9899149407959815)]

In [68]:
def recommend(df, tweet_id, user_id):
    """Return the cosine distance between a user's profile and a single tweet.

    Parameters
    ----------
    df : DataFrame
        Transformed data with `tweet_id`, `normed_features` and the columns
        required by `build_user_profile`.
    tweet_id : str
        Id of the tweet to score.
    user_id : float
        Value of `engaging_user_id_idx` identifying the user.

    Returns
    -------
    float
        Cosine *distance* (1 - cosine similarity) between the user profile and
        the tweet vector; smaller means the tweet is closer to the profile.

    Raises
    ------
    ValueError
        If `tweet_id` does not occur in `df`.
    """
    # Look the tweet up first so an unknown id fails before the profile is computed.
    tweet_row = df.where(df["tweet_id"] == tweet_id).select("normed_features").first()
    if tweet_row is None:
        raise ValueError("tweet_id %r not found" % (tweet_id,))

    # cosine() expects 1-D vectors; the sparse matrices are 1xN, so flatten them.
    profile_vec = build_user_profile(df, user_id).toarray().ravel()
    tweet_vec = sps.csc_matrix(tweet_row[0]).toarray().ravel()

    return cosine(profile_vec, tweet_vec)

In [71]:
recommend(data,"168157826315514C120494D4DF8E6216",1.0)

0.9871904330427985

# TODO: Convert cosine distance to probability somehow?

In [72]:
sc.stop()

# Content Based approach

Our approach was to treat the BERT tokens like words and generate a tf-idf feature vector. Every tweet is represented as a sparse feature vector; the user profile is generated from every tweet the user has engaged with.

We tried to use Rocchio's method to generate relevance feedback for a user. This method was not feasible due to the massive number of features. For Rocchio's method, previously rated items are split into two classes: positive feedback and negative feedback. To compute the user profile, one has to aggregate the feature vectors of the positive class and those of the negative class. This step was critical for performance, because the aggregation of sparse vectors with ~32,000 features did not finish in a reasonable time.

## Vector aggregation in pyspark

It is necessary in pyspark to transform a SparseVector into an intermediate format, in this case an array, to perform aggregate functions. So for the linear combination of our user feedback, we had to aggregate arrays instead of SparseVectors, which also contributes to the poor performance.
