### Binary Classifier on a single label using text_tokens as content

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import twitter_preproc

# SparkConf only honours real Spark property names. The previous keys
# ("num-executors", "total-executor-cores", "executor-memory") are spark-submit
# command-line flags and are silently ignored when set on a SparkConf.
conf = SparkConf().setAll([
    ("spark.executor.instances", "4"),   # equivalent of --num-executors
    ("spark.cores.max", "16"),           # equivalent of --total-executor-cores (standalone/Mesos)
    ("spark.executor.cores", "4"),       # 16 total cores / 4 executors, for YARN where cores.max is unused
    ("spark.executor.memory", "8g"),     # equivalent of --executor-memory
    # NOTE(review): 64g of overhead on an 8g executor looks unusually large, and
    # newer Spark releases name this key spark.executor.memoryOverhead - confirm
    # against the cluster before relying on it.
    ("spark.yarn.executor.memoryOverhead", "64g"),
])
# getOrCreate keeps this cell re-runnable: constructing a second SparkContext
# in the same kernel raises an error.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [2]:
# Location of the training sample that is handed to the preprocessing module.
datapath = "///tmp/traintweet_10k.tsv"

# Engagement types; each one has a "<name>_timestamp" column in the raw data
# that is later turned into a 0/1 label column called "<name>".
ENGAGEMENTS = [
    "like",
    "reply",
    "retweet",
    "retweet_with_comment",
]

Load DF and change Timestamp/None to 1/0 in target column

In [18]:
import importlib
from pyspark.sql.functions import when

# Re-import the preprocessing module so local edits to it are picked up
# without restarting the kernel.
importlib.reload(twitter_preproc)

# Build the preprocessor for the configured data path and fetch its DataFrame.
preproc = twitter_preproc.twitter_preproc(
    spark,
    sc,
    datapath,
    MF=True,
)
df = preproc.getDF()

In [19]:
from pyspark.sql.types import *

# Replace every "<engagement>_timestamp" column with a byte column named after
# the engagement: 1 where a timestamp is present, null otherwise (there is no
# otherwise() branch, so missing timestamps stay null at this point).
for engagement in ENGAGEMENTS:
    timestamp_col = engagement + "_timestamp"
    engaged_flag = when(df[timestamp_col].isNotNull(), 1).cast(ByteType())
    df = df.withColumn(engagement, engaged_flag).drop(timestamp_col)

In [20]:
# Turn the remaining nulls in the engagement columns into 0, so every label is 0 or 1.
df = df.fillna(0, subset=ENGAGEMENTS)

In [22]:
# Keep only the token text, the two identifiers and the four label columns.
df = df.select(
    "text_tokens",
    "tweet_id",
    "engaging_user_id",
    "like",
    "reply",
    "retweet",
    "retweet_with_comment",
)

### Treat BERT tokens like words (TODO: explore the deeper meaning of the tokens, e.g. embeddings)

In [28]:
from pyspark.ml.feature import RegexTokenizer,NGram,CountVectorizer,IDF,StringIndexer,Normalizer
from pyspark.ml import Pipeline

# Map each engaging user id string to a numeric index.
stringIndexer = StringIndexer(
    inputCol="engaging_user_id",
    outputCol="engaging_user_id_idx",
)

# Split the tab-separated token string into a list of terms.
regexTokenizer = RegexTokenizer(
    inputCol="text_tokens",
    outputCol="terms",
    pattern="\t",
)

# Term counts -> IDF weighting -> normalised vector.
cv = CountVectorizer(inputCol="terms", outputCol="vector")
idf = IDF(inputCol="vector", outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="normed_features")

pipeline = Pipeline(
    stages=[
        stringIndexer,
        regexTokenizer,
        cv,
        idf,
        normalizer,
    ]
)

# Fit all stages on the full DataFrame.
model = pipeline.fit(df)


In [29]:
# Apply the fitted pipeline to the same DataFrame it was fitted on; this adds the
# index, terms, vector, features and normed_features columns.
data = model.transform(df)


In [30]:
# Inspect the columns produced by the pipeline.
data.printSchema()

root
 |-- text_tokens: string (nullable = true)
 |-- tweet_id: string (nullable = true)
 |-- engaging_user_id: string (nullable = true)
 |-- like: byte (nullable = true)
 |-- reply: byte (nullable = true)
 |-- retweet: byte (nullable = true)
 |-- retweet_with_comment: byte (nullable = true)
 |-- engaging_user_id_idx: double (nullable = false)
 |-- terms: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- vector: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- normed_features: vector (nullable = true)



# Relevance Feedback with Rocchio's method
7, S.36

For now, this uses a single user and a single selected tweet.

In [31]:
# Tweet to score and the user whose history forms the profile.
tweet_id = "E7D6C5094767223F6F8789A87A1937AB"
# Note: this is the index produced by the StringIndexer stage
# (engaging_user_id_idx), not a raw engaging_user_id string.
user_id = 1.0
features = "normed_features"
# The "*_timestamp" columns were dropped when the labels were built; the
# label column for likes is simply "like".
target = "like"

# Row(s) for the selected tweet and all rows of the selected user.
tweet_vector = data.where(data["tweet_id"] == tweet_id)
user_vectors = data.where(data["engaging_user_id_idx"] == user_id)

In [33]:
# Preview the selected user's feature vectors next to the four engagement labels.
user_vectors.select("normed_features","like","reply","retweet","retweet_with_comment").show(20)

+--------------------+----+-----+-------+--------------------+
|     normed_features|like|reply|retweet|retweet_with_comment|
+--------------------+----+-----+-------+--------------------+
|(31642,[3,4,5,7,1...|   1|    0|      0|                   0|
|(31642,[0,1,2,3,4...|   1|    0|      0|                   0|
|(31642,[0,1,3,4,5...|   1|    0|      0|                   0|
|(31642,[0,1,2,3,4...|   1|    0|      0|                   0|
|(31642,[0,1,2,3,4...|   1|    0|      0|                   0|
|(31642,[0,1,2,3,4...|   1|    0|      0|                   0|
|(31642,[0,1,2,3,4...|   1|    0|      0|                   0|
|(31642,[0,1,2,3,4...|   1|    0|      0|                   0|
|(31642,[0,3,4,5,7...|   1|    0|      0|                   0|
+--------------------+----+-----+-------+--------------------+



## Calculate linear combination of user_vectors

In [37]:
# The "like" column holds 0/1 labels with no nulls (missing values were filled
# with 0), so a null test puts every row into the positive set and none into the
# negative set. Split on the label value instead.
positive_feedback = user_vectors.where(user_vectors["like"] == 1)
negative_feedback = user_vectors.where(user_vectors["like"] == 0)

In [38]:
# Preview the feature vectors in the positive-feedback set.
positive_feedback.select("normed_features").show()

+--------------------+
|     normed_features|
+--------------------+
|(31642,[3,4,5,7,1...|
|(31642,[0,1,2,3,4...|
|(31642,[0,1,3,4,5...|
|(31642,[0,1,2,3,4...|
|(31642,[0,1,2,3,4...|
|(31642,[0,1,2,3,4...|
|(31642,[0,1,2,3,4...|
|(31642,[0,1,2,3,4...|
|(31642,[0,3,4,5,7...|
+--------------------+



In [39]:
# Preview the feature vectors in the negative-feedback set.
negative_feedback.select("normed_features").show()

+---------------+
|normed_features|
+---------------+
+---------------+



In [None]:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import functions as F
from pyspark.sql import types as T

def vec2array(v):
    """Convert a vector into a plain list of Python floats.

    Args:
        v: An object exposing ``toArray()`` (such as the ML vectors in the
            ``normed_features`` column), a plain sequence of numbers, or None.

    Returns:
        A list with one float per vector dimension, or None when ``v`` is None
        so that null cells stay null instead of failing the UDF.
    """
    if v is None:
        return None
    # toArray() densifies in one step; wrapping a sparse vector in
    # Vectors.dense() instead reads it element by element through Python
    # indexing, which is slow for wide vectors.
    values = v.toArray() if hasattr(v, "toArray") else v
    return [float(x) for x in values]

# The UDF returns 64-bit Python floats; declaring DoubleType avoids silently
# truncating them to 32-bit as FloatType would.
vec2array_udf = F.udf(vec2array, T.ArrayType(T.DoubleType()))

# Same rows as user_vectors, with the feature vector as an array column so that
# SQL aggregate functions can be applied per dimension.
user_profile = user_vectors.withColumn('normed_features', vec2array_udf('normed_features'))

# Number of dimensions, read from the first row; fail with a clear message when
# the selected user has no rows at all.
first_row = user_profile.select('normed_features').first()
if first_row is None:
    raise ValueError("no rows found for the selected user; cannot build a user profile")
n = len(first_row[0])

# d_plus is the per-dimension sum over the user's POSITIVE feedback only, so
# restrict the aggregation to liked tweets instead of summing every row.
d_plus = (
    user_profile
    .where(F.col("like") == 1)
    .agg(F.array(*[F.sum(F.col("normed_features")[i]) for i in range(n)]).alias("sum"))
)
d_plus.show()

# TODO: linear combination of positive - negative and compare the result to the "to recommend" tweet, transform result vector anyhow into a percentage

from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
#https://stackoverflow.com/questions/46758768/calculating-the-cosine-similarity-between-all-the-rows-of-a-dataframe-in-pyspark

# One IndexedRow per DataFrame row: the user index is the row index and the
# normalised feature vector (as a dense array) is the row content.
indexed_rows = (
    data.select("engaging_user_id_idx", "normed_features")
    .rdd
    .map(lambda row: IndexedRow(row.engaging_user_id_idx, row.normed_features.toArray()))
)
mat = IndexedRowMatrix(indexed_rows).toBlockMatrix()

# Multiply the matrix by its transpose to get all pairwise dot products, then
# pull the result to the driver as a local array.
dot = mat.multiply(mat.transpose())
dot.toLocalMatrix().toArray()

# Shut down the SparkContext; cells that use Spark cannot be run after this point.
sc.stop()

# Content Based approach

Our approach was to treat the BERT tokens like words and generate a TF-IDF feature vector. Every tweet is represented as a sparse feature vector, and the user profile is generated from every tweet the user has engaged with.

We tried to use Rocchio's method to generate relevance feedback for a user. This method was not feasible due to the massive number of features. For Rocchio's method, past rated items are split into two classes: positive feedback and negative feedback. To compute the user profile, one has to aggregate the feature vectors of the positive and of the negative class. This step was the performance bottleneck: the aggregation of sparse vectors with ~32,000 features did not finish in a reasonable time.

## Vector aggregation in pyspark

In PySpark it is necessary to transform a SparseVector into an intermediate format, in this case an array, before aggregate functions can be applied. So for the linear combination of our user feedback we had to aggregate arrays instead of SparseVectors, which also contributes to the poor performance.
