### Binary Classifier on a single label using text_tokens as content

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import twitter_preproc

conf = SparkConf().setAll([
    ("num-executors", 4), 
    ("total-executor-cores", 16), 
    ("executor-memory", "8g"),
    ("spark.yarn.executor.memoryOverhead", "64g")])
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [2]:
datapath = "///tmp/traintweet_10k.tsv"
user = "engaging_user_id"

Load DF and change Timestamp/None to 1/0 in target column

In [3]:
import importlib
from pyspark.sql.functions import when

importlib.reload(twitter_preproc)
preproc = twitter_preproc.twitter_preproc(spark, sc, datapath, method="CB")
df = preproc.getDF()

### Handle BERT tokens like words, maybe TODO: find deeper meaning in the tokens(embeddings)

In [4]:
from pyspark.ml.feature import RegexTokenizer,NGram,CountVectorizer,IDF,StringIndexer,Normalizer
from pyspark.ml import Pipeline

stringIndexer = StringIndexer(inputCol=user, outputCol=user+"_idx")
regexTokenizer = RegexTokenizer(inputCol="text_tokens", outputCol="terms", pattern="\t")
cv = CountVectorizer(inputCol="terms", outputCol="vector")
idf = IDF(inputCol="vector", outputCol="features")
normalizer=Normalizer(inputCol="features",outputCol="normed_features")
pipeline = Pipeline(stages=[stringIndexer,regexTokenizer, cv,idf,normalizer])

model = pipeline.fit(df)


In [5]:
data = model.transform(df)
data = data.select("normed_features","engaging_user_id_idx","tweet_id","like_timestamp")

# Relevance Feedback with Rocchios method 
7, S.36

for now with a single user and selected tweeet yeet

In [12]:
tweet_id = "E7D6C5094767223F6F8789A87A1937AB"
user_id = 1.0
features = "normed_features"
target = "like_timestamp"

tweet_vector = data.where(data["tweet_id"] == tweet_id)
user_vectors = data.where(data["engaging_user_id_idx"] == user_id)

In [18]:
user_vectors.select("normed_features","like_timestamp").show(20)

+--------------------+--------------+
|     normed_features|like_timestamp|
+--------------------+--------------+
|(31642,[3,4,5,7,1...|    1581004554|
|(31642,[0,1,2,3,4...|    1581109718|
|(31642,[0,1,3,4,5...|    1581437591|
|(31642,[0,1,2,3,4...|    1581182147|
|(31642,[0,1,2,3,4...|    1581190598|
|(31642,[0,1,2,3,4...|    1580960533|
|(31642,[0,1,2,3,4...|    1581530653|
|(31642,[0,1,2,3,4...|    1581367528|
|(31642,[0,3,4,5,7...|    1581182147|
+--------------------+--------------+



## Calculate linear combination of user_vectors

In [14]:
positive_feedback = user_vectors.where(user_vectors[target].isNotNull())
negative_feedback = user_vectors.where(user_vectors[target].isNull())

In [15]:
positive_feedback.select("normed_features").show(10)

+--------------------+
|     normed_features|
+--------------------+
|(31642,[3,4,5,7,1...|
|(31642,[0,1,2,3,4...|
|(31642,[0,1,3,4,5...|
|(31642,[0,1,2,3,4...|
|(31642,[0,1,2,3,4...|
|(31642,[0,1,2,3,4...|
|(31642,[0,1,2,3,4...|
|(31642,[0,1,2,3,4...|
|(31642,[0,3,4,5,7...|
+--------------------+



In [16]:
negative_feedback.select("normed_features").show(10)

+---------------+
|normed_features|
+---------------+
+---------------+



# TODO: linear combination of positive - negative and compare the result to the "to recommend" tweet, transform result vector anyhow into a percentage

from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
#https://stackoverflow.com/questions/46758768/calculating-the-cosine-similarity-between-all-the-rows-of-a-dataframe-in-pyspark

mat = IndexedRowMatrix(
    data.select("engaging_user_id_idx", "normed_features")\
        .rdd.map(lambda row: IndexedRow(row.engaging_user_id_idx, row.normed_features.toArray()))).toBlockMatrix()
dot = mat.multiply(mat.transpose())
dot.toLocalMatrix().toArray()

In [4]:
sc.stop()

NameError: name 'sc' is not defined