### Binary Classifier on a single label using text_tokens as content

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import twitter_preproc

# Executor sizing for this notebook's Spark application.
# Keys in a SparkConf must be Spark configuration properties ("spark.*").
# The spark-submit flag names used previously ("num-executors",
# "total-executor-cores", "executor-memory") are not recognised by Spark when
# set this way, so the intended sizing never took effect.
conf = SparkConf().setAll([
    ("spark.executor.instances", "4"),   # property behind --num-executors
    # NOTE(review): spark.cores.max (property behind --total-executor-cores)
    # only applies to standalone/Mesos cluster managers; on YARN the per-executor
    # core count is set with spark.executor.cores instead. Confirm the target cluster.
    ("spark.cores.max", "16"),
    ("spark.executor.memory", "8g"),     # property behind --executor-memory
    # NOTE(review): 64g of off-heap overhead next to an 8g heap looks unusually
    # large. Confirm this value is intended.
    ("spark.yarn.executor.memoryOverhead", "64g")])
# getOrCreate reuses an already-running context, so re-running this cell in a
# live kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [3]:
# Name of the column holding the user id; it is fed to the StringIndexer and
# used to derive the "<user>_idx" column name in later cells.
user = "engaging_user_id"
# Path of the TSV file that is handed to the twitter_preproc loader.
datapath = "///tmp/traintweet_10k.tsv"

Load the DataFrame and convert the target column from Timestamp/None to 1/0.

In [64]:
import importlib
from pyspark.sql.functions import when

# Reload the local twitter_preproc module so edits made to it are picked up
# without restarting the kernel.
importlib.reload(twitter_preproc)

# Build the preprocessing helper for the file at `datapath` and fetch the
# DataFrame it exposes through getDF().
preproc = twitter_preproc.twitter_preproc(
    spark,
    sc,
    datapath,
    method="CB",
)
df = preproc.getDF()

### Treat BERT tokens like words. Possible TODO: find deeper meaning in the tokens (embeddings)

In [65]:
from pyspark.ml.feature import RegexTokenizer,NGram,CountVectorizer,IDF,StringIndexer
from pyspark.ml import Pipeline

# Feature pipeline, in order:
#   1. index the user id column into "<user>_idx"
#   2. split the "text_tokens" string on tab characters into "terms"
#   3. count term occurrences into "vector"
#   4. apply IDF weighting to produce "features"
stringIndexer = StringIndexer(
    inputCol=user,
    outputCol=user + "_idx",
)
regexTokenizer = RegexTokenizer(
    inputCol="text_tokens",
    outputCol="terms",
    pattern="\t",
)
cv = CountVectorizer(
    inputCol="terms",
    outputCol="vector",
)
idf = IDF(
    inputCol="vector",
    outputCol="features",
)

stages = [stringIndexer, regexTokenizer, cv, idf]
pipeline = Pipeline(stages=stages)

# Fit every stage on the full DataFrame.
model = pipeline.fit(df)


In [66]:
# Apply the fitted pipeline to the same DataFrame it was fitted on. The result
# carries the stage output columns: "<user>_idx", "terms", "vector", "features".
transformed = model.transform(df)

In [67]:
# Show the 10 indexed users with the most rows.
# Only the indexed user column is needed for a grouped count, so the grouping
# is done directly on `transformed`. The earlier column pre-selection referenced
# a `target` variable that no cell defines, which fails on a fresh kernel.
(
    transformed
    .groupBy(user + "_idx")
    .count()
    .orderBy("count", ascending=False)
    .show(10)
)

+--------------------+-----+
|engaging_user_id_idx|count|
+--------------------+-----+
|                 0.0|   12|
|                 2.0|    9|
|                 1.0|    9|
|                 4.0|    7|
|                 3.0|    7|
|                 7.0|    6|
|                 6.0|    6|
|                 8.0|    6|
|                 5.0|    6|
|                19.0|    5|
+--------------------+-----+
only showing top 10 rows



In [68]:
# Index (as produced by the StringIndexer stage) of the user to inspect.
test_user_idx = 0.0
# Keep only this user's rows. The column name is derived from `user`, as in
# the other cells, so changing `user` does not leave a stale hard-coded
# "engaging_user_id_idx" attribute lookup behind.
user_profile = transformed.filter(transformed[user + "_idx"].isin([test_user_idx]))

In [69]:
# Display the selected user's feature vectors next to the four engagement
# timestamp columns.
(
    user_profile
    .select(
        "features",
        "like_timestamp",
        "retweet_with_comment_timestamp",
        "reply_timestamp",
        "retweet_timestamp",
    )
    .show()
)

+--------------------+--------------+------------------------------+---------------+-----------------+
|            features|like_timestamp|retweet_with_comment_timestamp|reply_timestamp|retweet_timestamp|
+--------------------+--------------+------------------------------+---------------+-----------------+
|(31642,[0,1,2,3,4...|    1581265589|                          null|           null|             null|
|(31642,[0,3,4,5,5...|    1581083103|                          null|           null|       1581083104|
|(31642,[0,3,4,5,1...|    1581058286|                          null|     1581058286|             null|
|(31642,[3,4,55,59...|    1581179252|                          null|           null|             null|
|(31642,[0,1,2,3,4...|    1581139106|                          null|           null|             null|
|(31642,[0,2,3,4,1...|    1581257052|                          null|           null|             null|
|(31642,[0,1,2,3,4...|    1581413344|                          null|     

In [None]:
# Uncomment to shut down the SparkContext once the analysis is finished.
#sc.stop()