In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import importlib

# Building Spark Context
# conf = SparkConf().setAll([('spark.executor.memory', '32g'), ('spark.executor.instances','8'),('spark.executor.cores', '12'), ('spark.driver.memory','64g'), ('spark.driver.memoryOverhead', '64g')])
conf = SparkConf()
spark = SparkSession.builder.appName("nncf_train").config(conf=conf).getOrCreate()
sc = spark.sparkContext

# GET TRAINING DATA

In [2]:
import twitter_preproc

base = "///tmp/"
one_k = "traintweet_1000.tsv"
ensemble_train = 'supersecret_ensembletrain5k_bootstrap.tsv'
ensemble_test = 'supersecret_test5k_bootstrap.tsv'
choice = ensemble_train

preproc = twitter_preproc.twitter_preproc(spark, sc, base+choice, MF=True)
traindata = preproc.getDF()

# NN PREPROCESSING

In [3]:
import nnpreprocessor
importlib.reload(nnpreprocessor)

nnp = nnpreprocessor.NNPreprocessor()
engagement = 'retweet_comment'
tweets, users, target = nnp.nn_preprocess(traindata)

# TRAIN

In [4]:
from NNCFNet import Net
import torch
import torch.nn as nn
import torch.optim as optim
import sys, traceback

# Initalize Hyperparameters
k = 32
n_epochs = 2
batch_size = 256

# Initialize Neural Network
net = Net(users.shape[1], tweets.shape[1], k)
optimizer = optim.SGD(net.parameters(), lr=0.001)
criterion = nn.BCELoss()
output = net(users, tweets)

# Start training
print("\nStart Training")
for epoch in range(n_epochs):
    print("epoch ", epoch+1)

    permutation = torch.randperm(users.size()[0])

    for i in range(0,users.size()[0], batch_size):
        optimizer.zero_grad()

        indices = permutation[i:i+batch_size]
        batch_x_user = users[indices]
        batch_x_tweet = tweets[indices]
        batch_y = target[indices]

        outputs = net.forward(batch_x_user, batch_x_tweet)
        loss = criterion(outputs,batch_y)
        loss.backward()
        optimizer.step()

        print(loss)


Start Training
epoch  1
tensor(0.6872, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6823, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6917, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6833, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6863, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6868, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6858, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6878, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6868, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6892, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6833, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6838, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6927, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6848, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6942, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6912, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6793, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6833, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6927, grad_f

# RE-CREATE INPUT FORMAT & ORDER

In [22]:
from pyspark.sql.functions import monotonically_increasing_id
import numpy as np

# get predictions
net.eval()
prediction = net(users, tweets)
p_vec = prediction.detach().numpy().flatten()
scaled = (p_vec - np.min(p_vec))/np.ptp(p_vec)
probabilities = [float(x) for x in scaled]

# establish original order
order_df = traindata.withColumn("original_order", monotonically_increasing_id())
order_df = order_df.select("engaging_user_id", "tweet_id", 'original_order')
sorting_tweets = nnp.get_id_indices(order_df, id_column='tweet_id')

# join labels
result = order_df.join(sorting_tweets, 'tweet_id').sort('original_order').rdd.map(lambda x: (x['engaging_user_id'], x['tweet_id'], probabilities[x['tweet_id_index']], x['original_order']))

# ensure order
result_df = spark.createDataFrame(result).toDF('engaging_user_id', 'tweet_id', 'target', 'original_order')
clean = result_df.dropDuplicates(['engaging_user_id', 'tweet_id']).sort('original_order').select('engaging_user_id', 'tweet_id', 'target').rdd

In [6]:
output_PATH = engagement + '.' + choice

def toCSVLine(data):
    return ','.join(str(d) for d in data)

lines = clean.map(toCSVLine)
lines.saveAsTextFile(output_PATH)