In [1]:
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import vstack
import twitter_preproc
import importlib
importlib.reload(twitter_preproc)
from twitter_preproc import *
from operator import attrgetter
import pandas as pd
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from twitter_preproc import twitter_preproc

from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

import torch.nn as nn
import torch
import torch.nn.functional as F
import math
import torch.optim as optim

import sys, traceback

import NNPreprocessor as NNP
importlib.reload(NNP)
import NNCFNet
importlib.reload(NNCFNet)

from sklearn.metrics import roc_auc_score

import json

In [2]:
# Create (or reuse) the Spark session for this evaluation notebook
conf = SparkConf()
session_builder = SparkSession.builder.appName("nncf_eval").config(conf=conf)
spark = session_builder.getOrCreate()
# Handle to the session's underlying SparkContext
sc = spark.sparkContext

# LOAD DATA 

In [3]:
# Saved model checkpoint and input files.
model_PATH = "../misc/NNCF_model_save_5k.pth"
# Original note on `train`: irrelevant; only needed for the preproc class to shut up.
# NOTE(review): `train` is never passed anywhere below -- `val` is handed to
# twitter_preproc as the first file argument instead. Confirm which was intended.
train = '///tmp/traintweet_1000.tsv'
val = '///tmp/supersecret_test5k_bootstrap.tsv'
test = '///tmp/supersecret_ensembletrain5k_bootstrap.tsv'

# Basic preprocessing; the rest of the notebook works on the frame returned by getTestDF()
# (presumably built from `testFile` -- verify against twitter_preproc).
preproc = twitter_preproc(
    spark,
    sc,
    val,
    testFile=test,
    MF=True,
)
test_df = preproc.getTestDF()

In [4]:
# Neural-network-specific preprocessing
PAD_SIZE = 5000  # last argument handed to nnpre.pad

nnpre = NNP.NNPreprocessor()

# nn_preprocess hands back (users, tweets, target) in that order ...
unpadded_users, unpadded_tweets, unpadded_target = nnpre.nn_preprocess(test_df)

# ... whereas pad is called with, and unpacked as, (tweets, users, target).
tweets, users, target = nnpre.pad(
    unpadded_tweets,
    unpadded_users,
    unpadded_target,
    PAD_SIZE,
)

# Display the resulting tensor shapes as the cell output
(tweets.shape, users.shape, target.shape)

(torch.Size([5012, 5000]), torch.Size([5012, 5000]), torch.Size([5012, 1]))

In [5]:
# Turn the target tensor into a flat 1-D NumPy array of Python-float dtype
flat_target = target.detach().numpy().ravel().astype(float)

# LOAD MODEL

In [6]:
# Rebuild the network: input widths come from the second dimension of the padded
# user/tweet tensors; 32 is passed as the third constructor argument.
model = NNCFNet.Net(users.shape[1], tweets.shape[1], 32)

# map_location="cpu" lets a checkpoint that was saved from a GPU run deserialize on a
# CPU-only machine. The notebook converts tensors with .numpy() later on, which needs
# CPU tensors anyway, so nothing downstream changes.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
state_dict = torch.load(model_PATH, map_location="cpu")
model.load_state_dict(state_dict)

# Switch to evaluation mode; as the last expression this also displays the module summary.
model.eval()

Net(
  (dense1): Linear(in_features=5000, out_features=32, bias=True)
  (dense2): Linear(in_features=5000, out_features=32, bias=True)
  (fc1): Linear(in_features=64, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=1, bias=True)
)

# PREDICT

In [7]:
def predict(user, tweets, round_to_class=False):
    """Run the notebook-level ``model`` on the given inputs and return hard labels.

    Args:
        user: user-side input passed as the first argument to ``model``.
        tweets: tweet-side input passed as the second argument to ``model``.
        round_to_class: if True, round each model output to the nearest integer;
            otherwise label an output 1.0 when it is strictly above the mean of
            all outputs in this call, and 0.0 if not.

    Returns:
        A flat NumPy array with one label per model output.
    """
    # Use the `user` argument: the previous body read the global `users` instead,
    # silently ignoring whatever the caller passed in.
    # no_grad: inference only, so no autograd graph needs to be recorded.
    with torch.no_grad():
        outputs = model(user, tweets)

    # Flatten the model output to one score per row
    probabilities = outputs.detach().numpy().flatten()

    if round_to_class:
        return probabilities.round()

    # Relative threshold: compare each score with the mean score of this batch
    threshold = probabilities > probabilities.mean()
    return threshold.astype(float)
    
# Label the padded inputs using predict's default mode (round_to_class=False)
prediction = predict(user=users, tweets=tweets)

# EXPORT

In [8]:
# Tag each row with an increasing id so the incoming row order can be restored later,
# and keep only the id columns plus that tag.
# NOTE(review): monotonically_increasing_id has no explicit import in this notebook;
# presumably it arrives through `from twitter_preproc import *` -- consider importing
# it from pyspark.sql.functions directly.
order_df = (
    test_df
    .withColumn("original_order", monotonically_increasing_id())
    .select("engaging_user_id", "tweet_id", "original_order")
)

# Index lookups as computed by the NN preprocessor
sorting_tweets = nnpre.get_id_indices(order_df, id_column="tweet_id")
# NOTE(review): sorting_users is not referenced again in this notebook -- confirm whether
# it is still needed.
sorting_users = nnpre.get_id_indices(order_df, id_column="engaging_user_id")


def _attach_prediction(row):
    # assumes `tweet_id_index` is a valid position in `prediction` -- TODO confirm
    return (row["engaging_user_id"], row["tweet_id"], prediction[row["tweet_id_index"]])


# Join the tweet index onto each row, restore the original order, then emit
# (engaging_user_id, tweet_id, predicted label) tuples.
joined = order_df.join(sorting_tweets, "tweet_id")
result = joined.sort("original_order").rdd.map(_attach_prediction)

In [9]:
def toCSVLine(data):
    """Return the items of ``data`` as one comma-separated line.

    Each item is converted with ``str``; no quoting or escaping is applied.
    """
    fields = map(str, data)
    return ','.join(fields)

# Serialize every result tuple to a CSV line and write the lines out through Spark.
# Spark treats the target as an output directory of part files rather than a single file.
output_path = 'prediction_test5k_bootstrap.csv'
lines = result.map(toCSVLine)
lines.saveAsTextFile(output_path)