# ML All-Stars: This script generates the scores to beat. We should try to improve on the results below.

In [2]:
import cPickle as pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

  from numpy.core.umath_tests import inner1d


In [7]:
# YOU MUST SET THIS TO THE ROOT DATA DIRECTORY FROM THE DOWNLOADED ZIP FILE
# Later cells build file paths by appending sub-paths such as
# "/detailed_data/..." and "/preprocessed_train_data.pkl" to this string.
DATA_DIR = "/home/adam/RelNet/data/prediction"

## Data loading and pre-processing....

In [20]:
# Loading handcrafted features and binary labels, keyed by example ID.
# Each row is whitespace-separated: the first field is the example ID, the
# second is the label string ("burst" marks a positive example) and the last
# field is a comma-separated list of floats.
meta_features = {}
meta_labels = {}
numBurst = 0
total = 0
with open(DATA_DIR+"/detailed_data/handcrafted_features.tsv") as fp:
    for line in fp:
        info = line.split()
        if not info:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        # Build a real list before handing it to numpy: on Python 3 `map`
        # returns an iterator and np.array(map(...)) yields a 0-d object array.
        meta_features[info[0]] = np.array([float(v) for v in info[-1].split(",")])
        # Evaluate the label once and reuse it for both the dict and the count.
        label = 1 if info[1] == "burst" else 0
        meta_labels[info[0]] = label
        numBurst += label
        total += 1

In [21]:
# Report the number of burst examples, the number of examples read, and the
# fraction of examples that are bursts.
for count in (numBurst, total):
    print(count)
print(float(numBurst) / total)

15078
256723
0.0587325638918


In [22]:
# loading the user, source, and target community embeddings for all examples
with open(DATA_DIR + "/detailed_data/full_ids.txt") as fp:
    # Map each stripped ID line to its 0-based line number; later cells use
    # that number to index into all_embeds.
    ids = {raw_id.strip(): row for row, raw_id in enumerate(fp)}
# Pass the path rather than an open text-mode handle: numpy then opens the
# file in binary mode and closes it once the array has been read.
all_embeds = np.load(DATA_DIR + "/detailed_data/full_embeds.npy")

In [23]:
# loading the post embeddings from the LSTM
# Pass the path so numpy opens the file in binary mode and closes it itself.
lstm_embeds = np.load(DATA_DIR + "/detailed_data/lstm_embeds.npy")
# NOTE(review): unpickling executes arbitrary code; only load this file from
# the trusted dataset download.
# Pickle data is binary, so open with "rb" and close the handle deterministically.
with open(DATA_DIR + "/detailed_data/lstm_embeds-ids.pkl", "rb") as fp:
    lstm_ids = pickle.load(fp)
# Invert the loaded ID sequence into an ID -> position lookup; later cells use
# that position to index into lstm_embeds.
lstm_ids = {example_id: row for row, example_id in enumerate(lstm_ids)}

In [24]:
# NOTE(review): torch is not referenced by name in this cell; presumably it is
# imported so the pickled batches (on which flatten() calls .numpy()) can be
# reconstructed -- confirm before removing.
import torch

# loading preprocessed lstm data to ensure identical train/val/test splits
# NOTE(review): unpickling executes arbitrary code; only load these files from
# the trusted dataset download.
# Pickle data is binary, so each file is opened with "rb" inside a `with`
# block to guarantee the handle is closed.
with open(DATA_DIR + "/preprocessed_train_data.pkl", "rb") as fp:
    train_data = pickle.load(fp)
with open(DATA_DIR + "/preprocessed_val_data.pkl", "rb") as fp:
    val_data = pickle.load(fp)
with open(DATA_DIR + "/preprocessed_test_data.pkl", "rb") as fp:
    test_data = pickle.load(fp)

In [25]:
# flattening the preprocessed LSTM data (no need for minibatching here....)
def flatten(data):
    """Concatenate an iterable of minibatches into flat per-example lists.

    Each batch must unpack into seven fields:
    ``(ids, text, users, subreddits, lengths, sfs, labels)``.
    ``text``, ``users`` and ``subreddits`` must expose ``.numpy()`` (e.g.
    torch tensors) and are converted to plain Python lists; ``ids``,
    ``lengths`` and ``labels`` are appended element by element as-is.

    Returns:
        A 6-tuple ``(ids, text, users, subreddits, lengths, labels)``. The
        ``sfs`` field of each batch is not part of the result, so it is no
        longer accumulated.
    """
    # `example_ids` rather than `ids` to avoid shadowing the module-level
    # ID -> row lookup of the same name.
    example_ids, text, users, subreddits, lengths, labels = [], [], [], [], [], []
    for bids, btext, busers, bsubreddits, blengths, _bsfs, blabels in data:
        example_ids.extend(bids)
        text.extend(btext.numpy().tolist())
        users.extend(busers.numpy().tolist())
        subreddits.extend(bsubreddits.numpy().tolist())
        lengths.extend(blengths)
        labels.extend(blabels)
    return (example_ids, text, users, subreddits, lengths, labels)
# Flatten the train, validation and test splits (in that order) with the same
# helper; each result is the 6-tuple returned by flatten().
flat_train_data, flat_val_data, flat_test_data = map(
    flatten, (train_data, val_data, test_data)
)

In [85]:
# Create all conflict examples
#
# Walk the training IDs in order, collecting examples labelled 1 ("conflict")
# and examples labelled otherwise ("nonflict") into separate lists. At most
# PER_CLASS_CAP nonflict examples are kept, and the walk stops as soon as
# PER_CLASS_CAP conflict examples have been collected.

PER_CLASS_CAP = 10000  # maximum number of examples kept per class

conflictEmbeds = []
conflictIDS = []
nonflictEmbeds = []
nonflictIDS = []
conflict_Y = []
nonflict_Y = []

for ID in flat_train_data[0]:
    label = meta_labels[ID]
    # Pick the destination lists first so the feature vector is built in
    # exactly one place.
    if label == 1:
        embeds, targets, kept_ids = conflictEmbeds, conflict_Y, conflictIDS
    elif len(nonflictIDS) < PER_CLASS_CAP:
        embeds, targets, kept_ids = nonflictEmbeds, nonflict_Y, nonflictIDS
    else:
        # nonflict quota already filled; nothing is appended for this ID
        continue
    handcrafted = meta_features[ID]
    # Feature vector = handcrafted features, then the row of all_embeds for
    # this ID, then the row of lstm_embeds for this ID.
    embeds.append(np.concatenate([handcrafted.reshape(len(handcrafted),), all_embeds[ids[ID]], lstm_embeds[lstm_ids[ID]]]))
    # Labels are stored as 1-element lists so np.stack later yields a column.
    targets.append([label])
    kept_ids.append(ID)
    if len(conflictIDS) >= PER_CLASS_CAP:
        break

In [86]:
# Trim each nonflict list so it is no longer than its conflict counterpart.
# (len(a) - (len(a) - len(b)) is simply len(b).)
nonflictIDS = nonflictIDS[:len(conflictIDS)]
nonflictEmbeds = nonflictEmbeds[:len(conflictEmbeds)]
nonflict_Y = nonflict_Y[:len(conflict_Y)]

# Should all be the same
for collected in (nonflictIDS, conflictIDS, nonflictEmbeds,
                  conflictEmbeds, conflict_Y, nonflict_Y):
    print(len(collected))

8461
8461
8461
8461
8461
8461


In [87]:
# Persist the conflict examples: stacked feature matrix, stacked label matrix,
# and the example IDs written one per line.
conflictMat = np.stack(conflictEmbeds)
np.save('conflict_dev.npy', conflictMat, allow_pickle=False)
conflict_YMat = np.stack(conflict_Y)
np.save('conflict_Y_dev.npy', conflict_YMat, allow_pickle=False)

with open('conflictIDs_dev.txt', 'wt') as myfile:
    myfile.write('\n'.join(map(str, conflictIDS)))

# Same three artifacts for the nonflict examples.
nonflictMat = np.stack(nonflictEmbeds)
np.save('nonconflict_dev.npy', nonflictMat, allow_pickle=False)
nonflict_YMat = np.stack(nonflict_Y)
np.save('nonconflict_Y_dev.npy', nonflict_YMat, allow_pickle=False)

with open('nonconflictIDs_dev.txt', 'wt') as myfile:
    myfile.write('\n'.join(map(str, nonflictIDS)))

In [8]:
# Build the feature matrices (X) and label vectors (Y) for each split.
#
# Each split's IDs are filtered once to those that have handcrafted features,
# and both X and Y are derived from that same filtered list. Previously
# train_X skipped the `in meta_features` filter that train_Y (and every
# val/test expression) applied, so the two were built from different rules.
def _stack_features(example_ids):
    """Stack [handcrafted, all_embeds row, lstm_embeds row] for each ID."""
    return np.stack([
        np.concatenate([meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]])
        for i in example_ids
    ])


def _stack_labels(example_ids):
    """Stack the 0/1 labels for each ID into a 1-D array."""
    return np.stack([meta_labels[i] for i in example_ids])


train_ids = [i for i in flat_train_data[0] if i in meta_features]
val_ids = [i for i in flat_val_data[0] if i in meta_features]
test_ids = [i for i in flat_test_data[0] if i in meta_features]

train_X = _stack_features(train_ids)
val_X = _stack_features(val_ids)
test_X = _stack_features(test_ids)

train_Y = _stack_labels(train_ids)
val_Y = _stack_labels(val_ids)
test_Y = _stack_labels(test_ids)

In [16]:
# Hey cmpt419 all-stars: this cell writes a small version of the data to
# develop with. For every array it prints a shape, then writes the leading
# rows both as human-readable text (.txt) and as a binary .npy file.
dev_exports = [
    # (array whose shape is printed, rows written to disk, text file, npy file)
    (train_X[0:5000], train_X[0:5000], 'train_X_dev.txt', 'train_X_dev.npy'),
    (val_X, val_X[0:600], 'val_X_dev.txt', 'val_X_dev.npy'),
    (test_X, test_X[0:600], 'test_X_dev.txt', 'test_X_dev.npy'),
    (train_Y, train_Y[0:5000], 'train_Y_dev.txt', 'train_Y_dev.npy'),
    (val_Y, val_Y[0:600], 'val_Y_dev.txt', 'val_Y_dev.npy'),
    (test_Y, test_Y[0:600], 'test_Y_dev.txt', 'test_Y_dev.npy'),
]

for reported, subset, txt_path, npy_path in dev_exports:
    # Note: only the first entry reports the shape of the subset itself; the
    # others report the shape of the full array.
    print(reported.shape)
    # Human readable data
    np.savetxt(txt_path, subset)
    np.save(npy_path, subset, allow_pickle=False)

(5000, 1227)
(11264, 1227)
(11264, 1227)
(93696,)
(11264,)
(11264,)


## Running the models

#### Baseline model 

In [9]:
# Baseline: a Random Forest trained on the metadata/handcrafted features only.
baseline_mod = RandomForestClassifier(
    n_estimators=500,
    n_jobs=100,
    random_state=0,
)
# The handcrafted features are the first 263 columns of train_X, so only
# those columns are passed to fit().
baseline_mod.fit(train_X[:, :263], train_Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=100, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [10]:
# Validation ROC AUC of the baseline, scored on column 1 of predict_proba
# using the same first-263-column slice the model was fitted on.
# For reference, on the authors' server we get 0.682
# print() call form matches the rest of the file and runs on Python 2 and 3.
print(roc_auc_score(val_Y, baseline_mod.predict_proba(val_X[:, :263])[:,1]))

0.6823965206409768


In [11]:
# Test ROC AUC of the baseline, scored on column 1 of predict_proba
# using the same first-263-column slice the model was fitted on.
# For reference, on the authors' server we get 0.667
# print() call form matches the rest of the file and runs on Python 2 and 3.
print(roc_auc_score(test_Y, baseline_mod.predict_proba(test_X[:, :263])[:,1]))

0.6656650084718871


In [12]:
# Next we run the Random Forest on the full feature vector: the handcrafted
# features followed by the all_embeds and lstm_embeds columns.
ensemble_mod = RandomForestClassifier(n_estimators=500, n_jobs=100, random_state=0)
# Unlike the baseline, every column of train_X is used here (the redundant
# [:, :] slice has been dropped).
ensemble_mod.fit(train_X, train_Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=100, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [13]:
# Validation ROC AUC of the ensemble model, scored on column 1 of
# predict_proba over all feature columns.
# For reference, on the authors' server we get 0.765
# print() call form matches the rest of the file and runs on Python 2 and 3.
print(roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X)[:,1]))

0.7648275951540929


In [14]:
# Test ROC AUC of the ensemble model, scored on column 1 of
# predict_proba over all feature columns.
# For reference, on the authors' server we get 0.756
# print() call form matches the rest of the file and runs on Python 2 and 3.
print(roc_auc_score(test_Y, ensemble_mod.predict_proba(test_X)[:,1]))

0.7564232433560953
