In [1]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
print("Num GPUs Available: ", len(physical_devices))
if physical_devices:
    try:
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

Num GPUs Available:  1


In [78]:
import torch
from transformers import TFLongformerModel, LongformerTokenizerFast, LongformerConfig
import tensorflow as tf

config = LongformerConfig.from_pretrained('allenai/longformer-base-4096')
# choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
# 'n2': for regular n2 attantion
# 'tvm': a custom CUDA kernel implementation of our sliding window attention
# 'sliding_chunks': a PyTorch implementation of our sliding window attention
config.attention_mode = 'sliding_chunks'

model = TFLongformerModel.from_pretrained('allenai/longformer-base-4096', config = config)
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
tokenizer.model_max_length = model.config.max_position_embeddings

def LFencode(row):
    SAMPLE_TEXT = row['selftext']
    input_ids = tf.expand_dims(tf.convert_to_tensor(tokenizer.encode(SAMPLE_TEXT)), 0) # batch of size 1

    # model = model.cuda(); input_ids = input_ids.cuda()

    # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
    attention_mask = tf.ones(input_ids.shape, dtype=tf.int32) # initialize to local attention

    outputs = model(input_ids, attention_mask=attention_mask)
    pooled_output = outputs.pooler_output
    return pooled_output

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerModel: ['lm_head']
- This IS expected if you are initializing TFLongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFLongformerModel were initialized from the model checkpoint at allenai/longformer-base-4096.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerModel for predictions without further training.


In [4]:
import pandas as pd
import numpy as np
nosleepDf = pd.read_csv('Download/Cleaned Data/NoSleep.csv')
selfDf = pd.read_csv('Download/Cleaned Data/Self.csv')

In [5]:
nosleepDfShort = nosleepDf[:2000].copy()
selfDfShort = selfDf[:2000].copy()

In [40]:
from tqdm.notebook import tqdm
tqdm.pandas()

selfDfShort['LF pooler output']= selfDfShort.progress_apply(LFencode, axis=1)
nosleepDfShort['LF pooler output']= nosleepDfShort.progress_apply(LFencode, axis=1)

selfDfShort.to_pickle('Download/Cleaned Data with Longformer/selfDfShort.pkl')
nosleepDfShort.to_pickle('Download/Cleaned Data with Longformer/nosleepDfShort.pkl')

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

In [6]:
import pandas as pd


nosleepDfShort = pd.read_pickle('Download/Cleaned Data with Longformer/nosleepDfShort.pkl')
selfDfShort = pd.read_pickle('Download/Cleaned Data with Longformer/selfDfShort.pkl')



In [7]:
nosleepLF = tf.stack(nosleepDfShort.loc[:,'LF pooler output'].to_list())
selfLF = tf.stack(selfDfShort.loc[:,'LF pooler output'].to_list())

nosleepLF = tf.reshape(nosleepLF, nosleepLF.shape[::2])
selfLF = tf.reshape(selfLF, selfLF.shape[::2])

In [8]:
import numpy as np
from sklearn.manifold import TSNE
import seaborn as sns

%matplotlib widget
import matplotlib.pyplot as plt

X_embedded_nosleep = TSNE(perplexity=30, learning_rate=50, n_components=3, n_iter=5000).fit_transform(nosleepLF)
X_embedded_self = TSNE(perplexity=30, learning_rate=50, n_components=3, n_iter=2000).fit_transform(selfLF)

df1 = pd.DataFrame(X_embedded_nosleep, columns=['x','y', 'z'])
df1['subreddit'] = 'nosleep'
df2 = pd.DataFrame(X_embedded_self, columns=['x','y', 'z'])
df2['subreddit'] = 'self'

df = pd.concat([df1,df2])

In [9]:
df

Unnamed: 0,x,y,z,subreddit
0,3.435028,-23.430191,27.846485,nosleep
1,-0.453467,13.920367,-1.642473,nosleep
2,9.754526,26.287334,-27.450058,nosleep
3,-3.615822,-24.700964,24.934481,nosleep
4,-1.820519,22.547499,3.794347,nosleep
...,...,...,...,...
1995,6.502117,-14.733069,13.893977,self
1996,3.216796,-5.083644,-8.147502,self
1997,-24.140860,2.374579,15.830630,self
1998,-5.080696,-0.582466,-19.006369,self


In [10]:
from mpl_toolkits.mplot3d import Axes3D,axes3d
sns.set(style = "darkgrid")



fig = plt.figure()
fig.set_size_inches(18.5, 10.5)
ax = fig.add_subplot(111, projection = '3d')

ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_zlabel("z")

ax.scatter(df[df['subreddit']=='nosleep']['x'], df[df['subreddit']=='nosleep']['y'], df[df['subreddit']=='nosleep']['z'], c='red', label='nosleep')
ax.scatter(df[df['subreddit']=='self']['x'], df[df['subreddit']=='self']['y'], df[df['subreddit']=='self']['z'], c='blue', label='self')
plt.legend()

plt.show()


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Tensorflow

In [11]:
len(nosleepLF)

2000

In [12]:
creepy_features = nosleepLF
creepy_labels = np.ones(len(creepy_features))
creepy_features

<tf.Tensor: shape=(2000, 768), dtype=float32, numpy=
array([[ 0.17356455, -0.28822845,  0.08040078, ..., -0.04949597,
         0.17774035,  0.03145612],
       [ 0.152154  , -0.3131358 ,  0.09661903, ..., -0.0544746 ,
         0.20656316, -0.00459675],
       [ 0.193991  , -0.3316983 ,  0.09307294, ..., -0.06202645,
         0.20297652,  0.02324418],
       ...,
       [ 0.17017554, -0.34362826,  0.10915487, ..., -0.06822307,
         0.20495556,  0.02520667],
       [ 0.19672579, -0.34024084,  0.09063036, ..., -0.08465302,
         0.18416663, -0.0188603 ],
       [ 0.18555108, -0.3117493 ,  0.12184663, ..., -0.06562941,
         0.18309756, -0.02888302]], dtype=float32)>

In [13]:
noncreepy_features = selfLF
noncreepy_labels = np.zeros(len(noncreepy_features))
noncreepy_features

<tf.Tensor: shape=(2000, 768), dtype=float32, numpy=
array([[ 1.61632180e-01, -2.65369445e-01,  9.51929018e-02, ...,
        -7.22304583e-02,  1.92816019e-01,  1.56589076e-02],
       [ 1.28829047e-01, -2.89937794e-01,  7.89543763e-02, ...,
        -4.83777523e-02,  1.91524208e-01,  6.91677677e-03],
       [ 1.70354187e-01, -3.17211837e-01,  1.21957265e-01, ...,
        -6.16075248e-02,  1.82307720e-01,  8.63182265e-03],
       ...,
       [ 1.67272553e-01, -2.53856778e-01,  9.05781314e-02, ...,
        -7.19489008e-02,  1.67981252e-01,  2.79515982e-04],
       [ 1.72974482e-01, -3.11384588e-01,  1.12448640e-01, ...,
        -6.59727380e-02,  2.19963238e-01, -2.19200202e-03],
       [ 1.66960493e-01, -2.52201796e-01,  1.04491428e-01, ...,
        -8.50258991e-02,  1.67221159e-01,  2.78245900e-02]], dtype=float32)>

In [14]:
creepy_features.shape[0]

2000

In [15]:
features = np.concatenate((creepy_features, noncreepy_features))
labels = np.concatenate((creepy_labels, noncreepy_labels))
print(features, labels)

[[ 1.7356455e-01 -2.8822845e-01  8.0400780e-02 ... -4.9495969e-02
   1.7774035e-01  3.1456120e-02]
 [ 1.5215400e-01 -3.1313580e-01  9.6619032e-02 ... -5.4474600e-02
   2.0656316e-01 -4.5967521e-03]
 [ 1.9399101e-01 -3.3169830e-01  9.3072943e-02 ... -6.2026449e-02
   2.0297652e-01  2.3244182e-02]
 ...
 [ 1.6727255e-01 -2.5385678e-01  9.0578131e-02 ... -7.1948901e-02
   1.6798125e-01  2.7951598e-04]
 [ 1.7297448e-01 -3.1138459e-01  1.1244864e-01 ... -6.5972738e-02
   2.1996324e-01 -2.1920020e-03]
 [ 1.6696049e-01 -2.5220180e-01  1.0449143e-01 ... -8.5025899e-02
   1.6722116e-01  2.7824590e-02]] [1. 1. 1. ... 0. 0. 0.]


In [16]:
print(features.shape, labels.shape)

(4000, 768) (4000,)


In [17]:
from sklearn.utils import shuffle

features, labels = shuffle(features, labels)
print(features, labels)

[[ 0.17372166 -0.31675315  0.09627467 ... -0.06102977  0.19031945
  -0.01382293]
 [ 0.18293348 -0.28995857  0.08175075 ... -0.0694469   0.18828677
   0.00495734]
 [ 0.19889185 -0.34668326  0.08101564 ... -0.06365429  0.18032654
   0.0013986 ]
 ...
 [ 0.19230823 -0.29125318  0.06716394 ... -0.06317285  0.17826806
   0.03855895]
 [ 0.16728517 -0.2878742   0.09056299 ... -0.07442877  0.17100859
  -0.02031929]
 [ 0.18101485 -0.3242344   0.09589295 ... -0.06436511  0.16940033
  -0.00677954]] [0. 0. 1. ... 0. 0. 1.]


In [18]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_features = scaler.fit_transform(features)
print(scaled_features, labels)

[[0.50433403 0.3520987  0.44063362 ... 0.5626604  0.4912008  0.35618275]
 [0.58400923 0.498618   0.31242284 ... 0.51314497 0.4746709  0.47976592]
 [0.7220369  0.18843353 0.30593362 ... 0.5472211  0.4099375  0.45634773]
 ...
 [0.6650936  0.49153876 0.18365696 ... 0.55005324 0.39319777 0.7008805 ]
 [0.4486634  0.51001585 0.39021346 ... 0.48383814 0.33416295 0.31343365]
 [0.5674146  0.3111894  0.437264   ... 0.54303956 0.3210845  0.40253165]] [0. 0. 1. ... 0. 0. 1.]


- [ ] 70% train, 15% val, 15% test
 - Train: 26500
 - Valid: 5677
 - Test: 5669
- [x] 80% train, 10% val, 10% test
- [ ] 60% train, 20% val, 20% test

In [19]:
scaled_features[:,0]

array([0.50433403, 0.58400923, 0.7220369 , ..., 0.6650936 , 0.4486634 ,
       0.5674146 ], dtype=float32)

In [20]:
scaled_features.shape

(4000, 768)

In [21]:
from sklearn.model_selection import train_test_split

X = scaled_features
y = labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=1)

In [22]:
import tensorflow as tf
from tensorflow import keras
from tensorboard.plugins import projector
from keras.layers import Embedding

In [23]:
embedding_layer = Embedding(vocab_size,
                            768,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

NameError: name 'vocab_size' is not defined

# Model here

In [24]:
modelTF = keras.Sequential([
#     keras.layers.Dense(units = 303, input_shape = (769,), activation = 'relu'),
#     keras.layers.Dropout(0.5),
    keras.layers.Dense(units = 303, input_shape = (768,), activation = 'relu'),
    keras.layers.Dropout(0.1),
#     keras.layers.Dense(units = 256, activation = 'relu'),
    keras.layers.Dense(units = 128, activation = 'relu'),
    keras.layers.Dense(units = 64, activation = 'relu'),
    keras.layers.Dense(units = 32, activation = 'relu'),
    keras.layers.Dense(units = 16, activation = 'relu'),
    keras.layers.Dense(units = 1, activation = 'sigmoid') # here the units must be 1 in order for binary classifications to work
])

In [25]:
modelTF.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 303)               233007    
_________________________________________________________________
dropout_49 (Dropout)         (None, 303)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               38912     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 1

In [26]:
modelTF.compile(loss = 'binary_crossentropy',
             optimizer = keras.optimizers.Adam(lr=0.000959, beta_1 = 0.9, beta_2=0.999), # you can tune the learning rate here. Default lr = 0.01
             metrics=['accuracy'])

In [27]:
import os
root_logdir = os.path.join(os.curdir,"tensorboard_logs", "longformer")

def get_run_log_dir():
    import time
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, run_id)

run_logdir = get_run_log_dir()

tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)

history = modelTF.fit(X_train, y_train, epochs = 30, 
                   validation_data=(X_val, y_val),
                   callbacks=[tensorboard_cb])

%load_ext tensorboard
%tensorboard --logdir tensorboard_logs

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Reusing TensorBoard on port 6006 (pid 12362), started 0:05:11 ago. (Use '!kill 12362' to kill it.)

In [28]:
import matplotlib.pyplot as plt
pd.DataFrame(history.history).plot(figsize = (8,5))
plt.grid = True
plt.gca().set_ylim(0,1)
plt.yticks(np.arange(0, 1.1, 0.1))
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [79]:
modelTF.evaluate(X_test, y_test)



[0.24772435426712036, 0.9024999737739563]

In [103]:
def rw_test(SAMPLE_TEXT):
    input_ids = tf.expand_dims(tf.convert_to_tensor(tokenizer.encode(SAMPLE_TEXT)), 0) # batch of size 1

    # model = model.cuda(); input_ids = input_ids.cuda()

    # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
    attention_mask = tf.ones(input_ids.shape, dtype=tf.int32) # initialize to local attention

    outputs = model(input_ids, attention_mask=attention_mask)
    pooled_output = outputs.pooler_output
    prediction = modelTF.predict(pooled_output)
    return prediction
#     print(np.argmax(prediction, axis = 1))

In [118]:
SAMPLE_TEXT1 = '''
He's holding a knife behind me. He feels creepy to me.
'''
SAMPLE_TEXT2 = '''
The leaves smell so good in the spring. That's why I like spring so much.
'''
SAMPLE_TEXT3 = '''
He's holding a knife behind me. He feels creepy to me.
The leaves smell so good in the spring. That's why I like spring so much.
'''
SAMPLE_TEXT4 = '''
He's holding a knife behind me. He feels creepy to me.
It's a little creepy to have someone like that around.
Everything in this place screamed creepy order, which made her wonder what was wrong with the owner.
'''
SAMPLE_TEXT5 = '''
He ran out of money, so he had to stop playing poker.
If I don’t like something, I’ll stay away from it.
I often see the time 11:11 or 12:34 on clocks.
'''

In [119]:
print('SAMPLE_TEXT1', rw_test(SAMPLE_TEXT1))
print('SAMPLE_TEXT2', rw_test(SAMPLE_TEXT2))
print('SAMPLE_TEXT3', rw_test(SAMPLE_TEXT3))
print('SAMPLE_TEXT4', rw_test(SAMPLE_TEXT4))
print('SAMPLE_TEXT5', rw_test(SAMPLE_TEXT5))

SAMPLE_TEXT1 [[0.62332803]]
SAMPLE_TEXT2 [[0.5364992]]
SAMPLE_TEXT3 [[0.6369032]]
SAMPLE_TEXT4 [[0.7230916]]
SAMPLE_TEXT5 [[0.5673562]]
