In [1]:
import tensorflow as tf
# Discover the GPUs TensorFlow can see; the list is empty on a CPU-only host.
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))

# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
# Iterating over the list (instead of indexing element 0) keeps this cell
# working when no GPU is present, and every call is covered by the handler.
try:
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Report the failure instead of aborting the notebook
    # (e.g. when the cell is re-run after the GPUs were already initialised).
    print(e)

Num GPUs Available:  1


In [2]:
import torch
from transformers import TFLongformerModel, LongformerTokenizerFast, LongformerConfig
import tensorflow as tf

# Single checkpoint identifier shared by the config, the weights and the tokenizer.
CHECKPOINT = 'allenai/longformer-base-4096'

config = LongformerConfig.from_pretrained(CHECKPOINT)

# Attention mode to request. Documented options:
#   'n2'             - regular n^2 attention
#   'tvm'            - custom CUDA kernel implementation of sliding window attention
#   'sliding_chunks' - PyTorch implementation of sliding window attention
# NOTE(review): these option names describe a PyTorch implementation while the
# model loaded below is the TensorFlow class - confirm this attribute is honoured.
config.attention_mode = 'sliding_chunks'

model = TFLongformerModel.from_pretrained(CHECKPOINT, config=config)
tokenizer = LongformerTokenizerFast.from_pretrained(CHECKPOINT)

# Align the tokenizer's length limit with the model's position-embedding count.
tokenizer.model_max_length = model.config.max_position_embeddings

def LFencode(row, text_column='selftext', max_length=4096):
    """Encode the text of one record with Longformer and return its pooled output.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        row: Mapping-like record holding the text to encode, e.g. a DataFrame
            row handed over by ``DataFrame.apply(..., axis=1)``.
        text_column: Key of the text field inside ``row``.
        max_length: Maximum number of token ids kept; longer texts are truncated.

    Returns:
        ``pooler_output`` of the model for a batch containing this single text.
    """
    text = row[text_column]
    token_ids = tokenizer.encode(text, max_length=max_length, truncation=True)
    input_ids = tf.expand_dims(tf.convert_to_tensor(token_ids), 0)  # batch of size 1

    # One unpadded sequence, so every position is marked with 1.
    # NOTE(review): in Hugging Face Transformers, global attention appears to be
    # requested through a separate `global_attention_mask` argument rather than
    # a value of 2 in this mask - confirm before relying on global attention.
    attention_mask = tf.ones(input_ids.shape, dtype=tf.int32)

    outputs = model(input_ids, attention_mask=attention_mask)
    return outputs.pooler_output

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerModel: ['lm_head']
- This IS expected if you are initializing TFLongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFLongformerModel were initialized from the model checkpoint at allenai/longformer-base-4096.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerModel for predictions without further training.


In [3]:
import pandas as pd
import numpy as np
import os

# Directory holding the cleaned CSV files.
PATH = 'Download/Cleaned Data'

# Map each file name in PATH to the DataFrame read from it.
df_dict = {}
for file_name in os.listdir(PATH):
    # Skip hidden entries such as '.DS_Store' or '.ipynb_checkpoints'.
    if file_name.startswith('.'):
        continue
    file_path = os.path.join(PATH, file_name)
    df_dict[file_name] = pd.read_csv(file_path)

In [4]:
from tqdm.notebook import tqdm
tqdm.pandas()  # registers DataFrame.progress_apply
import numpy as np

OUTPUT_PATH = 'Download/Cleaned Data with Longformer'
# Number of checkpointed chunks each DataFrame is split into.
N_CHUNKS = 50

# Create the output directory up front so a finished chunk is never lost to a
# missing-directory error at save time.
os.makedirs(OUTPUT_PATH, exist_ok=True)

for df_name, df in tqdm(df_dict.items()):
    final_path = os.path.join(OUTPUT_PATH, df_name + '.pkl')
    # A final pickle means this DataFrame was fully processed in an earlier run.
    if os.path.isfile(final_path):
        continue

    # Split row positions rather than the DataFrame itself: the partition is the
    # same as np.array_split(df, N_CHUNKS), but it avoids NumPy's split path on a
    # DataFrame, and .copy() makes each chunk independent so adding a column
    # below does not write into a slice of `df`.
    row_positions = np.array_split(np.arange(len(df)), N_CHUNKS)
    df_split = [df.iloc[positions].copy() for positions in row_positions]

    for i in range(len(df_split)):
        chunk_path = os.path.join(OUTPUT_PATH, df_name + str(i) + '.pkl')
        # Resume support: reuse a chunk that was already encoded and saved.
        if os.path.isfile(chunk_path):
            df_split[i] = pd.read_pickle(chunk_path)
            continue
        print(i)
        df_split[i]['LF pooler output'] = df_split[i].progress_apply(LFencode, axis=1)
        # Checkpoint the chunk immediately so an interruption loses at most one chunk.
        df_split[i].to_pickle(chunk_path)
    pd.concat(df_split).to_pickle(final_path)

  0%|          | 0/9 [00:00<?, ?it/s]

0


  0%|          | 0/1100 [00:00<?, ?it/s]

1


  0%|          | 0/1100 [00:00<?, ?it/s]

2


  0%|          | 0/1100 [00:00<?, ?it/s]

3


  0%|          | 0/1100 [00:00<?, ?it/s]

4


  0%|          | 0/1100 [00:00<?, ?it/s]

5


  0%|          | 0/1100 [00:00<?, ?it/s]

6


  0%|          | 0/1100 [00:00<?, ?it/s]

7


  0%|          | 0/1100 [00:00<?, ?it/s]

8


  0%|          | 0/1100 [00:00<?, ?it/s]

9


  0%|          | 0/1100 [00:00<?, ?it/s]

10


  0%|          | 0/1100 [00:00<?, ?it/s]

11


  0%|          | 0/1100 [00:00<?, ?it/s]

12


  0%|          | 0/1100 [00:00<?, ?it/s]

13


  0%|          | 0/1100 [00:00<?, ?it/s]

14


  0%|          | 0/1100 [00:00<?, ?it/s]

15


  0%|          | 0/1100 [00:00<?, ?it/s]

16


  0%|          | 0/1100 [00:00<?, ?it/s]

17


  0%|          | 0/1100 [00:00<?, ?it/s]

18


  0%|          | 0/1100 [00:00<?, ?it/s]

19


  0%|          | 0/1100 [00:00<?, ?it/s]

20


  0%|          | 0/1100 [00:00<?, ?it/s]

21


  0%|          | 0/1100 [00:00<?, ?it/s]

22


  0%|          | 0/1100 [00:00<?, ?it/s]

23


  0%|          | 0/1100 [00:00<?, ?it/s]

24


  0%|          | 0/1100 [00:00<?, ?it/s]

25


  0%|          | 0/1100 [00:00<?, ?it/s]

26


  0%|          | 0/1100 [00:00<?, ?it/s]

27


  0%|          | 0/1099 [00:00<?, ?it/s]

28


  0%|          | 0/1099 [00:00<?, ?it/s]

29


  0%|          | 0/1099 [00:00<?, ?it/s]

30


  0%|          | 0/1099 [00:00<?, ?it/s]

31


  0%|          | 0/1099 [00:00<?, ?it/s]

32


  0%|          | 0/1099 [00:00<?, ?it/s]

33


  0%|          | 0/1099 [00:00<?, ?it/s]

34


  0%|          | 0/1099 [00:00<?, ?it/s]

35


  0%|          | 0/1099 [00:00<?, ?it/s]

36


  0%|          | 0/1099 [00:00<?, ?it/s]

37


  0%|          | 0/1099 [00:00<?, ?it/s]

38


  0%|          | 0/1099 [00:00<?, ?it/s]

39


  0%|          | 0/1099 [00:00<?, ?it/s]

40


  0%|          | 0/1099 [00:00<?, ?it/s]

41


  0%|          | 0/1099 [00:00<?, ?it/s]

42


  0%|          | 0/1099 [00:00<?, ?it/s]

43


  0%|          | 0/1099 [00:00<?, ?it/s]

44


  0%|          | 0/1099 [00:00<?, ?it/s]

45


  0%|          | 0/1099 [00:00<?, ?it/s]

46


  0%|          | 0/1099 [00:00<?, ?it/s]

47


  0%|          | 0/1099 [00:00<?, ?it/s]

48


  0%|          | 0/1099 [00:00<?, ?it/s]

49


  0%|          | 0/1099 [00:00<?, ?it/s]

0


  0%|          | 0/1856 [00:00<?, ?it/s]

1


  0%|          | 0/1856 [00:00<?, ?it/s]

2


  0%|          | 0/1856 [00:00<?, ?it/s]

3


  0%|          | 0/1856 [00:00<?, ?it/s]

4


  0%|          | 0/1856 [00:00<?, ?it/s]

5


  0%|          | 0/1856 [00:00<?, ?it/s]

6


  0%|          | 0/1856 [00:00<?, ?it/s]

7


  0%|          | 0/1856 [00:00<?, ?it/s]

8


  0%|          | 0/1856 [00:00<?, ?it/s]

9


  0%|          | 0/1856 [00:00<?, ?it/s]

10


  0%|          | 0/1856 [00:00<?, ?it/s]

11


  0%|          | 0/1856 [00:00<?, ?it/s]

12


  0%|          | 0/1856 [00:00<?, ?it/s]

13


  0%|          | 0/1856 [00:00<?, ?it/s]

14


  0%|          | 0/1856 [00:00<?, ?it/s]

15


  0%|          | 0/1856 [00:00<?, ?it/s]

16


  0%|          | 0/1856 [00:00<?, ?it/s]

17


  0%|          | 0/1855 [00:00<?, ?it/s]

18


  0%|          | 0/1855 [00:00<?, ?it/s]

19


  0%|          | 0/1855 [00:00<?, ?it/s]

20


  0%|          | 0/1855 [00:00<?, ?it/s]

21


  0%|          | 0/1855 [00:00<?, ?it/s]

22


  0%|          | 0/1855 [00:00<?, ?it/s]

23


  0%|          | 0/1855 [00:00<?, ?it/s]

24


  0%|          | 0/1855 [00:00<?, ?it/s]

25


  0%|          | 0/1855 [00:00<?, ?it/s]

26


  0%|          | 0/1855 [00:00<?, ?it/s]

27


  0%|          | 0/1855 [00:00<?, ?it/s]

28


  0%|          | 0/1855 [00:00<?, ?it/s]

29


  0%|          | 0/1855 [00:00<?, ?it/s]

30


  0%|          | 0/1855 [00:00<?, ?it/s]

31


  0%|          | 0/1855 [00:00<?, ?it/s]

32


  0%|          | 0/1855 [00:00<?, ?it/s]

33


  0%|          | 0/1855 [00:00<?, ?it/s]

34


  0%|          | 0/1855 [00:00<?, ?it/s]

35


  0%|          | 0/1855 [00:00<?, ?it/s]

36


  0%|          | 0/1855 [00:00<?, ?it/s]

37


  0%|          | 0/1855 [00:00<?, ?it/s]

38


  0%|          | 0/1855 [00:00<?, ?it/s]

39


  0%|          | 0/1855 [00:00<?, ?it/s]

40


  0%|          | 0/1855 [00:00<?, ?it/s]

41


  0%|          | 0/1855 [00:00<?, ?it/s]

42


  0%|          | 0/1855 [00:00<?, ?it/s]

43


  0%|          | 0/1855 [00:00<?, ?it/s]

44


  0%|          | 0/1855 [00:00<?, ?it/s]

45


  0%|          | 0/1855 [00:00<?, ?it/s]

46


  0%|          | 0/1855 [00:00<?, ?it/s]

47


  0%|          | 0/1855 [00:00<?, ?it/s]

48


  0%|          | 0/1855 [00:00<?, ?it/s]

49


  0%|          | 0/1855 [00:00<?, ?it/s]

0


  0%|          | 0/3310 [00:00<?, ?it/s]

1


  0%|          | 0/3310 [00:00<?, ?it/s]

2


  0%|          | 0/3310 [00:00<?, ?it/s]

3


  0%|          | 0/3310 [00:00<?, ?it/s]

4


  0%|          | 0/3310 [00:00<?, ?it/s]

5


  0%|          | 0/3310 [00:00<?, ?it/s]

6


  0%|          | 0/3310 [00:00<?, ?it/s]

7


  0%|          | 0/3310 [00:00<?, ?it/s]

8


  0%|          | 0/3310 [00:00<?, ?it/s]

9


  0%|          | 0/3310 [00:00<?, ?it/s]

10


  0%|          | 0/3310 [00:00<?, ?it/s]

11


  0%|          | 0/3310 [00:00<?, ?it/s]

12


  0%|          | 0/3310 [00:00<?, ?it/s]

13


  0%|          | 0/3310 [00:00<?, ?it/s]

14


  0%|          | 0/3310 [00:00<?, ?it/s]

15


  0%|          | 0/3310 [00:00<?, ?it/s]

16


  0%|          | 0/3310 [00:00<?, ?it/s]

17


  0%|          | 0/3310 [00:00<?, ?it/s]

18


  0%|          | 0/3310 [00:00<?, ?it/s]

19


  0%|          | 0/3310 [00:00<?, ?it/s]

20


  0%|          | 0/3309 [00:00<?, ?it/s]

21


  0%|          | 0/3309 [00:00<?, ?it/s]

22


  0%|          | 0/3309 [00:00<?, ?it/s]

23


  0%|          | 0/3309 [00:00<?, ?it/s]

24


  0%|          | 0/3309 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [6]:
import pandas as pd

# Small toy frame used to try out chunk splitting; one (A, B) pair per row.
toy_rows = [
    ('foo', 'one'),
    ('bar', 'one'),
    ('foo', 'two'),
    ('bar', 'three'),
    ('foo', 'two'),
    ('bar', 'two'),
    ('foo', 'one'),
    ('foo', 'three'),
]
df = pd.DataFrame(toy_rows, columns=['A', 'B'])

In [7]:
import numpy as np
# Split the frame into 3 nearly equal row chunks. Row positions are split and
# then selected with .iloc, which gives the same partition as passing the
# DataFrame to np.array_split without relying on DataFrame.swapaxes.
[df.iloc[positions] for positions in np.array_split(np.arange(len(df)), 3)]

[     A    B
 0  foo  one
 1  bar  one
 2  foo  two,
      A      B
 3  bar  three
 4  foo    two
 5  bar    two,
      A      B
 6  foo    one
 7  foo  three]

In [9]:
# Scratch arithmetic: 170,000 divided by 100 (true division, so the result is a float).
170_000 / 100

1700.0