In [None]:
# This notebook needs TensorFlow 2.3 or newer; tf-nightly is installed here to get it.
# pip's console output is redirected to install.log.
!pip install tf-nightly > install.log
# Restart the runtime after this cell so the freshly installed build is the one that gets imported.

In [None]:
# Download the dataset archive (data.tgz) into the current working directory.
! wget https://storage.googleapis.com/akhilez/datasets/singularity_systems/data.tgz

In [None]:
# Start from a clean copy: delete any previously extracted data/ directory,
# then unpack the downloaded archive again.
! rm -rf data
! tar -xzf data.tgz

In [None]:
# Append ".txt" to the name of every regular file under data/.
# NOTE(review): presumably needed because text_dataset_from_directory only picks up
# .txt files — confirm against the Keras documentation.
# Not idempotent: a second run produces ".txt.txt", so re-run the extract cell first.
!find data -type f -exec mv '{}' '{}'.txt \;

In [1]:
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv

In [93]:
# --- Dataset loading ---
batch_size = 10        # texts per batch produced by text_dataset_from_directory
max_char_length = 500  # passed as max_length when the dataset is loaded

# --- Tokenisation and padding ---
vocab_size = 10000     # num_words handed to the Tokenizer
oov_tok = "<OOV>"      # token used for out-of-vocabulary words
sequence_length = 100  # length every sequence is padded / truncated to
padding_type = 'post'
trunc_type = 'post'

# --- Not referenced by the cells shown here ---
embedding_dim = 100    # presumably the embedding width of a downstream model — TODO confirm

In [65]:
# Load the test split from data/test as batches of (texts, labels).
# shuffle=False keeps the batch order fixed from run to run.
test_dataset = text_dataset_from_directory(
    'data/test',
    label_mode='categorical',
    batch_size=batch_size,
    max_length=max_char_length,
    shuffle=False,
)

Found 7761 files belonging to 20 classes.


In [81]:
class Preprocessor:
    """Light text cleaner applied to each raw document before tokenisation."""

    def run(self, sentence):
        """Clean a single document string.

        Steps, in order:
          1. drop everything up to and including the first '@';
          2. replace each literal two-character sequence backslash + 'n' with a space;
          3. remove every remaining backslash.

        Args:
            sentence: the raw document text (str).

        Returns:
            The cleaned text (str).
        """
        sentence = self.remove_before_word(sentence, '@')
        sentence = sentence.replace("\\n", ' ')
        sentence = sentence.replace("\\", '')
        return sentence

    def remove_before_word(self, sentence, word):
        """Return the part of ``sentence`` that follows the first ``word``.

        If ``word`` does not occur, a notice is printed and ``sentence`` is
        returned unchanged.

        Args:
            sentence: text to trim.
            word: marker to search for; the marker itself is removed as well.

        Returns:
            The trimmed text, or the original text when the marker is absent.
        """
        try:
            subject_index = sentence.index(word)
        except ValueError:
            # Only "marker not found" is expected here; any other error
            # (wrong type, interrupt, ...) must surface instead of being hidden.
            print(f"HEY! No word '{word}' in {sentence[:30]}")
            return sentence
        return sentence[subject_index + len(word):]


preprocessor = Preprocessor()

In [84]:
# Setup the tokenizer

tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)

# Number of leading batches used to build the vocabulary.
# NOTE(review): the dataset is loaded with shuffle=False, so these batches all come
# from the start of the data — confirm that such a small slice is intended.
fit_batches = 2

for texts, _labels in test_dataset.take(fit_batches):
    # errors="ignore": the texts arrive as raw bytes and are presumably cut to
    # max_char_length bytes, which can split a multi-byte character (TODO confirm);
    # a strict decode would raise UnicodeDecodeError on such input.
    clean_texts = [
        preprocessor.run(raw_text.decode("utf-8", errors="ignore"))
        for raw_text in texts.numpy()
    ]
    tokenizer.fit_on_texts(clean_texts)

print(tokenizer.word_index)

{'<OOV>': 1, 'the': 2, 'edu': 3, 'to': 4, 'in': 5, 'of': 6, 're': 7, 'com': 8, 'writes': 9, 'subject': 10, 'lines': 11, 'organization': 12, 'a': 13, 'is': 14, 'article': 15, 'i': 16, 'you': 17, 'and': 18, 'that': 19, 'this': 20, 'university': 21, 'it': 22, 'for': 23, 'not': 24, 'be': 25, 'posting': 26, 'cs': 27, 'atheists': 28, 'nntp': 29, 'host': 30, 'at': 31, 'morality': 32, 'dsinc': 33, 'about': 34, 'god': 35, 'vice': 36, 'ico': 37, 'tek': 38, 'with': 39, 'what': 40, 'jaeger': 41, 'jim': 42, 'on': 43, 'was': 44, 'bu': 45, 'reply': 46, 'my': 47, 'as': 48, 'or': 49, 'uiuc': 50, 'are': 51, 'some': 52, 'me': 53, 'by': 54, 'they': 55, 'inc': 56, 'have': 57, 'has': 58, 'cobb': 59, 'distribution': 60, 'were': 61, 'amusing': 62, 'timmbake': 63, 'mcl': 64, 'uk': 65, 'beauchaine': 66, 'how': 67, 'would': 68, 'think': 69, 'but': 70, 'do': 71, 'perry': 72, 'robert': 73, 'so': 74, 'christian': 75, 'we': 76, 'washington': 77, 'from': 78, 'david': 79, 'nye': 80, 'he': 81, 'which': 82, 'timmons': 8

In [88]:
with open('test_data.csv', 'w') as data_file:
    writer = csv.writer(data_file)
    i = 0
    for batch in test_dataset:
        texts, labels = batch
        clean_texts = [preprocessor.run(text.decode("utf-8")) for text in texts.numpy()]
        sequences = tokenizer.texts_to_sequences(clean_texts)
        sequences = pad_sequences(sequences, sequence_length, padding=padding_type, truncating=trunc_type)
        writer.writerows(list(sequence) + list(label) for sequence, label in zip(sequences, labels))
        print(tokenizer.texts_to_sequences(['people might think it takes faith to be an atheist']))
        i += 1
        if i > 1:
            break
! cat test_data.csv

[[615, 335, 69, 22, 333, 97, 4, 25, 87, 616]]
[[615, 335, 69, 22, 333, 97, 4, 25, 87, 616]]
114,115,116,8,117,118,10,7,34,2,163,529,530,12,31,164,60,146,11,212,5,15,165,531,532,166,167,3,165,166,167,3,301,213,302,9,533,2,303,534,51,43,2,304,6,2,305,119,35,306,535,168,536,537,81,44,538,4,539,82,61,540,4,25,541,2,304,6,2,305,542,543,18,120,2,544,545,121,307,2,546,6,547,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"tf.Tensor(1.0, shape=(), dtype=float32)","tf.Tensor(0.0, shape=(), dtype=float32)","tf.Tensor(0.0, shape=(), dtype=float32)","tf.Tensor(0.0, shape=(), dtype=float32)","tf.Tensor(0.0, shape=(), dtype=float32)","tf.Tensor(0.0, shape=(), dtype=float32)","tf.Tensor(0.0, shape=(), dtype=float32)","tf.Tensor(0.0, shape=(), dtype=float32)","tf.Tensor(0.0, shape=(), dtype=float32)","tf.Tensor(0.0, shape=(), dtype=float32)","tf.Tensor(0.0, shape=(), dtype=float32)","tf.Tensor(0.0, shape=(), dtype=float32)","tf.Tensor(0.0, shape=(), dtype=float32)","tf.Tensor(0.0, shape=(), dtype=float32)","tf.Tens

In [92]:
# Print the number of fields in each CSV row; every row is expected to hold the
# padded token ids followed by the label values.
with open('test_data.csv', 'r') as csv_file:
    for fields in csv.reader(csv_file):
        print(len(fields))


120
120
120
120
120
120
120
120
120
120
120
120
120
120
120
120
120
120
120
120


In [98]:
# Number of entries in the tokenizer's word index (this count includes the "<OOV>" entry).
len(tokenizer.word_index)

1171