In [92]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re

from sklearn.datasets import fetch_20newsgroups

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [90]:
print('GPU name: ', tf.config.experimental.list_physical_devices('GPU'))

GPU name:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2023-02-24 12:11:49.519375: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-24 12:11:49.519873: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-24 12:11:49.520302: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


Getting the data from the dataset fetch_20newsgroups.

To this project, was used just 10 of the 20 categories, more than that could take much longer to train the algorithms



### Importing data

In [None]:
categories = [
 'comp.graphics',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'talk.politics.misc',
 'talk.religion.misc']

In [74]:
train_data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42
)
train_y = train_data.target

test_data = fetch_20newsgroups(
    subset='test', 
    categories=categories, 
    shuffle=True, 
    random_state=42
)
test_y = test_data.target

print(f"There are {len(train_data.data)} news for training")
print(f"There are {len(test_data.data)} news for testing")

There are 5578 news for training
There are 3714 news for testing


The data is in it's raw form, to don't input too much desnecessary data, it need to be cleaned.

So, remove some features of the news, such as the from and some not alphabetical chars

### Cleaning data

In [82]:
def adjust_data(data):
    """
    function to run in each string of the data set and apply the cleaning 
    
    Args:
        temp (string): variable to hold the values on each string while cleaning the dataset

    Returns:
        data (list of string) - lists containing the data cleaned
    """
    
    for i in range(len(data)):
        temp = data[i]
        temp = temp.split('\n')
        #removing the 'from' headline
        temp  = temp[1:]

        #removing empty indexes
        temp = [x for x in temp if x != '']

        #removing not alphabetical chars
        temp = [re.sub(r'/^[\w&.\-]+$/',' ',i) for i in temp]
    
        data[i] = ' '.join(temp)
    
    return data

Using the cleaning function and separating the data.

In [84]:
train_x = adjust_data(train_data.data)
test_x = adjust_data(test_data.data)

### Creating validation split

In [86]:
def train_val_split(News, labels):
    """
    Splits the dataset into training and validation sets
    split in 80% to training and 20% to validation
    
    Args:
        News (list of string): lower-cased News
        labels (list of string): list of labels
    
    Returns:
        train_x, val_x, train_y, val_y - lists containing the data splits
    """
    
    
    # Compute the number of News that will be used for training (should be an integer)
    training_size = int(len(News)*0.8)

    # Split the News and labels into train/validation splits
    train_x = News[0:training_size]
    train_y = labels[0:training_size]

    val_x = News[training_size:]
    val_y = labels[training_size:]
    
    return train_x, val_x, train_y, val_y

In [87]:
train_x, val_x, train_y, val_y = train_val_split(train_x, train_y)

### Tokenazing and padding data

To input the data in the algorithm it's needed to change the vacabulary data to a numeric data, so, keras tokenizer can help with that

after the tokenization, it's necessary a padding to every new have the same len.

In [96]:
token_dict =  {
    'NUM_WORDS': 1000,
    'PADDING' : 'post',
    'OOV_TOKEN' : '<OOV>'
    }

In [97]:
tokenizer = Tokenizer(num_words=token_dict['NUM_WORDS'], 
                        oov_token=token_dict['OOV_TOKEN'])

tokenizer.fit_on_texts(train_x)

word_index = tokenizer.word_index

print(f"Vocabulary contains {len(word_index)} words\n")

Vocabulary contains 56190 words



In [None]:
    sequences = tokenizer.texts_to_sequences(sentences)
    
    # Pad the sequences using the correct padding and maxlen
    padded_sequences = pad_sequences(sequences, maxlen=maxlen,
                                    padding=padding)