<a href="https://colab.research.google.com/github/AlejandroMagdaleno/Natural-Language-Processing-CS4742/blob/main/NLP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing


data_dir = '/content/drive/MyDrive/NEWS'

train_dir = os.path.join(data_dir, 'train')

batch_size = 32
seed = 42


raw_train_dataset = tf.keras.preprocessing.text_dataset_from_directory('/content/drive/MyDrive/NEWS/train', 
                                                                       batch_size = batch_size,
                                                                       seed=seed)


raw_test_dataset = tf.keras.preprocessing.text_dataset_from_directory('/content/drive/MyDrive/NEWS/test',
                                                                      batch_size=batch_size)

Found 88 files belonging to 2 classes.
Found 25 files belonging to 2 classes.


In [24]:
max_features = 10000
sequence_length = 250

vectorize_layer = layers.TextVectorization(max_tokens = max_features,
                                           output_mode = 'int',
                                           output_sequence_length = sequence_length)

train_text = raw_train_dataset.map(lambda x, y: x)
vectorize_layer.adapt(train_text) ## Calling adapt to create a vocabulary and frequency from values in the data. This is Tensorflows implementation of embedding



In [25]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label
  
train_data = raw_train_dataset.map(vectorize_text)
test_data = raw_test_dataset.map(vectorize_text)

AUTOTUNE = tf.data.AUTOTUNE         ## OPTIONAL, we are prefetching the next data while we are training on current step x to speed process. 
                                    ## Autotune will set the number of elements to prefetch which should ideally be the number of batches consumed. In our case 32
train_data = train_data.cache().prefetch(buffer_size=AUTOTUNE)
test_data = test_data.cache().prefetch(buffer_size=AUTOTUNE)

In [33]:
embedding_dim = 16 ##Embedding layer dimsensions for vocabulary

## Customize neural network for complexity 
model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),   
  layers.Dense(5),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(5),
  layers.Dense(1)]) ## Has to stay as one due to binary classification. Only need one neuron for final layer

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 16)          160016    
_________________________________________________________________
dropout_6 (Dropout)          (None, None, 16)          0         
_________________________________________________________________
dense_4 (Dense)              (None, None, 5)           85        
_________________________________________________________________
global_average_pooling1d_3 ( (None, 5)                 0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 5)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 30        
_________________________________________________________________
dense_6 (Dense)              (None, 1)                

In [34]:
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),optimizer='adam',metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

In [35]:
epochs = 50
history = model.fit(
    train_data,
    epochs=epochs)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [29]:
loss, accuracy = model.evaluate(test_data)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.5813011527061462
Accuracy:  0.7200000286102295
