In [1]:
import pathlib
import random
import string
import tensorflow.strings as tf_strings
import tensorflow.data as tf_data
import re
from tensorflow.keras.layers import TextVectorization
import keras
import tensorflow as tf
from keras import layers
import json

### List the devices (CPU/GPU) that TensorFlow can see on this system

In [4]:
from tensorflow.python.client import device_lib

# Ask TensorFlow for the devices it can see locally and print the resulting
# list; a GPU, if one is available, would show up as an extra entry here.
local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 4337202624739111990
xla_global_id: -1
]


##### Download and prepare the dataset
Source: http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip

Purpose: This block of code downloads a zip file containing the parallel corpus from a specified URL and extracts it.

Parameters:

fname: The local filename to save the downloaded file as.
origin: The URL from which to download the file.
extract: A boolean value indicating whether to extract the contents of the zip file.
Example:
The zip file spa-eng.zip contains parallel text data in English and Spanish. After extraction, the contents will be available in the directory where the zip file was downloaded.

In [14]:
# Download the English-Spanish archive into the Keras cache and extract it.
# The archive is fetched over HTTPS so its contents are protected in transit.
# The value returned by get_file gets its own name instead of being stored
# in `text_file`, which from here on only ever refers to the extracted text file.
archive_path = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)

# Path to the extracted text file: "spa-eng/spa.txt", located in the same
# directory as the path returned by get_file.
text_file = pathlib.Path(archive_path).parent / "spa-eng" / "spa.txt"
print(f"Path to the text file: {text_file}")

Path to the text file: C:\Users\brije\.keras\datasets\spa-eng\spa.txt


#### Reading and Splitting the File into Lines

Purpose: Reads the entire content of the text file and splits it into individual lines.

Explanation:

open(text_file, "r"): Opens the text file in read mode.
f.read(): Reads the entire file content as a single string.
.split("\n"): Splits the string into a list of lines based on the newline character.
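[:-1]: Drops the last element of the list, which is the empty string left over after the file's final newline. Note that the slice is unconditional: it removes the last element whether or not it is empty.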

In [15]:
# The corpus contains accented Spanish characters, so decode it explicitly as
# UTF-8 instead of relying on the platform's default text encoding.
with open(text_file, "r", encoding="utf-8") as f:
    # Split on newlines; [:-1] discards the final element of the split,
    # i.e. whatever follows the last "\n" in the file.
    lines = f.read().split("\n")[:-1]
    

#### Creating Pairs of English and Spanish Sentences
Purpose: Processes each line to create pairs of English and Spanish sentences, with special tokens added to the Spanish sentences.

In [16]:
# Each line is split on the tab character into an (English, Spanish) pair.
# The Spanish side is wrapped in "[start]" / "[end]" marker tokens.
text_pairs = [
    (eng, "[start] " + spa + " [end]")
    for eng, spa in (line.split("\t") for line in lines)
]

Explanation:

text_pairs = []: Initializes an empty list to store the sentence pairs.

for line in lines: Iterates over each line in the lines list.

eng, spa = line.split("\t"): Splits each line into English and Spanish sentences based on the tab character.

spa = "[start] " + spa + " [end]": Adds the [start] and [end] tokens to the Spanish sentence.

text_pairs.append((eng, spa)): Adds the pair of sentences to the text_pairs list.

example :

["Go.\tVe.", "Hi.\tHola.", "Run.\tCorre."]

result:

[("Go.", "[start] Ve. [end]"), ("Hi.", "[start] Hola. [end]"), ("Run.", "[start] Corre. [end]")]


#### Shuffle the sentence pairs randomly
Shuffling the sentence pairs with random.shuffle ensures that the data is randomly ordered before it is split, which can improve the training process for machine learning models.

In [18]:
# Shuffle the sentence pairs in place so that any later slicing of the list
# draws from a random ordering of the corpus.
# A dedicated, seeded generator makes the ordering identical on every fresh
# "Restart & Run All" and leaves the global `random` state untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(text_pairs)
print("Successfully created and shuffled text pairs.")

Successfully created and shuffled text pairs.


In [20]:
# Print up to five example pairs. Slicing (instead of indexing 0..4) also
# works when fewer than five pairs are available, rather than raising IndexError.
for pair in text_pairs[:5]:
    print(pair)

('I was burned up by what he said.', '[start] Me tiene ardido con lo que me dijo. [end]')
("He's stronger than you.", '[start] Él es más fuerte que tú. [end]')
('That guy has a screw loose!', '[start] ¡Ese tipo tiene suelto un tornillo! [end]')
('I need your cooperation.', '[start] Necesito tu cooperación. [end]')
('Have you ever spent any time in Boston?', '[start] ¿Alguna vez has pasado tiempo en Boston? [end]')


#### Split the dataset into training, validation, and test sets


In [22]:
# Hold out 15% of all pairs for validation (truncated to a whole number).
total_pairs = len(text_pairs)
num_val_samples = int(0.15 * total_pairs)

# The test set gets the same number of pairs as the validation set, so the
# training set is whatever remains after removing 2 * num_val_samples pairs.
num_train_samples = total_pairs - 2 * num_val_samples

# Slice boundaries over the list:
#   [0, num_train_samples)        -> training
#   [num_train_samples, val_end)  -> validation
#   [val_end, end of list)        -> test
val_end = num_train_samples + num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:val_end]
test_pairs = text_pairs[val_end:]

# Report the size of each split.
print(f"{total_pairs} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")


118964 total pairs
83276 training pairs
17844 validation pairs
17844 test pairs


#### EXPLANATION


##### len(text_pairs):   
Gets the total number of text pairs.

0.15 * len(text_pairs): Calculates 15% of the total text pairs.

int(...): Converts the result to an integer (since the number of samples must be a whole number).

This gives the number of samples to be used for validation.

###

##### 2 * num_val_samples: 
Since we want the validation and test sets to be of equal size, we multiply the number of validation samples by 2.

len(text_pairs) - 2 * num_val_samples: Subtracts the combined number of validation and test samples from the total number of text pairs.

This gives the number of samples to be used for training.

train_pairs = text_pairs[:num_train_samples]: Takes the first num_train_samples pairs for training.

val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]: Takes the next num_val_samples pairs for validation.

test_pairs = text_pairs[num_train_samples + num_val_samples:]: Takes the remaining pairs for testing.