## Prepare Environment

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Apr 23 04:39:17 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   72C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
from psutil import virtual_memory
# Total system memory in decimal gigabytes (bytes / 1e9), as reported by psutil.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Runtimes with at least 20 GB are reported as "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
# Install the `datasets` package (imported below for load_from_disk and friends).
# NOTE(review): the version is not pinned, so a re-run may pull a different release.
!pip install datasets



In [None]:
# Install numpy pinned to 2.0.2 and upgrade keras.
# Author's note: finding a numpy version compatible with the rest of the stack
# took roughly three hours of dependency trouble, so keep this pin.
!pip install --upgrade numpy==2.0.2 keras




## Prepare Models

In [None]:
from datasets import load_dataset, Dataset, load_from_disk, concatenate_datasets
import pandas as pd
import nltk
from tqdm import tqdm
import time

In [None]:
# Directory (relative to the working directory) holding the translated datasets on Drive.
folder_prefix = "drive/MyDrive/rus_news_classifier_translations/"
# Dataset split name; interpolated into the on-disk dataset folder name below.
name = 'train'

In [None]:
# Load the translated (and de-hallucinated) dataset saved on disk and show its columns.
ds = load_from_disk(f"{folder_prefix}translated_{name}_dataset_dehallucinated")
# ds.to_pandas()
print(ds.column_names)
# Maps the integer values of the `labels` column to category names.
# Entries with a trailing English comment use the Uzbek dataset's category name
# for the same topic, so both sources share one label; the rest keep English names.
categories_translator = {
  0: 'climate',
  1: 'conflicts',
  2: 'Madaniyat', # culture
  3: 'Iqtisodiyot', # economy
  4: 'gloss',
  5: 'Salomatlik', # health
  6: 'Siyosat', # politics
  7: 'science',
  8: 'society',
  9: 'Sport', # sports
  10: 'travel'
}

['news', 'labels', 'news_cleaned', 'news_cleaned_dehalluc']


The following code is adapted, with heavy modifications, from the VDCNN repository on GitHub.

In [None]:
# util.py
import numpy as np
import tensorflow as tf

# Upper bound on the number of files load_uzbek reads per category
# (kept small-ish for testing purposes).
MAX_ENTRIES_PER_CATEGORY = 10000
# Dataset selector; it indexes the three parallel lists below and the loader list.
DATASET = 2 # 0 for sms, 1 for uzbek, 2 for combined uzbek-russian
# Number of output classes for the selected dataset.
NUM_CLASSES = [2, 15, 21][DATASET]
# Architecture name understood by VDCNN_classification.train.
MODEL = "VDCNN9"
# Label name -> class index for the selected dataset.
# The combined mapping (third entry) keeps the 15 Uzbek indices unchanged and
# appends six additional English-named categories at indices 15-20.
MAPPING =  [{'ham': 0, 'spam': 1},
{
  'Avto': 0,
  'Dunyo': 1,
  'Iqtisodiyot': 2,
  'Jinoyat': 3,
  'O‘zbekiston': 4,
  'Qonunchilik': 5,
  'Siyosat': 6,
  'Texnologiya': 7,
  'Ayollar': 8,
  'Foto': 9,
  'Jamiyat': 10,
  'Madaniyat': 11,
  'Pazandachilik': 12,
  'Salomatlik': 13,
  'Sport': 14
},
{
  'Avto': 0,
  'Dunyo': 1,
  'Iqtisodiyot': 2,
  'Jinoyat': 3,
  'O‘zbekiston': 4,
  'Qonunchilik': 5,
  'Siyosat': 6,
  'Texnologiya': 7,
  'Ayollar': 8,
  'Foto': 9,
  'Jamiyat': 10,
  'Madaniyat': 11,
  'Pazandachilik': 12,
  'Salomatlik': 13,
  'Sport': 14,
  'climate': 15,
  'conflicts': 16,
  'gloss': 17,
  'science': 18,
  'society': 19,
  'travel': 20
}
][DATASET]


def load_spam(rdr):
    """Read (label, text) rows from an iterable of CSV rows.

    The first row is treated as a header and skipped, as are rows whose text
    field (column 1) is empty.

    Args:
        rdr: iterable of rows, where row[0] is the label and row[1] the text
            (e.g. a csv.reader).

    Returns:
        (sentence, label): `sentence` is a list of per-sample lists of
        lower-cased characters; `label` is the list of stripped label strings.
    """
    sentence, label = [], []
    rows = iter(rdr)
    next(rows, None)  # drop the header row
    for row in rows:
        text = row[1]
        if len(text) == 0:
            continue
        sentence.append([ch.lower() for ch in text])
        label.append(row[0].strip())
    return sentence, label

def load_uzbek(text_files_by_category):
    """Load Uzbek news articles as character sequences.

    Each file becomes one sample. Its lines are lower-cased and stripped, blank
    lines are dropped, the remaining lines are joined with a single space, and
    the result is split into characters. This matches load_spam, which also
    produces per-character tokens. Previously every *line* was one token, so the
    vocabulary consisted of whole lines and nearly all unseen text mapped to OOV.

    NOTE(review): character sequences are much longer than line sequences;
    check the resulting max_len / memory use downstream.

    Args:
        text_files_by_category: dict mapping a category name to a list of text
            file paths. At most MAX_ENTRIES_PER_CATEGORY files are read per category.

    Returns:
        (sentence, label): list of per-article character lists and the list of
        category names, aligned by position.
    """
    sentence = [] # the articles to be classified, one list of characters each
    label = [] # category name of each article

    for category, files in text_files_by_category.items():
        print(f"Category: {category}, Number of files: {len(files)}")
        for idx, file in enumerate(files):
            if idx >= MAX_ENTRIES_PER_CATEGORY:
                break
            # utf-8-sig transparently drops a leading BOM if the file has one.
            with open(file, 'r', encoding='utf-8-sig') as f:
                lines = [line.lower().strip() for line in f]
            text = " ".join(line for line in lines if line)
            sentence.append(list(text))
            label.append(category)
    return sentence, label

def load_combined(text_files_by_category):
    """Load the translated dataset plus the Uzbek news files.

    Translated rows come from the on-disk dataset named by `folder_prefix` and
    `name`; only rows whose `news_cleaned_dehalluc` text is longer than 250
    characters are kept. Their integer labels are converted to category names
    through `categories_translator`.

    The text is lower-cased and stripped, and stored as a list of characters, so
    these samples are normalized like those of the other loaders (which
    lower-case their input) and every sample has the same type. Previously the
    rows were appended as raw mixed-case strings.

    Args:
        text_files_by_category: dict of category name -> list of file paths,
            forwarded to load_uzbek.

    Returns:
        (sentence, label): translated samples first, then the Uzbek samples.
    """
    sentence = []
    label = []
    ds = load_from_disk(f"{folder_prefix}translated_{name}_dataset_dehallucinated")
    for row in ds:
        text = row['news_cleaned_dehalluc']
        if len(text) > 250:
            sentence.append(list(text.lower().strip()))
            label.append(categories_translator[row['labels']])
    s2, l2 = load_uzbek(text_files_by_category)
    # Sample counts: translated vs. Uzbek, then the combined total.
    print(len(label), len(l2))
    sentence.extend(s2)
    label.extend(l2)
    print(len(label))
    return sentence, label

# Loader for the selected dataset. Note the loaders take different arguments:
# load_spam expects an iterable of CSV rows, the other two a category -> files dict.
load_data = [load_spam, load_uzbek, load_combined][DATASET]


def tensor_transform(X, y, batch_size):
    """Wrap features and targets in a batched tf.data.Dataset.

    Args:
        X: array-like of samples.
        y: array-like of targets, aligned with X.
        batch_size: number of (X, y) pairs per batch.

    Returns:
        A tf.data.Dataset yielding (X, y) batches in the given order (no shuffling).
    """
    features = np.array(X)
    targets = np.array(y)
    return tf.data.Dataset.from_tensor_slices((features, targets)).batch(batch_size)


def word_to_index(vocab_index, data):
    """Translate token sequences into sequences of vocabulary indices.

    Args:
        vocab_index: dict mapping token -> integer index; must contain the key
            'OOV' if any token in `data` is missing from it.
        data: iterable of samples, each an iterable of tokens.

    Returns:
        A list with one list of indices per sample; unknown tokens map to
        vocab_index['OOV'].
    """
    return [
        [vocab_index[token] if token in vocab_index else vocab_index['OOV'] for token in sample]
        for sample in data
    ]

In [None]:
# models.py
import tensorflow as  tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model


class ZeroPadding(layers.Layer):
    """Pads the third axis of a rank-3 tensor with zeros.

    `values` is a two-element sequence: the number of zeros added before and
    after the existing entries along that axis. The first two axes are untouched.
    """

    def __init__(self, values):
        super().__init__()
        self.values = values

    def call(self, x):
        pad_before, pad_after = self.values[0], self.values[1]
        paddings = [[0, 0], [0, 0], [pad_before, pad_after]]
        return tf.pad(x, paddings, mode='CONSTANT', constant_values=0)


class ConvBlock(layers.Layer):
    """Two Conv1D -> BatchNorm -> ReLU stages followed by a shortcut.

    With pool=False the block adds its input to the output of the second stage.
    With pool=True the second-stage output is downsampled along the sequence
    axis by two paths (a stride-2 convolution with batch norm, and a stride-2
    max pooling), the two are summed, and the result is zero-padded on the last
    axis by `filters` entries in total, i.e. the channel count doubles.

    Args:
        filters: number of filters of every Conv1D in the block.
        kernel_size: kernel size of the convolutions and pool size of the max pooling.
        padding: padding mode passed to the Conv1D layers.
        pool: whether to downsample and widen (True) or keep the shape (False).
    """

    def __init__(self, filters=256, kernel_size=3, padding='same', pool=False):
        super(ConvBlock, self).__init__()
        self.filters = filters
        self.kernel_size = kernel_size
        self.padding = padding
        self.pool = pool

        # First convolution stage.
        self.conv1 = layers.Conv1D(self.filters, self.kernel_size, strides=1, padding=self.padding)
        self.bn1 = layers.BatchNormalization()
        self.relu1 = layers.Activation('relu')

        # Second convolution stage.
        self.conv2 = layers.Conv1D(self.filters, self.kernel_size, strides=1, padding=self.padding)
        self.bn2 = layers.BatchNormalization()
        self.relu2 = layers.Activation('relu')

        # Shortcut/downsampling layers. They are always created, but all except
        # shortcut_add are only used when pool=True.
        self.shortcut_conv = layers.Conv1D(self.filters, self.kernel_size, strides=2, padding=self.padding)
        self.shortcut_bn = layers.BatchNormalization()
        self.shortcut_pad = layers.MaxPooling1D(pool_size=kernel_size, strides=2, padding='same')
        self.shortcut_add = layers.Add()
        # Pads filters // 2 zeros before and the remainder after, `filters` in total.
        self.zero_padding = ZeroPadding([int(self.filters // 2), self.filters - int(self.filters // 2)])

    def call(self, inputs):
        """Apply the block to `inputs` and return the resulting tensor."""
        cnn1 = self.conv1(inputs)
        cnn1 = self.bn1(cnn1)
        cnn1 = self.relu1(cnn1)

        cnn2 = self.conv2(cnn1)
        cnn2 = self.bn2(cnn2)
        cnn2 = self.relu2(cnn2)

        if self.pool:
            # Both paths halve the sequence axis (stride 2) so they can be summed.
            downsample = self.shortcut_conv(cnn2)
            downsample = self.shortcut_bn(downsample)
            conv_pool = self.shortcut_pad(cnn2)
            conv_shortcut = self.shortcut_add([downsample, conv_pool])
            # Widen the last axis with zeros for the next, wider stage.
            conv_project = self.zero_padding(conv_shortcut)
            return conv_project

        else:
            # Residual connection.
            # NOTE(review): presumably requires `inputs` to already have
            # `filters` channels so the shapes match - confirm against callers.
            conv_shortcut = self.shortcut_add([cnn2, inputs])
            return conv_shortcut


class K_Max_Pooling(layers.Layer):
    """Keeps the k largest values along axis 1 for every channel.

    The input is permuted so the pooled axis is last, tf.math.top_k selects the
    k largest entries (sorted=False, so their order is unspecified), and the
    result is permuted back, giving k entries along axis 1.

    The two Permute layers are created once in __init__ rather than on every
    call, following the usual Keras convention that sub-layers are built in the
    constructor.

    Args:
        k: number of values to keep along the pooled axis.
    """

    def __init__(self, k):
        super(K_Max_Pooling, self).__init__()
        self.k = k
        self.to_channels_first = layers.Permute((2, 1))
        self.to_channels_last = layers.Permute((2, 1))

    def call(self, inputs):
        input_transpose = self.to_channels_first(inputs)
        top_k, _ = tf.math.top_k(input_transpose, k=self.k, sorted=False)
        return self.to_channels_last(top_k)


def VDCNN(self, depth):
    """Build, compile and summarize a VDCNN classifier.

    The network is: embedding -> Conv1D(64) -> four stages of ConvBlocks, each
    ending in a pooling ConvBlock -> k-max pooling -> two Dense(2048) layers ->
    softmax over `self.class_num` classes.

    Every ConvBlock now uses `self.kernel_size`; the first pooling block used to
    hard-code a kernel size of 3, ignoring the configured value.

    Args:
        self: configuration object providing max_len, vocab_size,
            embedding_size, kernel_size, k, class_num, loss and opt.
        depth: one of 9, 17, 29 or 49; any other value raises KeyError.

    Returns:
        The compiled Keras Model (its summary is printed as a side effect).
    """
    # Number of ConvBlocks per stage, including the stage's final pooling block.
    model_depth = {9: [1, 1, 1, 1], 17: [2, 2, 2, 2], 29: [5, 5, 2, 2], 49: [8, 8, 5, 3]}
    blocks_64, blocks_128, blocks_256, blocks_512 = model_depth[depth]

    inputs = layers.Input((self.max_len,))
    embedding = layers.Embedding(self.vocab_size, self.embedding_size)(inputs)

    temp_conv_64 = layers.Conv1D(filters=64, kernel_size=self.kernel_size, strides=1, padding='same')(embedding)
    for _ in range(blocks_64 - 1):  # 64
        temp_conv_64 = ConvBlock(filters=64, kernel_size=self.kernel_size)(temp_conv_64)
    # Pooling blocks halve the sequence axis and zero-pad the channels to twice `filters`.
    temp_conv_128 = ConvBlock(filters=64, kernel_size=self.kernel_size, pool=True)(temp_conv_64)

    for _ in range(blocks_128 - 1):  # 128
        temp_conv_128 = ConvBlock(filters=128, kernel_size=self.kernel_size)(temp_conv_128)
    temp_conv_256 = ConvBlock(filters=128, kernel_size=self.kernel_size, pool=True)(temp_conv_128)

    for _ in range(blocks_256 - 1):  # 256
        temp_conv_256 = ConvBlock(filters=256, kernel_size=self.kernel_size)(temp_conv_256)
    temp_conv_512 = ConvBlock(filters=256, kernel_size=self.kernel_size, pool=True)(temp_conv_256)

    for _ in range(blocks_512 - 1):  # 512
        temp_conv_512 = ConvBlock(filters=512, kernel_size=self.kernel_size)(temp_conv_512)
    temp_conv_512 = ConvBlock(filters=512, kernel_size=self.kernel_size, pool=True)(temp_conv_512)

    output = K_Max_Pooling(k=self.k)(temp_conv_512)

    output = layers.Flatten()(output)

    output = layers.Dense(2048, activation='relu')(output)

    output = layers.Dense(2048, activation='relu')(output)

    output = layers.Dense(self.class_num, activation='softmax')(output)

    model = Model(inputs=inputs, outputs=output)

    model.compile(loss=self.loss, optimizer=self.opt, metrics=['accuracy'])

    model.summary()

    return model

In [None]:
#model_train.py
# import models
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf


class VDCNN_classification(object):
    """Configuration holder and training driver for a VDCNN text classifier.

    Args:
        model_name: 'VDCNN9', 'VDCNN17', 'VDCNN29' or 'VDCNN49'.
        max_len: input sequence length.
        vocab_size: input dimension of the embedding layer.
        embedding_size: output dimension of the embedding layer.
        kernel_size: kernel size of the convolutions.
        k: number of values kept by the k-max pooling layer.
        class_num: number of output classes.
        opt: Keras optimizer (instance or name). When None, a fresh
            tf.keras.optimizers.SGD() is created for this instance. The default
            used to be a single SGD object built at definition time and shared
            by every instance, and therefore by every model they compiled.
        batch_size: stored for reference; train() does not pass it to fit()
            because the datasets are expected to be batched already.
        epochs: maximum number of training epochs.
        loss: loss passed to model.compile. For one-hot labels with more than
            two classes pass 'categorical_crossentropy'.
        train_data, val_data, test_data: batched datasets of (features, labels).
    """

    # Supported model names and the network depth each one selects.
    _DEPTH_BY_NAME = {'VDCNN9': 9, 'VDCNN17': 17, 'VDCNN29': 29, 'VDCNN49': 49}

    def __init__(self, model_name='VDCNN29', max_len=1024, vocab_size=10000, embedding_size=16,
                kernel_size=3, k=8, class_num=2, opt=None, batch_size=128,
                epochs=100, loss='binary_crossentropy', train_data=None, val_data=None, test_data=None):
        self.model_name = model_name
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.kernel_size = kernel_size
        self.k = k
        self.class_num = class_num
        # Build the default optimizer per instance so models never share optimizer state.
        self.opt = tf.keras.optimizers.SGD() if opt is None else opt
        self.batch_size = batch_size
        self.epochs = epochs
        self.loss = loss
        self.train_data = train_data
        self.val_data = val_data
        self.test_data = test_data

    def train(self):
        """Build the model, fit it with early stopping and evaluate on the test data.

        Returns:
            The trained Keras model, restored to the weights of the best
            validation-loss epoch.

        Raises:
            ValueError: if `model_name` is not one of the supported names
                (previously this surfaced as an UnboundLocalError).
        """
        if self.model_name not in self._DEPTH_BY_NAME:
            raise ValueError(
                "Unknown model_name {!r}; expected one of {}".format(
                    self.model_name, sorted(self._DEPTH_BY_NAME)))
        model = VDCNN(self, depth=self._DEPTH_BY_NAME[self.model_name])

        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4, restore_best_weights=True)

        # batch_size is intentionally not forwarded: the datasets are already
        # batched, and Keras documents that it must not be given for dataset inputs.
        model.fit(self.train_data, epochs=self.epochs,
                  validation_data=self.val_data, callbacks=[es])

        test_loss, test_acc = model.evaluate(self.test_data)

        print("TEST Loss : {:.6f}".format(test_loss))
        print("TEST ACC : {:.6f}".format(test_acc))

        return model

## Examine Sample SMS Spam dataset from VDCNN Paper

In [None]:
import requests
import zipfile
import io

# Download the SMS spam zip archive. The timeout makes a stalled connection
# fail instead of hanging the notebook indefinitely.
url = "https://www.kaggle.com/api/v1/datasets/download/uciml/sms-spam-collection-dataset"
response = requests.get(url, timeout=60)
response.raise_for_status()  # Raise an exception for bad status codes

# Extract the archive (held in memory) into the local "sms-spam" folder
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    zip_ref.extractall("sms-spam")

print("Download and extraction complete.")

Download and extraction complete.


In [None]:
# Check which files the archive contained.
!ls sms-spam

spam.csv


In [None]:
! cat sms-spam/spam.csv

v1,v2,,,
ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,
ham,Ok lar... Joking wif u oni...,,,
spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,,,
ham,U dun say so early hor... U c already then say...,,,
ham,"Nah I don't think he goes to usf, he lives around here though",,,
spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, �1.50 to rcv",,,
ham,Even my brother is not like to speak with me. They treat me like aids patent.,,,
ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune,,,
spam,WINNER!! As a valued network customer you have been selected to receivea �900 prize reward! To claim call 0906170146

## Download Uzbek dataset

In [None]:
import requests
import zipfile
import io

In [None]:
# Download the Uzbek news zip archive. The timeout makes a stalled connection
# fail instead of hanging the notebook indefinitely.
url = "https://zenodo.org/records/7677431/files/Uzbek_News_Dataset.zip?download=1"
response = requests.get(url, timeout=60)
response.raise_for_status()  # Raise an exception for bad status codes
print("Download complete.")

# Extract the archive (held in memory) into the local "Uzbek_News_Dataset" folder
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    zip_ref.extractall("Uzbek_News_Dataset")

print("Download and extraction complete.")


Download complete.
Download and extraction complete.


In [None]:
# List the category folders, then print one sample article from the Jinoyat folder.
!ls Uzbek_News_Dataset/Uzbek_News_Dataset
!cat Uzbek_News_Dataset/Uzbek_News_Dataset/Jinoyat/1557.txt

Avto	 Foto	      Jinoyat	   Pazandachilik  Siyosat
Ayollar  Iqtisodiyot  Madaniyat    Qonunchilik	  Sport
Dunyo	 Jamiyat      O‘zbekiston  Salomatlik	  Texnologiya
Navoiyda hokimni sudga bergan fermer sud majlisi belgilangan kunga etmay sirli o`lim topdi

Ijtimoiy tarmoqlarda Navoiy viloyatining Xatirchi tumanida mahalliy fermerlardan biri sirli o`lim topgani aytilmoqda. Unga ko`ra, fermer tuman hokimining unga berilgan er maydoniga oid ijara shartnomasini bekor qilish haqidagi qarori ustidan sudga murojaat qilgan. Biroq sud majlisi belgilangan sanaga etmay fermerning jasadi tumanning Polvonota MFY hududidan o`tgan ariqdan topilgan.
Voqelik bo`yicha «Xabar.uz» so`roviga javob bergan, Bosh prokuratura matbuot xizmati rahbari Hayot Shamsutdinov ma`lum qilishicha, fermerning jasadi joriy yilning 5 noyabr kuni ariqdan (tan jarohatlarisiz) topilganini tasdiqlagan. Shuningdek, holat yuzasidan tergovga qadar tekshiruv harakatlari o`tkazilib, natijasiga ko`ra, jinoyat ishi qo`zg`atishni rad qi

In [None]:
import os

def crawl_text_files(root_dir):
    """Index the .txt files under `root_dir` by the name of their folder.

    Directories and files are visited in sorted order, so the result (and
    everything derived from it, such as the first-N-files cap and the seeded
    train/test split) is reproducible across machines. Plain os.walk order
    depends on the filesystem.

    Args:
        root_dir: directory to walk recursively.

    Returns:
        dict mapping a folder's base name to the sorted list of paths of the
        .txt files directly inside it. Folders without .txt files are omitted;
        folders sharing a base name overwrite one another.
    """
    file_dict = {}
    for subdir, dirnames, files in os.walk(root_dir):
        dirnames.sort()  # in-place sort steers os.walk's traversal order
        txt_files = sorted(os.path.join(subdir, file) for file in files if file.endswith(".txt"))
        if txt_files:  # Only add subdirs with .txt files
            file_dict[os.path.basename(subdir)] = txt_files
    return file_dict

# Build the category -> file-list index for the extracted Uzbek dataset.
root_directory = "Uzbek_News_Dataset/Uzbek_News_Dataset"
text_files_by_category = crawl_text_files(root_directory)

# Report how many article files each category contains.
for category in text_files_by_category:
    files = text_files_by_category[category]
    print(f"Category: {category}, Number of files: {len(files)}")


Category: Iqtisodiyot, Number of files: 12165
Category: Jamiyat, Number of files: 55018
Category: Ayollar, Number of files: 2657
Category: Foto, Number of files: 4037
Category: Madaniyat, Number of files: 12798
Category: Pazandachilik, Number of files: 2040
Category: Qonunchilik, Number of files: 33089
Category: Dunyo, Number of files: 136732
Category: Jinoyat, Number of files: 4200
Category: Texnologiya, Number of files: 17541
Category: O‘zbekiston, Number of files: 149312
Category: Sport, Number of files: 59784
Category: Salomatlik, Number of files: 5086
Category: Avto, Number of files: 6044
Category: Siyosat, Number of files: 12247


## Run Evaluation

In [None]:
#eval.py
import csv
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
# from keras.src.utils.numerical_utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
# from model_train import VDCNN_classification
import os
# import util
# from tensorflow.keras.mixed_precision import experimental as mixed_precision

# Sanity check: show the GPU devices TensorFlow can see.
print(tf.config.list_physical_devices('GPU'))
# tf.config.optimizer.set_jit(False)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
tf.compat.v1.enable_eager_execution() # enable eager execution to allow for converting TF tensors to numpy arrays

# NOTE: `os.environ["CUDA_VISIBLE_DEVICES"] = "1"` was removed here. The runtime
# shown above exposes a single GPU (index 0), so restricting visibility to
# device 1 would hide it whenever the setting took effect.

# To run on the SMS spam sample instead (DATASET = 0), feed load_data a csv.reader:
#   with open('sms-spam/spam.csv', 'r', encoding='Windows-1252') as file:  # NOT utf-8-sig
#       X, y = load_data(csv.reader(file))
X, y = load_data(text_files_by_category)

# 80% train, then the remaining 20% is halved into validation and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# The vocabulary is built from the training samples only.
vocab = set()
for sample in X_train:
    vocab.update(sample)

# Index 0 is reserved for padding. Sorting makes the token -> index assignment
# reproducible; set iteration order changes between interpreter runs.
vocab_index = {token: idx for idx, token in enumerate(sorted(vocab), start=1)}

vocab_index['OOV'] = len(vocab_index) + 1

train_X, val_X, test_X = word_to_index(vocab_index, X_train), word_to_index(vocab_index, X_val),\
                         word_to_index(vocab_index, X_test)

max_len = max([len(i) for i in train_X])
# +1 for the padding index 0: the largest index (OOV) equals len(vocab_index), and
# the embedding needs an input dimension strictly greater than every index.
vocab_size = len(vocab_index) + 1

X_train = pad_sequences(train_X, padding='post', maxlen=max_len)
X_val = pad_sequences(val_X, padding='post', maxlen=max_len)
X_test = pad_sequences(test_X, padding='post', maxlen=max_len)


# Create a mapping from labels to numerical indices
label_mapping = MAPPING

# Convert labels to numerical indices
y_train_encoded = np.array([label_mapping[label] for label in y_train])
y_val_encoded = np.array([label_mapping[label] for label in y_val])
y_test_encoded = np.array([label_mapping[label] for label in y_test])

# One-hot encode with a fixed width so every split has NUM_CLASSES columns,
# even when a split happens to contain no sample of the highest class.
y_train = to_categorical(y_train_encoded, num_classes=NUM_CLASSES) # dtype='int64'
y_val = to_categorical(y_val_encoded, num_classes=NUM_CLASSES) # dtype='int64'
y_test = to_categorical(y_test_encoded, num_classes=NUM_CLASSES) # dtype='int64'


batch_size = 512

train_data = tensor_transform(X_train, y_train, batch_size)
val_data = tensor_transform(X_val, y_val, batch_size)
test_data = tensor_transform(X_test, y_test, batch_size)

# Prefetch data to overlap data loading with model execution
train_data = train_data.prefetch(tf.data.AUTOTUNE)
val_data = val_data.prefetch(tf.data.AUTOTUNE)
test_data = test_data.prefetch(tf.data.AUTOTUNE)

options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

train_data = train_data.with_options(options)
val_data = val_data.with_options(options)
test_data = test_data.with_options(options)

model_name = MODEL # input('Model Input : [VDCNN9, VDCNN17, VDCNN29, VDCNN49]  ')

Category: Iqtisodiyot, Number of files: 12165
Category: Jamiyat, Number of files: 55018
Category: Ayollar, Number of files: 2657
Category: Foto, Number of files: 4037
Category: Madaniyat, Number of files: 12798
Category: Pazandachilik, Number of files: 2040
Category: Qonunchilik, Number of files: 33089
Category: Dunyo, Number of files: 136732
Category: Jinoyat, Number of files: 4200
Category: Texnologiya, Number of files: 17541
Category: O‘zbekiston, Number of files: 149312
Category: Sport, Number of files: 59784
Category: Salomatlik, Number of files: 5086
Category: Avto, Number of files: 6044
Category: Siyosat, Number of files: 12247
1498 114064
115562


In [None]:
# with strategy.scope(): # commented out to put in eager execution mode
# The labels are one-hot vectors over NUM_CLASSES mutually exclusive classes and
# the network ends in a softmax, so categorical cross-entropy is the matching
# loss. The class default ('binary_crossentropy') only suits the 2-class case.
vdcnn = VDCNN_classification(model_name=model_name, max_len=max_len, vocab_size=vocab_size, batch_size=batch_size,
                              train_data=train_data, val_data=val_data, test_data=test_data, class_num=NUM_CLASSES,
                              loss='categorical_crossentropy')

with tf.device('/GPU:0'):
    trained_model = vdcnn.train()

Epoch 1/100
[1m181/181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1744s[0m 9s/step - accuracy: 0.1168 - loss: 0.2277 - val_accuracy: 0.0548 - val_loss: 0.6800
Epoch 2/100
[1m181/181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1561s[0m 9s/step - accuracy: 0.1864 - loss: 0.1635 - val_accuracy: 0.0742 - val_loss: 0.6037
Epoch 3/100
[1m181/181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1570s[0m 9s/step - accuracy: 0.2138 - loss: 0.1552 - val_accuracy: 0.1447 - val_loss: 0.3139
Epoch 4/100
[1m 61/181[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m15:34[0m 8s/step - accuracy: 0.2285 - loss: 0.1512

In [None]:
# Save the trained model to Drive. `Model.save` with a `.keras` path stores the
# whole model rather than only its weights, so the message says "model".
# NOTE(review): loading it back presumably needs the custom layer classes
# (ZeroPadding, ConvBlock, K_Max_Pooling) to be available - confirm.
model_path = f'/content/drive/My Drive/rus_news_classifier_translations/{MODEL}_data_{DATASET}.weights.keras'
trained_model.save(model_path)


print(f"Model saved to: {model_path}")


## Compare to Normal Uzbek Run

In [None]:
# Baseline for comparison: train on the Uzbek news files only.
# This cell used to call `load_data` / `MAPPING` / `NUM_CLASSES`, which follow the
# global DATASET switch, so on a fresh run it silently repeated the combined
# experiment. It now selects the Uzbek loader and its 15 classes explicitly.
tf.compat.v1.enable_eager_execution() # enable eager execution to allow for converting TF tensors to numpy arrays

# NOTE: `os.environ["CUDA_VISIBLE_DEVICES"] = "1"` was removed here. The runtime
# shown above exposes a single GPU (index 0), so restricting visibility to
# device 1 would hide it whenever the setting took effect.

# Uzbek category name -> class index (same indices as the Uzbek entry of MAPPING).
UZBEK_MAPPING = {
  'Avto': 0,
  'Dunyo': 1,
  'Iqtisodiyot': 2,
  'Jinoyat': 3,
  'O‘zbekiston': 4,
  'Qonunchilik': 5,
  'Siyosat': 6,
  'Texnologiya': 7,
  'Ayollar': 8,
  'Foto': 9,
  'Jamiyat': 10,
  'Madaniyat': 11,
  'Pazandachilik': 12,
  'Salomatlik': 13,
  'Sport': 14
}
num_classes = len(UZBEK_MAPPING)

X, y = load_uzbek(text_files_by_category)

# 80% train, then the remaining 20% is halved into validation and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# The vocabulary is built from the training samples only.
vocab = set()
for sample in X_train:
    vocab.update(sample)

# Index 0 is reserved for padding. Sorting makes the token -> index assignment
# reproducible; set iteration order changes between interpreter runs.
vocab_index = {token: idx for idx, token in enumerate(sorted(vocab), start=1)}

vocab_index['OOV'] = len(vocab_index) + 1

train_X, val_X, test_X = word_to_index(vocab_index, X_train), word_to_index(vocab_index, X_val),\
                         word_to_index(vocab_index, X_test)

max_len = max([len(i) for i in train_X])
# +1 for the padding index 0: the largest index (OOV) equals len(vocab_index), and
# the embedding needs an input dimension strictly greater than every index.
vocab_size = len(vocab_index) + 1

X_train = pad_sequences(train_X, padding='post', maxlen=max_len)
X_val = pad_sequences(val_X, padding='post', maxlen=max_len)
X_test = pad_sequences(test_X, padding='post', maxlen=max_len)


# Create a mapping from labels to numerical indices
label_mapping = UZBEK_MAPPING

# Convert labels to numerical indices
y_train_encoded = np.array([label_mapping[label] for label in y_train])
y_val_encoded = np.array([label_mapping[label] for label in y_val])
y_test_encoded = np.array([label_mapping[label] for label in y_test])

# One-hot encode with a fixed width so every split has `num_classes` columns.
y_train = to_categorical(y_train_encoded, num_classes=num_classes) # dtype='int64'
y_val = to_categorical(y_val_encoded, num_classes=num_classes) # dtype='int64'
y_test = to_categorical(y_test_encoded, num_classes=num_classes) # dtype='int64'


batch_size = 256

train_data = tensor_transform(X_train, y_train, batch_size)
val_data = tensor_transform(X_val, y_val, batch_size)
test_data = tensor_transform(X_test, y_test, batch_size)

options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

train_data = train_data.with_options(options)
val_data = val_data.with_options(options)
test_data = test_data.with_options(options)

model_name = MODEL # input('Model Input : [VDCNN9, VDCNN17, VDCNN29, VDCNN49]  ')

# with strategy.scope(): # commented out to put in eager execution mode
# One-hot labels over mutually exclusive classes with a softmax output call for
# categorical cross-entropy rather than the class default.
vdcnn = VDCNN_classification(model_name=model_name, max_len=max_len, vocab_size=vocab_size, batch_size=batch_size,
                              train_data=train_data, val_data=val_data, test_data=test_data, class_num=num_classes,
                              loss='categorical_crossentropy')

with tf.device('/GPU:0'):
    trained_model = vdcnn.train()

In [None]:
import chardet
# Guess the CSV's text encoding from its first 100000 bytes.
with open('sms-spam/spam.csv', 'rb') as rawdata:
    sample = rawdata.read(100000)
result = chardet.detect(sample)
result

{'encoding': 'Windows-1252', 'confidence': 0.7272080023536335, 'language': ''}