#Exercise for multiclass classification

#Check for GPU

In [2]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-10bb6167-0f81-0360-ed99-971ea5fe3cdd)


#Imports

In [3]:
import os

import tensorflow as tf

#Get dataset

In [4]:
path = tf.keras.utils.get_file('stack-overflow-questions',
                  'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz',
                  untar=True)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz


#Walkthrough dataset folders


In [5]:
dataset_dir = os.path.dirname(path)
os.listdir(dataset_dir)



['README.md', 'stack-overflow-questions.tar.gz', 'train', 'test']

In [6]:
train_dir = os.path.join(dataset_dir, 'train')

os.listdir(train_dir)

['python', 'javascript', 'csharp', 'java']

In [7]:
test_dir = os.path.join(dataset_dir, 'test')

#Use text_dataset_from_directory

In [8]:
seed=42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(train_dir,
                                                          seed=seed,
                                                          validation_split=0.2,
                                                          subset='training')

Found 8000 files belonging to 4 classes.
Using 6400 files for training.


In [9]:
val_ds = tf.keras.utils.text_dataset_from_directory(train_dir,
                                                          seed=seed,
                                                          validation_split=0.2,
                                                          subset='validation')

Found 8000 files belonging to 4 classes.
Using 1600 files for validation.


In [10]:
test_ds = tf.keras.utils.text_dataset_from_directory(test_dir)

Found 8000 files belonging to 4 classes.


#Look Through Dataset

In [11]:
for text, label in raw_train_ds.take(1):
  for i in range(3):
      print(text.numpy()[i])
      print(label.numpy()[i])

b'"my tester is going to the wrong constructor i am new to programming so if i ask a question that can be easily fixed, please forgive me. my program has a tester class with a main. when i send that to my regularpolygon class, it sends it to the wrong constructor. i have two constructors. 1 without perameters..public regularpolygon().    {.       mynumsides = 5;.       mysidelength = 30;.    }//end default constructor...and my second, with perameters. ..public regularpolygon(int numsides, double sidelength).    {.        mynumsides = numsides;.        mysidelength = sidelength;.    }// end constructor...in my tester class i have these two lines:..regularpolygon shape = new regularpolygon(numsides, sidelength);.        shape.menu();...numsides and sidelength were declared and initialized earlier in the testing class...so what i want to happen, is the tester class sends numsides and sidelength to the second constructor and use it in that class. but it only uses the default constructor, w

In [12]:
raw_train_ds.class_names

['csharp', 'java', 'javascript', 'python']

#Text vectorization

##Custom Standardization

In [13]:
import re
import string

def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [14]:
text_vectorizer = tf.keras.layers.TextVectorization(
     standardize=custom_standardization,
    max_tokens=10000,
    output_sequence_length=250
)

train_text = raw_train_ds.map(lambda x, y: x)

text_vectorizer.adapt(train_text)

In [15]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return text_vectorizer(text), label

In [16]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = val_ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)

#Perf

In [17]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)


#Create the model

#Create embedding layer

In [18]:
embeddings = tf.keras.layers.Embedding(input_dim=10000+1,output_dim=16)

In [28]:
from tensorflow.keras import layers

num_classes=4

# inputs = layers.Input(shape=(1,))
# x=text_vectorizer(inputs)
# x=embeddings(x)
# x=layers.Dropout(0.2)(x)
# x=layers.GlobalAveragePooling1D()(x)
# x=layers.Dropout(0.2)(x)
# outputs=layers.Dense(num_classes, activation='softmax')(x)
# model=tf.keras.Model(inputs, outputs)

model= tf.keras.Sequential([
    embeddings,
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
              metrics=['accuracy'])

In [29]:
model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10


ValueError: ignored

#Evaluate the model

#Plot loss curves
