#Exercise for multiclass classification

#Check for GPU

In [67]:
# Shell escape: list the GPUs visible to this runtime (`-L` prints one line per device).
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-b4055573-aaa1-a892-0265-86444ba7dc8e)


#Imports

In [68]:
import os

import tensorflow as tf

#Get dataset

In [69]:
# Download the Stack Overflow 16k archive and unpack it (untar=True);
# `path` is whatever location get_file returns for the downloaded data.
path = tf.keras.utils.get_file(
    fname='stack-overflow-questions',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz',
    untar=True,
)

#Walk through the dataset folders


In [70]:
# The dataset root is the directory that contains `path`; the listing shows the
# unpacked `train/` and `test/` folders sitting next to the downloaded archive.
dataset_dir = os.path.dirname(path)
os.listdir(dataset_dir)



['test', 'stack-overflow-questions.tar.gz', 'README.md', 'train']

In [71]:
# Training split: one sub-directory per class label (see the listing output).
train_dir = os.path.join(dataset_dir, 'train')

os.listdir(train_dir)

['javascript', 'python', 'java', 'csharp']

In [72]:
# Held-out test split, stored alongside train/ under the dataset root.
test_dir = os.path.join(dataset_dir, 'test')

#Use text_dataset_from_directory

In [73]:
# Fixed seed, shared with the validation loader so both calls split the
# training directory the same way.
seed = 42

# 80% of train/ becomes the training subset, delivered in batches of 32.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    directory=train_dir,
    batch_size=32,
    validation_split=0.2,
    subset='training',
    seed=seed,
)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.


In [74]:
# Peek at the first ten questions of a single training batch together with
# their integer labels. The tensors are converted to NumPy once per batch.
for text_batch, label_batch in raw_train_ds.take(1):
  questions_np = text_batch.numpy()
  labels_np = label_batch.numpy()
  for i in range(10):
    print("Question: ", questions_np[i])
    print("Label:", labels_np[i])

Question:  b'"my tester is going to the wrong constructor i am new to programming so if i ask a question that can be easily fixed, please forgive me. my program has a tester class with a main. when i send that to my regularpolygon class, it sends it to the wrong constructor. i have two constructors. 1 without perameters..public regularpolygon().    {.       mynumsides = 5;.       mysidelength = 30;.    }//end default constructor...and my second, with perameters. ..public regularpolygon(int numsides, double sidelength).    {.        mynumsides = numsides;.        mysidelength = sidelength;.    }// end constructor...in my tester class i have these two lines:..regularpolygon shape = new regularpolygon(numsides, sidelength);.        shape.menu();...numsides and sidelength were declared and initialized earlier in the testing class...so what i want to happen, is the tester class sends numsides and sidelength to the second constructor and use it in that class. but it only uses the default con

In [75]:
# Map every integer label back to the class (directory) name it stands for.
for index, class_name in enumerate(raw_train_ds.class_names):
  print("Label", index, "corresponds to", class_name)

Label 0 corresponds to csharp
Label 1 corresponds to java
Label 2 corresponds to javascript
Label 3 corresponds to python


In [76]:
# Validation counterpart of raw_train_ds: same directory, seed and split
# fraction, but selecting the remaining 20% of the files.
val_ds = tf.keras.utils.text_dataset_from_directory(
    directory=train_dir,
    batch_size=32,
    validation_split=0.2,
    subset='validation',
    seed=seed,
)

Found 8000 files belonging to 4 classes.
Using 1600 files for validation.


In [77]:
# The test directory is loaded whole (no validation_split), using the same
# batch size as the training and validation datasets.
test_ds = tf.keras.utils.text_dataset_from_directory(test_dir, batch_size=32)

Found 8000 files belonging to 4 classes.


#Look Through Dataset

In [78]:
# Show three raw examples (byte strings) and their labels from one batch.
for text, label in raw_train_ds.take(1):
  texts_np, labels_np = text.numpy(), label.numpy()
  for i in range(3):
    print(texts_np[i])
    print(labels_np[i])

b'"blank8 why is my solution faster than the neat solution? (hackerrank chocolate feast) edit: simplified my solution..edit: removed opinion based secondary question...background: atarted learning blank a week or two ago using hackerranks problems as exercises and stackoverflow search + google as my teacher, i\'ve had some limited experience learning other languages...i did the exercise my own ""noobish learner way"" which i can\'t help but feel is a ""botched job"" when i see ""neat &amp; short"" solutions...however, when submitting both solutions one after another a couple of times i found the ""neat"" solution was quite a bit slower. ..i vaguely remember something about % operations being costly, is mine faster because of no % operations or is there more to it than just that?..exercise: https://www.hackerrank.com/challenges/chocolate-feast..neat solution from discussion:..import blank.io.*;.import blank.util.*;..public class solution {.    static int cc; .    public static void main

In [79]:
# Class names in label-index order: integer label i corresponds to class_names[i].
raw_train_ds.class_names

['csharp', 'java', 'javascript', 'python']

#Text vectorization

##Custom Standardization

In [80]:
import re
import string

def custom_standardization(input_data):
  """Lowercase text, replace '<br />' with a space, and strip punctuation.

  Args:
    input_data: string tensor accepted by the `tf.strings` ops.

  Returns:
    The result of applying the three `tf.strings` transformations in order.

  NOTE(review): the TextVectorization layer in this notebook is constructed
  without a `standardize=` argument, so this function is currently unused.
  """
  # Character class matching any single character in string.punctuation.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)

  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_pattern, '')

In [81]:
# Integer-encoding vectorizer: vocabulary limited to 10000 tokens, every
# example padded or truncated to 250 token ids.
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=10000,
    output_mode='int',
    output_sequence_length=250,
)

# adapt() only needs the text, so drop the labels from the training dataset
# before building the vocabulary from it.
train_text = raw_train_ds.map(lambda question, _label: question)
text_vectorizer.adapt(train_text)

In [82]:
def vectorize_text(text, label):
  """Turn raw text into token ids with `text_vectorizer`; pass the label through.

  Args:
    text: string tensor (a single example or a batch of examples).
    label: label tensor matching `text`; returned unchanged.

  Returns:
    A `(token_ids, label)` tuple, suitable for `Dataset.map`.
  """
  # Append a trailing axis before handing the text to the vectorizer.
  text_column = tf.expand_dims(text, -1)
  return text_vectorizer(text_column), label

In [83]:
# Pull one batch (32 questions and their labels) from the training dataset
# and keep its first example for inspection.
text_batch, label_batch = next(iter(raw_train_ds))
first_question = text_batch[0]
first_label = label_batch[0]
print("Question", first_question)
print("Label", first_label)

Question tf.Tensor(b'"function expected error in blank for dynamically created check box when it is clicked i want to grab the attribute value.it is working in ie 8,9,10 but not working in ie 11,chrome shows function expected error..&lt;input type=checkbox checked=\'checked\' id=\'symptomfailurecodeid\' tabindex=\'54\' style=\'cursor:pointer;\' onclick=chkclickevt(this);  failurecodeid=""1"" &gt;...function chkclickevt(obj) { .    alert(obj.attributes(""failurecodeid""));.}"\n', shape=(), dtype=string)
Label tf.Tensor(2, shape=(), dtype=int32)


In [84]:
# vectorize_text returns (token_ids, label); only the token ids are shown here.
vectorized_question = vectorize_text(first_question, first_label)[0]
print("'int' vectorized question:", vectorized_question)

'int' vectorized question: tf.Tensor(
[[  38  450   65    7   16   12  892  265  186  451   44   11    6  685
     3   46    4 2062    2  485    1    6  158    7  479    1   26   20
   158    7  479    1  502   38  450    1 1767 1763    1    1    1    1
     1    1    1    1    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0

In [86]:
# Fetch the vocabulary list once and reuse it for both lookups and the size.
vocabulary = text_vectorizer.get_vocabulary()
print("1289 ---> ", vocabulary[1289])
print("313 ---> ", vocabulary[313])
print("Vocabulary size: {}".format(len(vocabulary)))

1289 --->  roman
313 --->  source
Vocabulary size: 10000


In [87]:
# Apply the vectorizer to all three splits.
# NOTE(review): val_ds and test_ds are rebound in place, so re-running this
# cell without first re-creating them would map already-vectorized data again.
train_ds, val_ds, test_ds = (
    ds.map(vectorize_text) for ds in (raw_train_ds, val_ds, test_ds)
)

#Configure the datasets for performance

In [89]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each split after its first pass and prefetch upcoming batches,
# letting tf.data choose the prefetch buffer size.
train_ds, val_ds, test_ds = (
    ds.cache().prefetch(buffer_size=AUTOTUNE)
    for ds in (train_ds, val_ds, test_ds)
)


#Create the model

#Create embedding layer

In [93]:
# 64-dimensional token embeddings. input_dim is one more than the vectorizer's
# 10000-token vocabulary; mask_zero=True marks token id 0 (the padding value
# seen in the vectorized output) as masked.
embeddings = tf.keras.layers.Embedding(
    input_dim=10000 + 1,
    output_dim=64,
    mask_zero=True,
)

In [94]:
from tensorflow.keras import layers

# Number of target classes (csharp, java, javascript, python).
num_classes = 4

# 1D-convolutional classifier over the integer token sequences in train_ds:
# embedding -> strided Conv1D -> global max pooling -> one logit per class.
# The first layer is the `embeddings` instance created above; a stray
# `layers.` prefix previously turned it into the attribute lookup
# `layers.embeddings`, so that instance was never used.
model = tf.keras.Sequential([
    embeddings,
    layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
    layers.GlobalMaxPooling1D(),
    # No activation: the loss below is configured with from_logits=True.
    layers.Dense(num_classes),
])

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [95]:
# train_ds and val_ds are tf.data pipelines that were already batched
# (batch_size=32) when they were created, so batch_size must not be passed to
# fit(): Keras raises a ValueError when it is combined with a Dataset input.
history = model.fit(train_ds,
                    validation_data=val_ds,
                    epochs=10)

Epoch 1/10


ValueError: ignored

#Evaluate the model

#Plot loss curves
