In [162]:
import pandas as pd
import numpy as np

In [163]:
data = pd.read_csv('/content/drive/MyDrive/AI FOR COLAB/notebooks TF course/text classification/ecommerceDataset.csv', header=None)
data.columns = ['type', 'text']

data = data.dropna()
data = data.sample(frac=1)

TEXT_DATA = np.array(data['text'])
TYPE_DATA = np.array(data['type'])

data.head()

Unnamed: 0,type,text
28095,Books,"Nature, Garden and Forest: Colouring Books for..."
4951,Household,Fancy Mart Artificial Maple Tree with White Sq...
16871,Household,ZEYA Fast Dry Automatic Sensor High Jet Speed ...
6290,Household,Story@Home Premium Blackout Solid 2-Piece Faux...
15532,Household,Philips Handheld 1000-Watt Garment Steamer (GC...


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
VOCAB_SIZE=10000
OOV_TOKEN='<BLNK>'
PADDING_TYPE='post'
TRUNC_TYPE='post'
MAX_LEN=100

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(TEXT_DATA)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(data['text'])
padded = pad_sequences(sequences, maxlen=MAX_LEN, padding=PADDING_TYPE, truncating=TRUNC_TYPE)

TEXT_DATA = padded
TYPE_DATA = pd.get_dummies(data['type'])

In [None]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(TEXT_DATA, TYPE_DATA, test_size=0.2)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LSTM, Bidirectional, Lambda, Embedding, Conv1D, GlobalMaxPooling1D, Flatten

EMBEDDING_DIM = 64

model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN))
model.add(Bidirectional(LSTM(16, return_sequences=True)))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(4, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
history = model.fit(xtrain, ytrain,
          epochs=2,
          validation_data=(xtest, ytest))

Epoch 1/2
Epoch 2/2


In [161]:
res = {
    0: 'Books',
    1: 'Clothing & Accessories',
    2: 'Electronics',
    3: 'Household'
}

category = 'scissors iron broom'
res[np.argmax(model.predict(pad_sequences(tokenizer.texts_to_sequences([category]), maxlen=MAX_LEN, padding=PADDING_TYPE, truncating=TRUNC_TYPE)))]



'Household'

In [158]:
class dataProcess():
  def __init__(self, TEXT_DATA, TYPE_DATA, VOCAB_SIZE, OOV_TOKEN, PADDING_TYPE, TRUNC_TYPE, MAX_LEN):
    self.TEXT_DATA = TEXT_DATA
    self.TYPE_DATA = TYPE_DATA
    self.VOCAB_SIZE = VOCAB_SIZE
    self.OOV_TOKEN = OOV_TOKEN
    self.PADDING_TYPE = PADDING_TYPE
    self.TRUNC_TYPE = TRUNC_TYPE
    self.MAX_LEN = MAX_LEN

  def preprocess(self):
    tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
    tokenizer.fit_on_texts(self.TEXT_DATA)
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(self.TEXT_DATA)
    padded = pad_sequences(sequences, maxlen=MAX_LEN, padding=PADDING_TYPE, truncating=TRUNC_TYPE)
    return padded

  def train_test(self):
    text_data = self.preprocess()
    xtrain, xtest, ytrain, ytest = train_test_split(text_data, self.TYPE_DATA, test_size=0.2)
    return xtrain, xtest, ytrain, ytest

  def data_handler(self):
    return self.train_test()



In [None]:
data_pr = dataProcess(TEXT_DATA, TYPE_DATA, VOCAB_SIZE, OOV_TOKEN, PADDING_TYPE, TRUNC_TYPE, MAX_LEN)
xtrain, xtest, ytrain, ytest = data_pr.data_handler()

In [None]:
class model():
  def __init__(self, xtrain, xtest, ytrain, ytest, epochs, input_class):
    self.xtrain = xtrain
    self.xtest = xtest
    self.ytrain = ytrain
    self.ytest = ytest
    self.input_class = input_class

  def create_model(self):
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN))
    model.add(Bidirectional(LSTM(16, return_sequences=True)))
    model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(4, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  def init_model(self):
    self.create_model()
    model.fit(xtrain, ytrain,
          epochs=self.epochs,
          validation_data=(xtest, ytest))

In [None]:
model = model(xtrain, xtest, ytrain, ytest, 50, data_pr)
model.init_model()

In [167]:
word_index

{'<BLNK>': 1,
 'the': 2,
 'and': 3,
 'of': 4,
 'to': 5,
 'a': 6,
 'for': 7,
 'in': 8,
 'with': 9,
 'is': 10,
 'your': 11,
 'you': 12,
 'it': 13,
 'this': 14,
 'on': 15,
 'that': 16,
 'from': 17,
 'or': 18,
 'can': 19,
 'as': 20,
 'are': 21,
 '1': 22,
 'be': 23,
 'an': 24,
 'all': 25,
 'has': 26,
 '2': 27,
 'at': 28,
 'by': 29,
 '3': 30,
 '5': 31,
 'will': 32,
 'size': 33,
 'book': 34,
 'x': 35,
 'set': 36,
 'use': 37,
 'easy': 38,
 'quality': 39,
 'one': 40,
 'up': 41,
 'not': 42,
 'which': 43,
 'about': 44,
 'have': 45,
 'high': 46,
 'also': 47,
 'he': 48,
 'black': 49,
 'product': 50,
 '4': 51,
 'made': 52,
 'home': 53,
 'design': 54,
 'our': 55,
 'more': 56,
 'author': 57,
 'his': 58,
 'any': 59,
 'its': 60,
 'color': 61,
 'time': 62,
 'power': 63,
 'new': 64,
 'we': 65,
 '6': 66,
 'cotton': 67,
 'these': 68,
 'other': 69,
 'usb': 70,
 'best': 71,
 'perfect': 72,
 'light': 73,
 'comes': 74,
 'most': 75,
 'make': 76,
 'features': 77,
 'no': 78,
 'so': 79,
 'used': 80,
 'material': 81