In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Embedding, Dropout
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from operator import itemgetter

import pandas as pd
from collections import Counter,OrderedDict

from sklearn.model_selection import train_test_split

In [2]:
from nltk import word_tokenize
from collections import defaultdict

def count_top_x_words(corpus, top_x, skip_top_n):
    """Return a slice of the corpus vocabulary ranked by frequency.

    Every document in ``corpus`` is split with ``word_tokenize`` and the tokens
    are tallied. Tokens are then ordered from most to least frequent; tokens
    with equal counts keep the order in which they were first seen.

    Args:
        corpus: iterable of text documents.
        top_x: number of ranked tokens to return.
        skip_top_n: number of highest-ranked tokens to drop before taking
            ``top_x`` tokens.

    Returns:
        List of tokens ``ranked[skip_top_n : skip_top_n + top_x]``.
    """
    frequencies = Counter()
    for document in corpus:
        frequencies.update(word_tokenize(document))
    # most_common() without an argument is a stable descending sort by count.
    ranked = [token for token, _ in frequencies.most_common()]
    return ranked[skip_top_n: skip_top_n + top_x]


def replace_top_x_words_with_vectors(corpus, top_x):
    """Encode each document as the list of ids of its known tokens.

    Args:
        corpus: iterable of text documents.
        top_x: sequence of vocabulary tokens; a token's position in this
            sequence becomes its integer id.

    Returns:
        A pair ``(encoded, word_to_index)``: ``encoded`` holds one list of ids
        per document (tokens missing from the vocabulary are dropped) and
        ``word_to_index`` is the token -> id mapping that was used.
    """
    word_to_index = {token: position for position, token in enumerate(top_x)}

    encoded = []
    for document in corpus:
        ids = []
        for token in word_tokenize(document):
            if token in word_to_index:
                ids.append(word_to_index[token])
        encoded.append(ids)

    return encoded, word_to_index


def filter_to_top_x(corpus, n_top, skip_n_top=0):
    """Encode a corpus using only its most frequent words.

    The vocabulary is chosen with ``count_top_x_words`` and the documents are
    then encoded with ``replace_top_x_words_with_vectors``.

    Args:
        corpus: iterable of text documents; one-shot iterables (generators)
            are accepted.
        n_top: number of vocabulary words to keep.
        skip_n_top: number of the very most frequent words to exclude before
            taking ``n_top`` words (default 0).

    Returns:
        Whatever ``replace_top_x_words_with_vectors`` returns for the chosen
        vocabulary: the encoded documents and the word -> id mapping.
    """
    # The corpus is walked twice (count, then encode). Materialise one-shot
    # iterables so the second pass does not see an exhausted iterator.
    if not isinstance(corpus, (list, tuple)):
        corpus = list(corpus)
    top_x = count_top_x_words(corpus, n_top, skip_n_top)
    return replace_top_x_words_with_vectors(corpus, top_x)

In [28]:
# Load the raw training data; fields in this file are separated by the \x01
# control character rather than a comma.
df = pd.read_csv(
    filepath_or_buffer='../data/raw/tp_calssification_training_data_05-08-2020.csv',
    sep="\x01",
)

In [29]:
# Preview the first five rows of df.
df.head()

Unnamed: 0,path,ppKw
0,/c/kp/deep-conditioning-treatments,hair conditioning treatment
1,/c/kp/sun-hats,women's sun hat
2,/c/kp/cookie-decorating-sugars-sprinkles,easter sprinkle
3,/c/kp/webcams-with-mic,usb camera mic
4,/c/kp/garter-belts,garter belt


In [30]:
# (rows, columns) of df.
df.shape

(1450933, 2)

In [31]:
# Number of distinct values in the ppKw column.
df.ppKw.nunique()

1150490

In [32]:
# Number of distinct values in the path column.
df.path.nunique()

102982

In [33]:
# Lookup table with one row per distinct path: the per-path count of the
# remaining columns plus an integer class id ("target") numbered 0, 1, 2, ...
# in the row order produced by the groupby.
idData = (
    df.groupby("path")
    .count()
    .reset_index()
)
idData["target"] = range(len(idData))

In [34]:
# Preview the per-path lookup table (path, ppKw count, integer target id).
idData.head()

Unnamed: 0,path,ppKw,target
0,/c//-5-baby-buys,2,0
1,/c//3-speed-bikes,4,1
2,/c//3d-movies,2,2
3,/c//49ers,9,3
4,/c//anniversary-rings,1,4


In [35]:
# Attach the integer class id from idData to every keyword row. Both frames
# carry a ppKw column, so the merge disambiguates them as ppKw_x (keyword text
# from df) and ppKw_y (per-path count from idData).
df = df.merge(
    idData,
    on="path",
    how='inner',
    suffixes=('_x', '_y'),
)

In [36]:
# Keep only the keyword text and its class id, rename the text column to
# "text" and store it with the pandas "string" dtype.
df = (
    df[["ppKw_x", "target"]]
    .rename(columns={"ppKw_x": "text"})
    .astype({"text": "string"})
)

In [37]:
# Preview the first ten rows of df.
df.head(10)

Unnamed: 0,text,target
0,hair conditioning treatment,51773
1,best hair mask,51773
2,hair masque,51773
3,deep hair conditioner,51773
4,deep conditioner,51773
5,deep conditioning hair mask,51773
6,best deep conditioner,51773
7,deep conditioner packet,51773
8,hair mask,51773
9,hair mask packet,51773


In [38]:
# Number of distinct class ids in df; each path was given exactly one id, so
# this is expected to equal the number of distinct paths.
df.target.nunique()

102982

In [8]:
"""
Changing labels into one-hot encoded vectors.
topn_labelsIndex: a dictionary whose keys are labels and whose values are their indices.


topn = 10
freqCountLabel = Counter(df['target'].tolist())
topn_labelsIndex = {i[0]: idx for idx, i in enumerate(freqCountLabel.most_common(topn))}
labelIndexList = [topn_labelsIndex[i] for i in df['target'].tolist()]
labelOHE = to_categorical(labelIndexList)

"""

In [56]:
# Column dtypes of df.
df.dtypes

text      string
target     int64
dtype: object

In [None]:
"abcd"
"hgfc12"

In [54]:
"""
Pad each input sequence so that all sequences have the same fixed length.
Padding and truncating can each be 'pre' or 'post'; which is better here is still an open question.

https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences
"""
# NOTE(review): the original note here reads "this might now be good because
# brand and some product" -- presumably "not good"; the sentence was left
# unfinished by the author.
nVocab = 25000
topNIgnore = 0
listText = df['text'].tolist()

# pad_sequences fills short rows with 0, so 0 must not also be the id of a real
# word (filter_to_top_x numbers words from 0, most frequent first). Keep
# nVocab - 1 words and shift every id up by one: word ids are then
# 1 .. nVocab-1, 0 means "padding", and all ids still fit an embedding table
# with nVocab rows.
TextIndex, vocab_Dict = filter_to_top_x(listText, nVocab - 1, topNIgnore)
vocab_Dict = {word: idx + 1 for word, idx in vocab_Dict.items()}
TextIndex = [[idx + 1 for idx in sent] for sent in TextIndex]

# Rows longer than maxTextLength are cut at the end, shorter rows are
# zero-padded at the end.
maxTextLength = 10
paddedTextIndex = sequence.pad_sequences(TextIndex, maxlen=maxTextLength,
                                         padding='post', truncating='post')

In [10]:
# How many encoded sentences there are of each length (number of in-vocabulary
# tokens per sentence), most common lengths first.
sentLengthDist = pd.Series(
    list(map(len, TextIndex)),
    name="sentLength",
).value_counts()
sentLengthDist.head()

10    555
9     535
12    492
7     491
8     483
Name: sentLength, dtype: int64

In [57]:
"""split the data into test and train 
"""
# Hold out 30% of the rows for testing. The seed is fixed so that the split,
# and every metric computed on it, is the same after a kernel restart.
train_x, test_x, train_y, test_y = train_test_split(
    paddedTextIndex, df.target, test_size=0.3, random_state=42
)

In [None]:


"""
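A Sequential model is a linear stack of layers.
(The sizes in the notes below are a worked example; they differ from the nVocab, maxTextLength and embedding size set in this notebook.)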

input sequence
7613X27X2500
NText*maxTextLength*nVocab

oooo...o1oo-2500
oo1o...oooo
.
.
.
oooo...o1oo
27

weight matrix
2500X100


embedding layer output
7613X27X100
NText*maxTextLength*embedding_vector_length

number of filters: hyperparameter to tune (10 as a starting point)

kernel
3x100



conv1D output
27X10



"""

# Text classifier: Embedding -> Conv1D -> Flatten -> Dense stack -> softmax.
#   input            : (batch, maxTextLength) integer word ids
#   embedding output : (batch, maxTextLength, embedding_vector_length)
#   Conv1D output    : (batch, maxTextLength, 10), because padding='same' and
#                      strides=1 keep the sequence length; 10 is the number of
#                      filters and 3 the kernel width in tokens.
embedding_vector_length = 500
model = Sequential()

model.add(Embedding(nVocab, embedding_vector_length, input_length=maxTextLength))

# https://www.tensorflow.org/api_docs/python/tf/keras/layers/Conv1D
model.add(Conv1D(10, 3, padding='same', strides=1))

model.add(Flatten())
model.add(Dropout(0.5))

model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(50, activation='relu'))
model.add(Dense(20, activation='relu'))

# Targets were numbered 0 .. idData.shape[0]-1 (one id per row of idData), so
# exactly idData.shape[0] output units are needed; the previous "+ 1" added a
# class that can never occur.
nLables = idData.shape[0]

model.add(Dense(nLables, activation='softmax'))

# Labels are integer class ids, hence the *sparse* categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_x, train_y, epochs=15, batch_size=32)

# Evaluate on the held-out rows. test_y holds integer class ids (not one-hot
# rows), so the earlier element-wise comparison test_y[i][j] could not work,
# and turning model.predict's output into Python one-hot lists would require
# the full (n_test x nLables) probability matrix in memory. model.evaluate
# computes the same accuracy batch by batch directly from the sparse labels.
test_loss, test_accuracy = model.evaluate(test_x, test_y, verbose=0)

print("Accuracy: %.2f%%" % (test_accuracy * 100))

Epoch 1/15
 1429/31740 [>.............................] - ETA: 1:48:56 - loss: 11.1692 - accuracy: 7.2166e-04