In [36]:
#!pip install gensim
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import numpy as np
import re
import tensorflow as tf
import gensim
from gensim.models import Word2Vec
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from keras.utils import to_categorical
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
# Fetch the NLTK resources used by the preprocessing cell (stop-word list and
# WordNet data for the lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
# NOTE: this deliberately rebinds `stopwords` from the corpus reader to the
# collection of English stop words. A set (rather than a list) is used because
# the only thing done with it afterwards is membership testing, which is O(1)
# on a set and a linear scan on a list.
stopwords = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
def unique(sequence):
    """
        Return the items of `sequence` as a list with repeats dropped,
        keeping each item at the position of its first occurrence.
        Items must be hashable.
        http://www.martinbroadhurst.com/removing-duplicates-from-a-list-while-preserving-order-in-python.html
    """
    already_seen = set()
    ordered = []
    for item in sequence:
        if item in already_seen:
            continue
        already_seen.add(item)
        ordered.append(item)
    return ordered

In [0]:
# one_hot maker
def one_hot(vector):
    """
        Build a one-hot matrix from a vector of integer class labels.

        vector : array-like of non-negative integer labels, either 1-D with
                 shape (n,) or a column with shape (n, 1). For a 2-D input
                 only the first column is read.

        Returns a float array of shape (n, max_label + 1) in which row i has
        a single 1 at column vector[i]. An empty input yields an array of
        shape (0, 0).
    """
    arr = np.asarray(vector)
    # accept both a flat label vector and the (n, 1) column form
    labels = arr[:, 0] if arr.ndim == 2 else arr.ravel()

    if labels.size == 0:
        # nothing to encode; max() of an empty vector is undefined
        return np.zeros((0, 0))

    one_hot_vec = np.zeros((labels.size, int(labels.max()) + 1))
    # set one cell per row in a single vectorised assignment
    one_hot_vec[np.arange(labels.size), labels] = 1

    return one_hot_vec

In [83]:
# downloading data
url = "https://raw.githubusercontent.com/aashishksahu/Deep-Learning/master/Text%20classification/Womens%20Clothing%20E-Commerce%20Reviews.csv"
# read_csv already returns a DataFrame, so no extra wrapping is needed
data = pd.read_csv(url)
# since some reviews are empty, they're being dropped.
# The index is renumbered afterwards so that row labels and row positions
# agree; otherwise label-based lookups such as data["Review Text"][i] can hit
# a dropped label or point at a different row than position i.
data = data.dropna(axis=0).reset_index(drop=True)
#data = data.sample(frac=1)
data.head()

Unnamed: 0,Review Text,Rating,Positive Feedback Count
0,Absolutely wonderful - silky and sexy and comf...,4,0
1,Love this dress! it's sooo pretty. i happene...,5,4
2,I had such high hopes for this dress and reall...,3,0
3,"I love, love, love this jumpsuit. it's fun, fl...",5,0
4,This shirt is very flattering to all due to th...,5,6


In [84]:
# processing the text
# Each stage below rebinds `text` and then prints reviews 0 and 3 so the effect
# of that stage can be inspected.
text = list(data['Review Text'])

# sample
for sample_idx in (0, 3):
    print(text[sample_idx])

print("\n* After removing punctuations and full stops:\n")
# lower-case each review and keep runs of word characters that are at least
# two characters long and end in a letter
text = [re.findall(r'\w+[A-Za-z]', review.lower()) for review in text]

for sample_idx in (0, 3):
    print(text[sample_idx])

print("\n* Removing stopwords: \n")
text = [[token for token in tokens if token not in stopwords] for tokens in text]

for sample_idx in (0, 3):
    print(text[sample_idx])

print("\n* Lemmatizing: \n")
wnl = WordNetLemmatizer()
text = [list(map(wnl.lemmatize, tokens)) for tokens in text]

for sample_idx in (0, 3):
    print(text[sample_idx])

print("\n* Removing duplicates")
# keep only the first occurrence of every token within a review
text = list(map(unique, text))

for sample_idx in (0, 3):
    print(text[sample_idx])

# finally replacing raw text with processed one:
# each token list is joined back into one space-separated string

text = [" ".join(tokens) for tokens in text]

Absolutely wonderful - silky and sexy and comfortable
I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments!

* After removing punctuations and full stops:

['absolutely', 'wonderful', 'silky', 'and', 'sexy', 'and', 'comfortable']
['love', 'love', 'love', 'this', 'jumpsuit', 'it', 'fun', 'flirty', 'and', 'fabulous', 'every', 'time', 'wear', 'it', 'get', 'nothing', 'but', 'great', 'compliments']

* Removing stopwords: 

['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable']
['love', 'love', 'love', 'jumpsuit', 'fun', 'flirty', 'fabulous', 'every', 'time', 'wear', 'get', 'nothing', 'great', 'compliments']

* Lemmatizing: 

['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable']
['love', 'love', 'love', 'jumpsuit', 'fun', 'flirty', 'fabulous', 'every', 'time', 'wear', 'get', 'nothing', 'great', 'compliment']

* Removing duplicates
['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable']
['love', 'jumpsuit', 

In [39]:

# converting text to TF-IDF score
# Using TF-IDF because in text classification we need the importance of
# special words that indicate what kind of review it is

vectorizer = TfidfVectorizer(stop_words='english')
# fit_transform returns a sparse matrix; toarray() turns it into a dense
# 2-D array with one row per review and one column per vocabulary term
# (zero where the term does not occur in the review)
tfidf = vectorizer.fit_transform(text).toarray()
print("tfidf  : ", tfidf.shape)

# labels for our training data
# a score from 1 to 5

rating = one_hot(np.asarray(data["Rating"]).reshape(len(data["Rating"]), 1))

print("rating : ", rating.shape)

# Fit the scaler and scale the features in one step. The previous separate
# fit() call bound the scaler object itself to `vectors` before it was
# overwritten by the transformed data.
min_max = MinMaxScaler()
vectors = min_max.fit_transform(tfidf)

tfidf  :  (22641, 12253)
rating :  (22641, 6)


In [46]:
# making training and testing data
# first 75% of the rows (in file order) are used for training, the rest for testing
split = int(len(vectors)*0.75)

# Take the labels as a plain NumPy array: calling .reshape() on a pandas Series
# is deprecated/removed, and to_categorical accepts a 1-D vector of class ids.
labels = np.asarray(data["Rating"])

x_train = vectors[0:split]
# add a trailing channel axis so each sample is (n_features, 1) as Conv1D expects
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
y_train = to_categorical(labels[0:split], num_classes=6)


x_test = vectors[split:len(vectors)]
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)
y_test = to_categorical(labels[split:], num_classes=6)

### placeholders
# Legacy TF1 graph inputs. The Keras model in this notebook does not reference
# them, so they are only created when the installed TensorFlow still exposes
# tf.placeholder (it is absent from the TF2 top-level namespace).
if hasattr(tf, "placeholder"):
    x = tf.placeholder(tf.float32, [None, x_train.shape[1], 1])
    y = tf.placeholder(tf.float32, [None, 6])

  
  if sys.path[0] == '':


In [66]:
# Network: three 'same'-padded 1-D convolutions (kernel sizes 4, 4, 2) over the
# feature axis, then max-pooling, dropout, and a 6-way softmax classifier.
# bigger kernel to reduce the zeros
n_features = x_train.shape[1]
model = Sequential([
    Conv1D(32, 4, strides=1, input_shape=(n_features, 1), padding='same', activation='relu'),
    Conv1D(32, 4, strides=1, padding='same', activation='relu'),
    Conv1D(32, 2, strides=1, padding='same', activation='relu'),
    MaxPooling1D(pool_size=3, strides=2, padding="same"),
    Dropout(0.75),
    Flatten(),
    Dense(6, activation='softmax'),
])
model.compile(
    loss="categorical_crossentropy",
    optimizer="nadam",
    metrics=['accuracy'],
)
model.summary()
# a single pass over the training data, then score on the held-out rows;
# evaluate() is the cell's last expression so its result is displayed
model.fit(x_train, y_train, epochs=1, batch_size=100)
model.evaluate(x_test, y_test)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_113 (Conv1D)          (None, 12253, 32)         160       
_________________________________________________________________
conv1d_114 (Conv1D)          (None, 12253, 32)         4128      
_________________________________________________________________
conv1d_115 (Conv1D)          (None, 12253, 32)         2080      
_________________________________________________________________
max_pooling1d_37 (MaxPooling (None, 6127, 32)          0         
_________________________________________________________________
dropout_20 (Dropout)         (None, 6127, 32)          0         
_________________________________________________________________
flatten_24 (Flatten)         (None, 196064)            0         
_________________________________________________________________
dense_26 (Dense)             (None, 6)                 1176390   
Total para

[0.9674304510738041, 0.6034269563365459]

In [86]:
# pick 10 random positions among the first 1000 rows of the test split
u = np.random.randint(1000, size=10)

for i in u:
    # x_test was cut from position `split` onwards, so test row i is row
    # `split + i` of `data`. Look the text up by position (iloc) so the printed
    # review is the one actually being classified.
    print("Text   : ", data["Review Text"].iloc[split + i])
    # model.predict expects a batch axis: (1, n_features, 1)
    sample = x_test[i].reshape(1, x_test[i].shape[0],1)
    # index of the highest softmax output = predicted rating
    print("Rating : ", np.argmax(model.predict(sample)))

Text   :  I'm a 110 lb, shorty with a short torso and long arms so the standard xs was huge. i had the opportunity to try on both the xxs petite and xs petite. the xxs petite was almost perfect but i wanted it more flowy - probably best for those that are even shorter and more lightweight than me, othewise, it looks a little too boxy. the petite xs fit me the best and draped nicely, similar to the model in the pink. i had a sweater with this exact silhouette 5+ years ago and wore it to death even though i
Rating :  5
Text   :  I purchased this blouse because i love a 70's vibe in my tops. it is a beautiful, colorful top, but the colors weren't flattering on me. having said that, the cut is nice, the fabric is lightweight and flows nicely, and the fit was fine on me. i am a curvy 5'5" with a 36 c cup. go for it if this is a style you like. one other note, i wish it had been a bit longer, but i am older and prefer a little more coverage. it's just a personal preference. i think the pictu

# Findings
1. CNNs can be used for text classification
2. The length of sentences should be nearly the same 
3. The main reason for the low accuracy is the zero padding needed to make all sentences the same length: the shortest sentence has 2 words and the longest has 56.
     This variation in length causes the CNN to saturate at 56% accuracy.