In [1]:
import os
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


In [2]:
# read data
rawData = pd.read_csv(os.path.join("FD test NLP.csv"))
rawData.head()

Unnamed: 0.1,Unnamed: 0,name,category,review.point,price,currency,description
0,1,"Johnnie Walker Blue Label, 40%",Blended Scotch Whisky,97,225,$,"Magnificently powerful and intense. Caramels, ..."
1,2,"Black Bowmore, 1964 vintage, 42 year old, 40.5%",Single Malt Scotch,97,4500,$,What impresses me most is how this whisky evol...
2,3,"Bowmore 46 year old (distilled 1964), 42.9%",Single Malt Scotch,97,13500,$,There have been some legendary Bowmores from t...
3,4,"Compass Box The General, 53.4%",Blended Malt Scotch Whisky,96,325,$,With a name inspired by a 1926 Buster Keaton m...
4,5,"Chivas Regal Ultis, 40%",Blended Malt Scotch Whisky,96,160,$,"Captivating, enticing, and wonderfully charmin..."


In [3]:
# descriptions
rawData['description'].head()

0    Magnificently powerful and intense. Caramels, ...
1    What impresses me most is how this whisky evol...
2    There have been some legendary Bowmores from t...
3    With a name inspired by a 1926 Buster Keaton m...
4    Captivating, enticing, and wonderfully charmin...
Name: description, dtype: object

In [4]:
# vectorize description: take the words of each description and create a vocabulary of all the unique words in the descriptions.
# This vocabulary can then be used to create a feature vector of the count of the words:
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(rawData['description'])
vectorizer.vocabulary_

{'Magnificently': 1833,
 'powerful': 7832,
 'and': 3122,
 'intense': 6289,
 'Caramels': 891,
 'dried': 4858,
 'peats': 7573,
 'elegant': 4983,
 'cigar': 4020,
 'smoke': 8936,
 'seeds': 8664,
 'scraped': 8620,
 'from': 5611,
 'vanilla': 10014,
 'beans': 3403,
 'brand': 3642,
 'new': 7177,
 'pencils': 7595,
 'peppercorn': 7604,
 'coriander': 4339,
 'star': 9168,
 'anise': 3126,
 'make': 6764,
 'for': 5543,
 'deeply': 4580,
 'satisfying': 8563,
 'nosing': 7231,
 'experience': 5215,
 'Silky': 2443,
 'caramels': 3835,
 'bountiful': 3622,
 'fruits': 5626,
 'of': 7301,
 'ripe': 8402,
 'peach': 7552,
 'stewed': 9207,
 'apple': 3170,
 'orange': 7373,
 'pith': 7720,
 'pervasive': 7657,
 'with': 10263,
 'elements': 4986,
 'burnt': 3759,
 'tobacco': 9712,
 'An': 583,
 'abiding': 2916,
 'finish': 5408,
 'dry': 4890,
 'spices': 9091,
 'banoffee': 3354,
 'pie': 7686,
 'sweetness': 9452,
 'Close': 964,
 'to': 9708,
 'perfection': 7620,
 'Editor': 1219,
 'Choice': 939,
 'What': 2851,
 'impresses': 6172

In [5]:
# create vector with all words for each description = Bag-of-words (BOW) model
vectorizer.transform(rawData['description']).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [6]:
#  split data
descriptions = rawData['description'].values
y = rawData['review.point'].values
# y = rawData['price'].values

descriptions_train, descriptions_test, y_train, y_test = train_test_split(descriptions, y, test_size=0.25, random_state=1000)

In [7]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(descriptions_train)
X_train = vectorizer.transform(descriptions_train)
X_test  = vectorizer.transform(descriptions_test)
X_train

<1685x7980 sparse matrix of type '<class 'numpy.int64'>'
	with 95581 stored elements in Compressed Sparse Row format>

In [8]:
# <1685x7980 sparse matrix of type '<class 'numpy.int64'>'
# 	with 95581 stored elements in Compressed Sparse Row format>
# this is 1685 samples with 7980 dimensions, i.e. words

In [9]:
# logistic regression classification model
classifier = LogisticRegression()
classifier.fit(X_train, y_train) # vectorized training data
score = classifier.score(X_test, y_test)
score



0.12811387900355872

In [10]:
input_dim = X_train.shape[1]  # Number of features

model = Sequential()
model.add(Dense(10, input_dim=input_dim, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

W0722 16:42:43.332476  8352 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0722 16:42:43.359462  8352 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0722 16:42:43.362459  8352 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



In [11]:
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

W0722 16:42:43.421425  8352 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0722 16:42:43.457405  8352 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.

W0722 16:42:43.466401  8352 deprecation.py:323] From C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 10)                79810     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 79,821
Trainable params: 79,821
Non-trainable params: 0
_________________________________________________________________


In [12]:
history = model.fit(X_train, y_train,
                    epochs=100,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

W0722 16:42:43.678439  8352 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



In [13]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.0000
Testing Accuracy:  0.0000
