In [1]:
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.2MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp36-none-any.whl size=829180944 sha256=f6b530311fb0347d53964b01968bd0ba85b2c85189bab3041c72b3e6268bf0e1
  Stored in directory: /tmp/pip-ephem-wheel-cache-brd8sagf/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [1]:
# https://www.kaggle.com/c/whiskey-reviews-ds20/overview

import pandas as pd
import re
from collections import Counter
import spacy

nlp = spacy.load('en_core_web_lg')

In [174]:
# Loading in the data from the Kaggle webpage

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [175]:
train['description'][500]

"\nStyle: Bourbon Color: Antique amber Aroma: Thick, lush, very complex and nicely balanced. Notes of polished leather and oak marry nicely with the toffee and caramel. Tobacco, raisins, and dates, add complexity and diversity. Palate: Bold and voluptuous, with notes similar to its aroma. Long, soothing finish.\r\n\xa0\xa0\xa0General Comments: A big, broad-shouldered bourbon jam-packed with flavors that just won't quit. Most bourbons of this age and intensity are too woody. Not this one. It is very well-balanced from beginning to end. Bravo! Price: upper $30s. Available nationwide but produced in limited quantities."

In [176]:
def tokenize(text):
  tokens = []
  for item in (text.replace('\n', '').lower().replace('-', ' ').split()):
    tokens.append(re.sub('[^a-z ]', '', item))
  
  custom_stop_words = {"","and","the","a","of","with","is","in","to","this","on",
                       "it","its","but","the","year","old","which","that","cask",
                       "was","single","malt","whiskeys","whiskey","whisky","some",
                       "has","an","caramel","spice","tobacco","there","vanilla",
                       "fruit","layers","nose","notes","orange","lemon","grapefruit",
                       "apple","oranges","lemons","grapefruits","apples","wood",
                       "oak","bread","chocolate","milk","spices","rye","lumber",
                       "raisin","marmalade","grape","grapes","skin","sugar","plum",
                       "plums","cinnamon","bun","buns","raspberry","preserve",
                       "nuts","ash","mineral","minerals","palate","element",
                       "elements","burley","hint","hints","wine","dark","mild",
                       "fudge","soft","peppery","burst","bursts","onto","honey",
                       "lemon","dram","toffee","banana","bananas","peel","pie",
                       "grain","bourbon","color","amber","leather","marry",
                       "tobacco","raisins","dates","comments","general","just",
                       "blends","scotch","flowers","butter","cocoa","hazelnut",
                       "smoke","are","s","aroma","style","bourbons","age","be",
                       "by","how","when","each","will"}
  cleaned_tokens = []

  for word in tokens:
    if word not in custom_stop_words:
      cleaned_tokens.append(word)
    else:
      continue

  return cleaned_tokens

In [177]:
train['tokens'] = train['description'].apply(tokenize)

In [178]:
train.head(1)

Unnamed: 0,id,description,ratingCategory,tokens
0,1321,"\nSometimes, when whisky is batched, a few lef...",1,"[sometimes, batched, few, leftover, barrels, r..."


In [179]:
def to_lemmas(token_list):
  lemmas = ''
  doc = nlp(str(' '.join(token_list)))

  for token in doc:
    lemmas += token.lemma_ + ' '

  return lemmas

In [180]:
train['lemmas'] = train['tokens'].apply(to_lemmas)

In [181]:
train.head(1)

Unnamed: 0,id,description,ratingCategory,tokens,lemmas
0,1321,"\nSometimes, when whisky is batched, a few lef...",1,"[sometimes, batched, few, leftover, barrels, r...",sometimes batch few leftover barrel return war...


In [189]:
type(train['lemmas'][0])

str

In [191]:
X = train['lemmas']
y = train['ratingCategory']

assert len(X_train) == len(y_train)


In [199]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Creating pipeline components
vectorizer = TfidfVectorizer(ngram_range=(2,3))
classifier = RandomForestClassifier(random_state=42)


# Define the Pipeline
pipe = Pipeline([('vect', vectorizer), # vectorizer
                 ('clf', classifier) # classifier
                ])

# Define the parameter space for the grid serach
parameters = {
    'clf__criterion':['gini', 'entropy'],
    'clf__max_depth':[5,10,15,20],
    'clf__n_estimators':[10, 100, 250]}


# Implement a grid search with cross-validation
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=5, verbose=1)
grid_search.fit(X, y);

# Print out the best score
grid_search.best_score_

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   56.3s
[Parallel(n_jobs=5)]: Done 120 out of 120 | elapsed:  3.1min finished


0.7054077024596518

In [200]:
# Predictions on test sample
pred = grid_search.predict(test['description'])

In [201]:
submission = pd.DataFrame({'id': test['id'], 'ratingCategory':pred})
submission['ratingCategory'] = submission['ratingCategory'].astype('int64')

In [202]:
# Make Sure the Category is an Integer
submission.head()

Unnamed: 0,id,ratingCategory
0,3461,1
1,2604,1
2,3341,1
3,3764,1
4,2306,1


In [203]:
subNumber = 11

# Save your Submission File
# Best to Use an Integer or Timestamp for different versions of your model

submission.to_csv(f'submission{subNumber}.csv', index=False)
subNumber += 1