# Recommendation System

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from rake_nltk import Rake
import warnings
warnings.filterwarnings("ignore")

### Wine Recommendation based on Description

Wine with similar descriptions will be recommended given a description of certain wine

In [3]:
# Import wine dataset with wine titles
df = pd.read_csv('winemag-data-130k-v2.csv')

In [4]:
df.drop('Unnamed: 0',axis=1,inplace=True)

In [5]:
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


Use Rake to extract key words for the description

In [6]:
sentence_lst = []

for index, row in df.iterrows():
    des = row['description']
    # instantiating Rake (English stopwords from NLTK and discards all puntuation characters)
    r = Rake()
    # extracting the words by passing the text
    r.extract_keywords_from_text(des)
    # getting the dictionary whith key words as keys and their scores as values
    key_words_dict_scores = r.get_word_degrees()
    # assigning the key words to the new column for the corresponding movie
    sentence_lst.append(list(key_words_dict_scores.keys()))

Tag words using nltk.pos_tag

In [7]:
taggedList = []
for sentence in sentence_lst:
    sentList = nltk.sent_tokenize(' '.join(sentence))
    txt_list = nltk.word_tokenize(sentList[0])
    tagged_sentence = nltk.pos_tag(txt_list)
    taggedList.append(tagged_sentence)

In [8]:
taggedList

[[('citrus', 'NN'),
  ('overly', 'RB'),
  ('expressive', 'JJ'),
  ('dried', 'VBD'),
  ('herb', 'NN'),
  ('brimstone', 'NN'),
  ('broom', 'NN'),
  ('offering', 'NN'),
  ('unripened', 'JJ'),
  ('apple', 'NN'),
  ('sage', 'NN'),
  ('alongside', 'RB'),
  ('brisk', 'JJ'),
  ('acidity', 'NN'),
  ('aromas', 'VBP'),
  ('include', 'VBP'),
  ('tropical', 'JJ'),
  ('fruit', 'NN'),
  ('palate', 'NN')],
 [('fruity', 'NN'),
  ('still', 'RB'),
  ('structured', 'VBD'),
  ('juicy', 'NN'),
  ('red', 'JJ'),
  ('berry', 'NN'),
  ('fruits', 'NNS'),
  ('already', 'RB'),
  ('drinkable', 'JJ'),
  ('better', 'JJR'),
  ('filled', 'VBN'),
  ('ripe', 'NN'),
  ('certainly', 'RB'),
  ('wine', 'VBZ'),
  ('although', 'IN'),
  ('2016', 'CD'),
  ('freshened', 'VBD'),
  ('acidity', 'NN'),
  ('smooth', 'JJ'),
  ('firm', 'NN'),
  ('tannins', 'NNS')],
 [('flavors', 'NNS'),
  ('lime', 'VBP'),
  ('flesh', 'JJ'),
  ('crisp', 'NN'),
  ('acidity', 'NN'),
  ('underscoring', 'VBG'),
  ('tart', 'JJ'),
  ('stainless', 'NN'),
  ('ri

Only keep noun and adjective from the sentence

In [8]:
new_sentence_lst = []
temp = []
for sentence in taggedList:
    for word in sentence:
        if word[1] in ['NN','NNS','JJ','JJR']:
            temp.append(word[0])
    new_sentence_lst.append(temp)
    temp = []

Remove remaining stop words

In [9]:
stop_words = set(stopwords.words('english'))
Final_sentence_list = []
Final_sentence = []
for sentence in new_sentence_lst:
    for word in sentence:
        if(word not in stop_words):
            Final_sentence.append(word)
    Final_sentence_list.append(Final_sentence)
    Final_sentence = []

In [10]:
combined_sentence = []
for i in Final_sentence_list:
    combined_sentence.append(' '.join(i))

Use doc2vec and train the model

In [16]:
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

tagged_data = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(Final_sentence_list)]
max_epochs = 2000
vec_size = 600
alpha = 0.025

model = Doc2Vec(tagged_data,vector_size=vec_size,
                window=5,
                sample=1e-3,
                min_count=1,worker=4,negative=5)
model.train(tagged_data,total_examples=model.corpus_count,epochs=100)
model.save("d2v.model")
print("Model Saved")

Model Saved


Given a description using the same method to process the sentence

In [17]:
test = 'Aromas include tropical fruit, broom, brimstone and dried herb. The palate is not overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.'
r = Rake()
# extracting the words by passing the text
r.extract_keywords_from_text(test)
# getting the dictionary whith key words as keys and their scores as values
key_words_dict_scores = r.get_word_degrees()
tagged_sentence = nltk.pos_tag(list(key_words_dict_scores.keys()))
test_sentence = []
for word in tagged_sentence:
    if word[1] in ['NN','NNS','JJ','JJR']:
        test_sentence.append(word[0])

Compute the similarity between test description vector to the existing description vectors and select wines with highest similarity score

In [18]:
model = Doc2Vec.load("d2v.model")
#to find the vector of a document which is not in training data
v1 = model.infer_vector(doc_words=test_sentence,alpha=0.0025,steps=5000)

# to find most similar doc using tags
similar_doc = model.docvecs.most_similar([v1],topn=10)
top_recommendation=[]
for count,similar in similar_doc:
    top_recommendation.append(df.title[int(count)])
    sentence=df.description[int(count)]
    words=''
    for  word in sentence:
        words=words+word+''
    print(words+' similartiy score: '+ str(similar))

Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity. similartiy score: 0.7694101333618164
Made entirely with native grape Zibibbo, this opens with aromas of sage, tangerine zest and Mediterranean herb. The aromas follow through to the lively palate alongside brisk acidity and a saline note. similartiy score: 0.36678361892700195
Aromas of green apple, Bartlett pear, citrus and sage lead into the lightly sparkling palate. Fresh acidity lifts the rich flavors. similartiy score: 0.3343934714794159
The nose on this isn't very expressive but the palate reveals crunchy Bartlett pear and yellow apple, while a hint of sage in the background adds interest. Brisk acidity gives it a zesty finish. similartiy score: 0.31226807832717896
Subdued aromas of Spanish broom and brimstone float from the glass. The vertical palate offers yellow apple, citrus zest and mineral alongside crisp

In [19]:
print(f'Top 5 recommended wine based on description: {test}\n {top_recommendation[1:6]}')

Top 5 recommended wine based on description: Aromas include tropical fruit, broom, brimstone and dried herb. The palate is not overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.
 ['Coste Ghirlanda 2013 Silenzio Zibibbo (Terre Siciliane)', 'Valdo NV Cuvée Viviana  (Valdobbiadene Superiore di Cartizze)', 'Marilena Barbera 2013 Coste al Vento Grillo (Sicilia)', 'Contrada Santo Spirito di Passopisciaro 2012 Animalucente Bianco  (Etna)', 'Signae 2011  Montefalco Sagrantino']


### Grape Recommendation based on common words in Description

Grape with more common words in description column will be recommended given the name of a grape variety

In [2]:
# Use cleaned wine dataset
data = pd.read_csv('clean_wine_dataset.csv')

In [3]:
data.head()

Unnamed: 0,description,designation,points,price,province,variety,winery
0,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Sauvignon Blanc,Macauley
1,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Pinot Noir,Ponzi
2,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Provence red blend,Domaine de la Bégude
3,"Deep, dense and pure from the opening bell, th...",Numanthia,95,73.0,Northern Spain,Tinta de Toro,Numanthia
4,Slightly gritty black-fruit aromas include a s...,San Román,95,65.0,Northern Spain,Tinta de Toro,Maurodos


In [4]:
# Create a new dataframe with only variety and description columns
df = data[['variety','description']]

In [5]:
df.shape

(152261, 2)

In [6]:
df.nunique()
# There are 735 unique grape varieties

variety           735
description    152261
dtype: int64

In [7]:
# Count the number of descriptions per variety
des_number = df['variety'].value_counts()
des_number

Pinot Noir                  15503
Chardonnay                  14439
Cabernet Sauvignon          12269
Red Blend                   10317
Sauvignon Blanc              6549
                            ...  
Bobal-Cabernet Sauvignon        1
Früburgunder                    1
Kotsifali                       1
Pied de Perdrix                 1
Cesanese                        1
Name: variety, Length: 735, dtype: int64

In [8]:
# Convert to Dataframe
df_des_number = pd.DataFrame({'variety':des_number.index, 'des_number':des_number.values})
df_des_number

Unnamed: 0,variety,des_number
0,Pinot Noir,15503
1,Chardonnay,14439
2,Cabernet Sauvignon,12269
3,Red Blend,10317
4,Sauvignon Blanc,6549
...,...,...
730,Bobal-Cabernet Sauvignon,1
731,Früburgunder,1
732,Kotsifali,1
733,Pied de Perdrix,1


In [9]:
df_des_number[(df_des_number['des_number']>1)].shape
# Out of 735 grape varieties, 589 have more than one descriptions

(589, 2)

In [10]:
# Create a list of grape varieties that have more than one descriptions
multi_des = df_des_number[(df_des_number['des_number']>1)]['variety'].tolist()
len(multi_des)

589

In [11]:
# Create a list of grape varieties that have only one description
one_des = df_des_number[(df_des_number['des_number']==1)]['variety'].tolist()
len(one_des)

146

In [12]:
df.set_index('variety', inplace = True)

Extract common words from the descriptions of grape varieties that have more than one description

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

df1 = pd.DataFrame(columns=['variety','description'])

# Define a CountVectorizer object
cv = CountVectorizer(stop_words='english', ngram_range=(2,2))
# Define a TfidfTransformer object
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)

for grape in multi_des:
    
    df2 = df.loc[[grape]]
    # Generate word counts
    word_count_vector = cv.fit_transform(df2['description'])
    # Compute the IDF values
    tfidf_transformer.fit(word_count_vector)
    # Obtain top 100 common words (meaning low IDF values) used in the reviews
    df2_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(), columns=["idf_weights"])
    df2_idf.sort_values(by=["idf_weights"], inplace=True)
    common_words = df2_idf.iloc[:100].index.tolist()
    # Convert the list to a string
    common_words_str = ", ".join(elem for elem in common_words)
    
    # Add the variety and its common words in descriptions to a new dataframe
    new_row = {'variety':grape, 'description':common_words_str}
    df1 = df1.append(new_row, ignore_index=True)

In [14]:
df1

Unnamed: 0,variety,description
0,Pinot Noir,"pinot noir, black cherry, cherry fruit, raspbe..."
1,Chardonnay,"buttered toast, tropical fruit, fruit flavors,..."
2,Cabernet Sauvignon,"black cherry, black currant, cabernet sauvigno..."
3,Red Blend,"cabernet sauvignon, black cherry, palate offer..."
4,Sauvignon Blanc,"sauvignon blanc, passion fruit, tropical fruit..."
...,...,...
584,Moscato Rosa,"zest hint, hint cake, refreshing acidity, oran..."
585,Sercial,"acids long, old wood, perfumed spirits, price ..."
586,Rara Neagra,"ripe cherry, acidity pleasantly, lovely wine, ..."
587,Chancellor,"answer gamay, hold attention, hue example, iow..."


Remove stop words in descriptions of grape varieties that have only one description

In [15]:
stop_words = set(stopwords.words('english'))
df3 = pd.DataFrame(columns=['variety','description'])

for grape in one_des:
    df4 = df.loc[[grape]]
    for word in df4['description']:
        new_des = []
        if word not in stop_words:
            new_des.append(word)
    new_row = {'variety':grape, 'description':", ".join(elem for elem in new_des)}
    df3 = df3.append(new_row, ignore_index=True) 

In [16]:
df3

Unnamed: 0,variety,description
0,Moschofilero-Chardonnay,This Moschofilero/Chardonnay blend feels a bit...
1,Shiraz-Merlot,"A ripe, juicy wine, layering red plums, rhubar..."
2,Schwartzriesling,Dark shades of bramble and smoked nuts lend a ...
3,Vidadillo,"The aromas of wet clay, mild chili peppers, ce..."
4,Athiri,"Spice, citrus and minerals typify the nose and..."
...,...,...
141,Bobal-Cabernet Sauvignon,"Mild, dusty cherry aromas set up a pulling, dr..."
142,Früburgunder,"Quite rare in the U.S. market, this ripe richl..."
143,Kotsifali,A luscious nose of chocolate-covered cherries ...
144,Pied de Perdrix,"Distantly related to Malbec, Pied de Perdrix i..."


In [17]:
df1 = df1.set_index('variety')
df3 = df3.set_index('variety')

Combine varieties with more than one descriptions and only one description to a single dataframe

In [18]:
df1 = df1.append(df3,ignore_index=False)

In [19]:
df1

Unnamed: 0_level_0,description
variety,Unnamed: 1_level_1
Pinot Noir,"pinot noir, black cherry, cherry fruit, raspbe..."
Chardonnay,"buttered toast, tropical fruit, fruit flavors,..."
Cabernet Sauvignon,"black cherry, black currant, cabernet sauvigno..."
Red Blend,"cabernet sauvignon, black cherry, palate offer..."
Sauvignon Blanc,"sauvignon blanc, passion fruit, tropical fruit..."
...,...
Bobal-Cabernet Sauvignon,"Mild, dusty cherry aromas set up a pulling, dr..."
Früburgunder,"Quite rare in the U.S. market, this ripe richl..."
Kotsifali,A luscious nose of chocolate-covered cherries ...
Pied de Perdrix,"Distantly related to Malbec, Pied de Perdrix i..."


In [20]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df1['description'])

In [21]:
indices = pd.Series(df1.index)
indices[:5]

0            Pinot Noir
1            Chardonnay
2    Cabernet Sauvignon
3             Red Blend
4       Sauvignon Blanc
Name: variety, dtype: object

In [22]:
count_matrix.shape

(735, 6056)

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.43112132, 0.72809609, ..., 0.246641  , 0.20387964,
        0.23927691],
       [0.43112132, 1.        , 0.46545866, ..., 0.16926637, 0.18755211,
        0.03493883],
       [0.72809609, 0.46545866, 1.        , ..., 0.25313204, 0.1727615 ,
        0.13380643],
       ...,
       [0.246641  , 0.16926637, 0.25313204, ..., 1.        , 0.37032804,
        0.35571892],
       [0.20387964, 0.18755211, 0.1727615 , ..., 0.37032804, 1.        ,
        0.26196842],
       [0.23927691, 0.03493883, 0.13380643, ..., 0.35571892, 0.26196842,
        1.        ]])

In [24]:
df1 = df1.reset_index()

In [32]:
def recommendations(variety_input, cosine_sim=cosine_sim):
    
    # Get the index of the input variety
    idx = indices[indices == variety_input].index[0]
    
    # Get and sort the pairwise similarity scores between the input variety and all the varieties
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Top 5 similarity scores and their corresponding variety indices
    sim_scores = sim_scores[1:6]
    variety_idx_list = [i[0] for i in sim_scores]
     
    # Create the output dataframe
    df_output = pd.DataFrame(columns=['Recommended Grape Varieties', 'Similarity Score', 'Top 7 Common Words'])
    
    for i in sim_scores: 
        # i[0] is the index of the variety
        variety_name = df1.iloc[i[0]]['variety']
        # Get top 7 common words in descriptions
        des = df1.iloc[i[0]]['description']
        # If in the variety list that have more than one descriptions
        if variety_name in multi_des:   
            des_split = des.split(',')
            key_words_list = des_split[:7]
            key_words_str = ', '.join(key_words_list)
        # If in the variety list that have only one description
        else:
            key_words_str = des
            
        new_row = {'Recommended Grape Varieties':variety_name,'Similarity Score':i[1],'Top 7 Common Words':key_words_str}
        df_output = df_output.append(new_row, ignore_index=True)
    
    df_output.set_index('Recommended Grape Varieties') 
    pd.set_option('max_colwidth', 500)
   
    return df_output

Examples:

In [33]:
recommendations('Red Blend')

Unnamed: 0,Recommended Grape Varieties,Similarity Score,Top 7 Common Words
0,Sangiovese,0.833785,"black cherry, palate offers, palate delivers, lead nose, grained tannins, blue flower, white pepper"
1,Barbera,0.804758,"black cherry, palate offers, barbera alba, fruit flavors, palate doles, skinned berry, grained tannins"
2,Aglianico,0.77897,"black cherry, palate offers, black fruit, blue flower, palate delivers, black pepper, opens aromas"
3,Cabernet Sauvignon-Merlot,0.77581,"cabernet sauvignon, sauvignon merlot, black cherry, merlot blend, blend cabernet, cabernet merlot, finish drink"
4,Cabernet Franc,0.757077,"cabernet franc, cab franc, black cherry, fruit flavors, cherry flavors, medium bodied, red cherry"


In [34]:
recommendations('Pinot Noir')

Unnamed: 0,Recommended Grape Varieties,Similarity Score,Top 7 Common Words
0,Merlot,0.886208,"black cherry, fruit flavors, cherry flavors, red fruit, cherry fruit, medium bodied, cabernet sauvignon"
1,Grenache,0.882202,"cherry fruit, red fruit, fruit flavors, red cherry, black cherry, white pepper, nose bottling"
2,Cabernet Franc,0.879778,"cabernet franc, cab franc, black cherry, fruit flavors, cherry flavors, medium bodied, red cherry"
3,Zinfandel,0.766121,"high alcohol, black pepper, spice flavors, petite sirah, wild berry, fruit flavors, dry creek"
4,Mourvèdre,0.76416,"black pepper, black cherry, fruit flavors, white pepper, medium bodied, black plum, 100 varietal"
