In [1]:
#!pip install -U sentence-transformers
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util

In [7]:
#helper function
#removes punctuations and stop words
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for `language`, loading it at most once."""
    if language not in _STOP_WORDS_CACHE:
        # stopwords.words() is only consulted the first time; later calls reuse the set.
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def process_sentences(sentences):
    """Remove English stop words and punctuation from a piece of text.

    The text is tokenized with ``nltk.word_tokenize``. A token is kept only if
    its lower-cased form is not an English stop word and ``str.isalnum()`` is
    true for it, so punctuation and any token containing a non-alphanumeric
    character (for example a hyphenated word) are dropped.

    Parameters
    ----------
    sentences : str
        Raw text, e.g. one wine description or a user query.

    Returns
    -------
    str
        The kept tokens in their original order and casing, separated by
        single spaces with no leading or trailing whitespace. An input with
        no surviving tokens yields an empty string.
    """
    stop_words = _get_stop_words()
    kept_words = [
        token
        for token in nltk.word_tokenize(sentences)
        if token.lower() not in stop_words and token.isalnum()
    ]
    # join() is linear in the output size and does not add a stray leading space.
    return ' '.join(kept_words)

In [3]:
# Load the full wine-review dataset. read_csv returns a new DataFrame, so no
# empty placeholder frame has to be created beforehand.
df = pd.read_csv('data/winemag-data-130k-v2.csv')

In [8]:
# Work on an independent copy of the first 10,000 rows so that later edits
# to small_df never touch the full frame `df`.
small_df = df.iloc[:10000].copy()
# Preview the first two rows.
small_df.head(2)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos


In [7]:
# Replace missing prices with the median price. The result is assigned back to
# the column instead of calling fillna(inplace=True) on `small_df.price`:
# the in-place call acts on an intermediate Series and is not guaranteed to
# update small_df (it has no effect when pandas Copy-on-Write is active).
small_df['price'] = small_df['price'].fillna(small_df['price'].median())

In [10]:
# Countries that keep their own category; every other value (including a
# missing one) is collapsed into 'Other'. isin()/where() does this for the
# whole column at once instead of looping over the rows in Python.
selected_countries = ['US','France','Italy','Spain','Portugal','Chile','Argentina','Australia','Austria']
small_df['country'] = small_df['country'].where(small_df['country'].isin(selected_countries), 'Other')

# Bucket the review score. pd.cut intervals are right-closed by default, so
# e.g. '85-90' covers scores greater than 85 and up to and including 90.
small_df['points'] = pd.cut(small_df['points'], bins=[79,85,90,95,100], labels=['79-85','85-90','90-95','95-100'])

# Same collapsing as for countries, applied to provinces.
selected_provinces = ['California', 'Bordeaux', 'Washington','Oregon','Tuscany', 'Sicily & Sardinia','Mendoza Province','Piedmont','Northeastern Italy','Northern Spain' ]
small_df['province'] = small_df['province'].where(small_df['province'].isin(selected_provinces), 'Other')

# Bucket the price. The last bin is open-ended (np.inf) so that every price
# above 60 is labelled '>60'; with a finite upper edge of 1000, more expensive
# bottles fell outside all bins and were turned into NaN.
small_df['price'] = pd.cut(small_df['price'], bins=[0,10,20,30,60,np.inf], labels=['0-10','10-20','20-30','30-60', '>60'])


In [None]:
useful_features = ['country','points','price','province' ]
# Build the working frame with .assign(), which returns a new DataFrame.
# Adding columns directly to the selection `small_df[useful_features]` writes
# into a slice of small_df and triggers pandas' SettingWithCopyWarning.
# The constant any_* columns are 1 for every row (presumably so that a request
# for "any" country/points/price/province matches all wines - confirm with the
# consumer of this matrix).
temp = small_df[useful_features].assign(
    any_country=1,
    any_points=1,
    any_price=1,
    any_province=1,
)
# One-hot encode the four categorical features. With prefix='' the dummy
# columns are named after the raw category values.
# NOTE(review): 'Other' is a category of both country and province, so the
# result contains two columns named 'Other'; the two can only be told apart
# by position.
df1 = pd.get_dummies(temp, prefix='', prefix_sep='')

In [12]:
# Inspect the column labels of the one-hot encoded feature matrix df1.
df1.columns

Index(['any_country', 'any_points', 'any_price', 'any_province', 'Argentina',
       'Australia', 'Austria', 'Chile', 'France', 'Italy', 'Other', 'Portugal',
       'Spain', 'US', '79-85', '85-90', '90-95', '95-100', '0-10', '10-20',
       '20-30', '30-60', '>60', 'Bordeaux', 'California', 'Mendoza Province',
       'Northeastern Italy', 'Northern Spain', 'Oregon', 'Other', 'Piedmont',
       'Sicily & Sardinia', 'Tuscany', 'Washington'],
      dtype='object')

In [13]:
# Save the non-descriptive (one-hot encoded) part of the table; it is used for
# recommendations based on non-descriptive inputs.
# The output is CSV text written to a file named without a '.csv' extension;
# index=False leaves the row index out of the file.
df1.to_csv('non_description_matrix', index=False)

In [15]:
# Load the sentence-embedding model by name; it is used below to encode the
# cleaned wine descriptions and the user's query text.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [31]:
# process_sentences() relies on two NLTK resources: the 'stopwords' corpus
# (stopwords.words) and the 'punkt' tokenizer models (nltk.word_tokenize).
# Fetch both so the notebook also runs in an environment where neither has
# been downloaded yet.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\16177\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [16]:
# Clean every description with process_sentences, then encode the whole list
# of cleaned texts with a single model.encode call.
des_list = [process_sentences(des) for des in small_df.description]
des_vec = model.encode(des_list)

In [34]:
#test the model on some toy inputs to see if it works
input_text = ' A vivid, but light wine with acidity.'
# Embed the query after the same cleaning that was applied to the descriptions.
processed_input = model.encode(process_sentences(input_text))
# Cosine similarity between every description embedding and the query embedding.
dotted_vec = util.cos_sim(des_vec, processed_input)
# .T[0] pulls out the similarities for the single query; argsort is ascending,
# so the last five entries, reversed, are the top 5 matches (best first).
index = np.array(dotted_vec).T[0].argsort()[-5:][::-1]

# `index` holds row positions within des_vec, so select rows by position
# (.iloc) instead of by index label (.loc).
recommendation = small_df.iloc[index].description
print(recommendation)

1542    This is a delicious and ripe-fruited wine. Dom...
1551    The blend of Fernão Pires and Arinto is a clas...
1623    This is a light bright wine, full of forward f...
1538    Light and fruity, this is a crisp wine with go...
1527    There is an attractive spiciness to this clean...
Name: description, dtype: object


In [35]:
# Show the full text of the description labelled 1538, one of the matches
# printed for the toy query.
small_df.description[1538]

'Light and fruity, this is a crisp wine with good acidity. a tangy character and a yeasty edge. Drink now.'

In [18]:
# Shape of the description embedding matrix: (number of descriptions, embedding size).
des_vec.shape

(10000, 768)

In [19]:
# Save the final description encoding matrix, which will be used to give
# recommendations based on the user's description.
# The DataFrame is built first and written second: to_csv() returns None when
# given a path, so assigning its result would leave final_description_matrix
# bound to None instead of the matrix.
final_description_matrix = pd.DataFrame(des_vec)
final_description_matrix.to_csv('final_description_matrix', index=False)