1. Import Libraries

In [14]:
import pandas as pd
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex, KeyedVectors, TfidfModel
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
from gensim.corpora import Dictionary
from gensim.models.phrases import Phrases, Phraser
import json
from pprint import pprint

2. Load Dataset

In [18]:
# Read the raw review CSV (positional columns 0, 1, 2, 4, 5, 6, 11 and 12; column 0 is the index),
# discard rows with any missing value, keep only the first row per title, and renumber the rows.
df = (
    pd.read_csv('winemag-data-130k-v2.csv', index_col=0, usecols=[0, 1, 2, 4, 5, 6, 11, 12])
    .dropna()
    .drop_duplicates(subset="title", keep='first')
    .reset_index(drop=True)
)

# Substitution table: raw variety label -> label used to look up the wine style.
# NOTE(review): targets such as 'Gargenega' and 'Zinfanadel' look misspelled; presumably they
# mirror the spelling used by the food/wine pairing data -- confirm before changing them.
pair_sub_dict = {
    'Alsace white blend': 'Alsacian Pinot Gris',
    'Syrah': 'Syrah / Shiraz',
    'Bordeaux-style Red Blend': 'Bordeaux Blend',
    'Garganega': 'Gargenega',
    'Madeira Blend': 'Madeira',
    'Melon': 'Melon de Bourgogne',
    'Pedro Ximénez': 'Pedro Ximenez',
    'Pinot Grigio': 'Pinot Gris / Pinot Grigio',
    'Zinfandel': 'Zinfanadel',
    'Shiraz': 'Syrah / Shiraz',
    'Touriga': 'Touriga Nacional',
    'Garnacha': 'Garnacha Rosado',
    'Rosado': 'Garnacha Rosado',
    'Syrah-Petite Sirah': 'Petite Sirah',
    'Sauvignon-Sémillon': 'Sémillon',
    'Chardonnay-Albariño': 'Albariño',
    'Vermentino Nero': 'Vermentino',
    'White Riesling': 'Riesling',
    'Chardonnay-Riesling': 'Riesling',
    'Riesling-Chardonnay': 'Riesling',
    'Chenin Blanc-Sauvignon Blanc': 'Chenin Blanc',
    'Sauvignon Blanc-Chenin Blanc': 'Chenin Blanc',
    'Chenin Blanc-Chardonnay': 'Chenin Blanc',
    'Chenin Blanc-Viognier': 'Chenin Blanc',
    'Viognier-Gewürztraminer': 'Gewürztraminer',
    'Pinot Gris-Gewürztraminer': 'Gewürztraminer',
    'Gewürztraminer-Riesling': 'Gewürztraminer',
    'White Port': 'Port',
    'Tinta Madeira': 'Madeira',
    'Orange Muscat': 'Muscat',
    'Muscat Hamburg': 'Muscat',
    'Muscat Canelli': 'Muscat',
    "Muscat d'Alexandrie": 'Muscat',
    'Valvin Muscat': 'Muscat',
    'Muscat Blanc à Petits Grains': 'Muscat',
    'Muscat of Alexandria': 'Muscat',
    'Muscat Blanc': 'Muscat',
}

# Apply the table; a variety that has no entry in it is kept unchanged.
variety_lst = [pair_sub_dict.get(raw_variety, raw_variety) for raw_variety in df['variety'].to_list()]
df['variety'] = variety_lst

df.head()  # preview the first rows of the processed frame

Unnamed: 0,country,description,points,price,province,title,variety
0,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red
1,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris
2,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling
3,US,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir
4,Spain,Blackberry and raspberry aromas show a typical...,87,15.0,Northern Spain,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot


In [20]:
df.to_csv('wine_df.csv')  # write the processed DataFrame to wine_df.csv

3. Define a sentence tokenizer function

In [6]:
def get_tokenized_sentences(descriptions):
    """Turn raw description strings into lists of lemmas.

    Each description is lower-cased and run through the module-level spaCy
    pipeline ``nlp``. Only tokens that are alphabetic and are not stop words
    are kept, and each kept token contributes its lemma.

    Parameters
    ----------
    descriptions : iterable of str
        The texts to tokenize.

    Returns
    -------
    list of list of str
        One list of lemmas per input description, in input order.
    """
    # nlp.pipe streams the texts through the pipeline in batches, which avoids
    # paying the per-call overhead of nlp(text) once for every description.
    lowered_texts = (description.lower() for description in descriptions)
    return [
        [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        for doc in nlp.pipe(lowered_texts)
    ]

4.  Tokenize wine descriptions and add 2-gram and 3-gram phrases to vocabulary

In [105]:
# Tokenize every description of the full dataset with the helper defined earlier.
wine_sentences = get_tokenized_sentences(df["description"].to_list())

# Two stacked Phrases passes: the second one is fitted on the output of the first,
# so the final model can join up to three original tokens (space-delimited).
phrases = Phrases(wine_sentences, min_count=1, threshold=2, delimiter=b' ')
phrases = Phrases(phrases[wine_sentences], min_count=1, threshold=2, delimiter=b' ')

# Phraser wrapper around the second-pass Phrases model.
ngrams = Phraser(phrases)

# Per document: the unique tokens of its phrased form together with its original tokens.
phrased_sentences = [list(set(ngrams[sent]) | set(sent)) for sent in wine_sentences]

# One TaggedDocument per description, tagged with its position as a string.
tagged_data = [
    TaggedDocument(words=word_lst, tags=[str(i)])
    for i, word_lst in enumerate(phrased_sentences)
]

5. Train and save Doc2Vec model on full wine dataset

In [119]:
# Train a Doc2Vec model on the tagged wine documents and save it to disk.
max_epochs = 100
vec_size = 20
alpha = 0.025

# `vector_size` and `epochs` are the current names of the deprecated `size` and `iter` arguments.
model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=1,
                epochs=max_epochs)

model.build_vocab(tagged_data)

# Call train() exactly once: gensim then decays the learning rate linearly from `alpha`
# to `min_alpha` over `max_epochs` passes. The previous loop called train() max_epochs
# times (each call running the model's own default number of passes) while editing
# alpha/min_alpha by hand, which multiplies the passes and breaks the decay schedule.
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

model.save("d2v_wine.model")  # persist the trained model

  # This is added back by InteractiveShellApp.init_path()


In [12]:
# Use this cell instead of retraining when the model has already been built:
# it loads the Doc2Vec model stored in d2v_wine.model.
model= Doc2Vec.load("d2v_wine.model")  

6. Create individual datasets by style of wine (Bold Red, Medium Red, etc.) determined by the food/wine pairing dataset

In [5]:
# Build one dataset per wine style from the food/wine pairing file.
with open('wine_food_pairing.json', 'r') as f:
    food = json.load(f)

# (top-level key, list of the keys nested under it) for every entry of the pairing file.
lst = [(style_key, list(style_entry.keys())) for style_key, style_entry in food.items()]

# CSV written for entries 0..7 of `lst`, in that order.
# NOTE(review): assumes the pairing file lists the styles in exactly this order -- TODO confirm.
style_csv_names = ['Bold_Red_df.csv', 'Medium_Red_df.csv', 'Light_Red_df.csv', 'Rose_df.csv',
                   'Rich_White_df.csv', 'Light_White_df.csv', 'Sweet_White_df.csv', 'Dessert_df.csv']

style_frames = []
for position, csv_name in enumerate(style_csv_names):
    # Keep the wines whose variety is listed under this style, renumber rows, then save.
    style_frame = df[df["variety"].isin(lst[position][1])].reset_index(drop=True)
    style_frame.to_csv(csv_name)
    style_frames.append(style_frame)

(Bold_Red_df, Medium_Red_df, Light_Red_df, Rose_df,
 Rich_White_df, Light_White_df, Sweet_White_df, Dessert_df) = style_frames


7. Tokenize the descriptions in each of the separate wine style datasets in order to build separate TF-IDF models for each style.

In [7]:
# Tokenize the descriptions of every style dataset with spaCy (via get_tokenized_sentences).
(Bold_Red_sentences, Medium_Red_sentences, Light_Red_sentences, Rose_sentences,
 Rich_White_sentences, Light_White_sentences, Sweet_White_sentences, Dessert_sentences) = (
    get_tokenized_sentences(style_df["description"].to_list())
    for style_df in (Bold_Red_df, Medium_Red_df, Light_Red_df, Rose_df,
                     Rich_White_df, Light_White_df, Sweet_White_df, Dessert_df)
)

In [9]:
# Save each style's tokenized sentences to "<name>.json" in case they are needed later.
sentence_list = [
    Bold_Red_sentences, Medium_Red_sentences, Light_Red_sentences, Rose_sentences,
    Rich_White_sentences, Light_White_sentences, Sweet_White_sentences, Dessert_sentences,
]

sentence_string_list = [
    'Bold_Red_sentences', 'Medium_Red_sentences', 'Light_Red_sentences', 'Rose_sentences',
    'Rich_White_sentences', 'Light_White_sentences', 'Sweet_White_sentences', 'Dessert_sentences',
]

for sentences, file_stem in zip(sentence_list, sentence_string_list):
    with open(f'{file_stem}.json', 'w') as json_file:
        json.dump(sentences, json_file)
    

8. Build separate TF-IDF models for each style. Save dictionary and similarity index for easy access

In [24]:
def doc_similarity_builder(style_sentences, style_name, word_vectors=None):
    """Build and save a dictionary and a soft-cosine similarity index for one wine style.

    The tokenized sentences are extended with phrases from two stacked
    ``Phrases`` passes, converted to a bag-of-words corpus, and indexed with
    ``SoftCosineSimilarity`` using a TF-IDF-aware term similarity matrix
    derived from word vectors.

    Parameters
    ----------
    style_sentences : list of list of str
        Tokenized documents, one token list per document.
    style_name : str
        Prefix of the output files ``{style_name}_dct.model`` and
        ``{style_name}_sim_index.model``.
    word_vectors : optional
        Word vectors used for term similarity. When omitted, the vectors of
        the module-level Doc2Vec ``model`` (``model.wv``) are used, which is
        what this function always did before this parameter existed.

    Returns
    -------
    None
        The results are written to disk.
    """
    if word_vectors is None:
        word_vectors = model.wv  # word vectors trained with the Doc2Vec model

    # Two stacked Phrases passes; the second is fitted on the output of the first.
    phrases = Phrases(style_sentences, min_count=1, threshold=2, delimiter=b' ')
    phrases = Phrases(phrases[style_sentences], min_count=1, threshold=2, delimiter=b' ')
    ngrams = Phraser(phrases)

    # Per document: unique tokens of the phrased form together with the original tokens.
    phrased_sentences = [list(set(ngrams[sent]).union(set(sent))) for sent in style_sentences]

    # Bag-of-words corpus and its TF-IDF model.
    dct = Dictionary(phrased_sentences)
    corpus = [dct.doc2bow(sentence) for sentence in phrased_sentences]
    tfidf_model = TfidfModel(corpus)

    # Term similarity matrix built with the classes this file already imports, instead of
    # the deprecated KeyedVectors.similarity_matrix() wrapper; the parameters are unchanged.
    term_index = WordEmbeddingSimilarityIndex(word_vectors, threshold=0.0, exponent=2.0)
    sim_matrix = SparseTermSimilarityMatrix(term_index, dct, tfidf=tfidf_model, nonzero_limit=100)

    # Soft cosine similarity index over the corpus, used for retrieval.
    doc_sim_index = SoftCosineSimilarity(corpus, sim_matrix)

    dct.save(f'{style_name}_dct.model')
    doc_sim_index.save(f'{style_name}_sim_index.model')

In [13]:
# Build and save the dictionary and similarity index for the full dataset and for every style.
style_name_list = [
    'wine', 'Bold_Red', 'Medium_Red', 'Light_Red', 'Rose',
    'Rich_White', 'Light_White', 'Sweet_White', 'Dessert',
]

sentence_list = [
    wine_sentences, Bold_Red_sentences, Medium_Red_sentences, Light_Red_sentences, Rose_sentences,
    Rich_White_sentences, Light_White_sentences, Sweet_White_sentences, Dessert_sentences,
]

for style_name, style_sentences in zip(style_name_list, sentence_list):
    doc_similarity_builder(style_sentences=style_sentences, style_name=style_name)



Now that each of the TF-IDF models is built, we can begin to build the function that returns wine recommendations based on your preferences and food choices.