In [3]:
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Location of the wine review spreadsheet. Set the WINE_DATA_PATH environment
# variable to point at your own copy; the original hard-coded location is kept
# as the default so existing setups keep working unchanged.
path = os.environ.get(
    "WINE_DATA_PATH",
    "/Users/ashutoshwagh/Desktop/projects/wines/winedata.xlsx",
)

In [5]:
# Load the spreadsheet at `path` into a DataFrame.
df = pd.read_excel(path)

In [7]:
# Working copy of the raw descriptions; the original column is left untouched.
# Missing cells become empty strings and everything is coerced to str so the
# later text cleaning (which calls .lower()) cannot fail on NaN or numeric cells.
df["text_reviews"] = df["description"].fillna("").astype(str)

In [8]:
# Display the copied review column to sanity-check the load.
df["text_reviews"]

0       Soft, dry and flashy, this has pie-filling fla...
1       Smells and tastes overoaked, with toothpicky v...
2       A soft, very open, lightweight wine, that has ...
3       Exceptionally fine. Effortlessly combines the ...
4       Fleshy peach and melon aromas comme off as chu...
                              ...                        
1016    Nutty and melon-scented, this is a medium-bodi...
1017    This rich, opulent and amazingly intense Chard...
1018    The vintage was particularly successful for Pa...
1019    The style is full in the mouth, showing vanill...
1020    This is a strong, oaky, ripe Chardonnay, grown...
Name: text_reviews, Length: 1021, dtype: object

In [9]:
# Re-import the corpus reader under its own alias: the assignment below rebinds
# the name `stopwords` to a plain list, so without this the cell would raise
# AttributeError ('list' object has no attribute 'words') when run a second time.
from nltk.corpus import stopwords as stopwords_corpus
nltk.download("stopwords")
# List of English stop words used by the text cleaning step.
stopwords = stopwords_corpus.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ashutoshwagh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def text_process(text):
    """Normalize a piece of text before vectorization.

    The text is lower-cased, every character that is not a lowercase ASCII
    letter or whitespace is deleted, and tokens found in the module-level
    ``stopwords`` collection are dropped.

    Note that punctuation is deleted rather than replaced by a space, so a
    hyphenated term such as "pie-filling" becomes the single token
    "piefilling".

    Parameters
    ----------
    text : str
        Raw text (a review or a search query).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    text = re.sub(r'[^a-z\s]', '', text.lower())
    # Set membership is O(1); scanning the stop-word list per token is not.
    stop_set = set(stopwords)
    # split() with no argument splits on any whitespace run, so tokens joined
    # by tabs/newlines are separated and no empty tokens are produced.
    return " ".join(word for word in text.split() if word not in stop_set)

In [11]:
# Clean every review element-wise with text_process and store the result back
# in the same column.
cleaned_reviews = df['text_reviews'].map(text_process)
df['text_reviews'] = cleaned_reviews

In [12]:
# English Snowball stemmer used to reduce tokens to their stems.
stemmer = SnowballStemmer("english")
# Analyzer callable of a default-configured CountVectorizer; stem_text uses it
# to split a document into tokens before stemming.
analize = CountVectorizer().build_analyzer()

In [13]:
def stem_text(text):
    """Tokenize ``text`` with ``analize`` and lazily yield each token's stem.

    Returns a generator of stemmed tokens, suitable as a CountVectorizer
    ``analyzer`` callable.
    """
    tokens = analize(text)
    return (stemmer.stem(token) for token in tokens)

In [14]:
# Count vectorizer whose analyzer is stem_text, so features are stemmed tokens.
count = CountVectorizer(analyzer = stem_text)

In [15]:

# Learn the vocabulary from the cleaned reviews and build the sparse
# document-term count matrix (one row per review).
count_matrix = count.fit_transform(df['text_reviews'])

In [16]:
# Print the stored entries of the sparse matrix as (row, column) count pairs.
print(count_matrix)

  (0, 2535)	1
  (0, 817)	1
  (0, 1043)	1
  (0, 2030)	1
  (0, 1046)	1
  (0, 264)	1
  (0, 469)	1
  (0, 1276)	1
  (0, 55)	1
  (0, 2525)	1
  (0, 2583)	1
  (0, 1033)	1
  (0, 2769)	1
  (0, 2698)	1
  (0, 1947)	1
  (0, 2622)	1
  (1, 469)	1
  (1, 2525)	1
  (1, 2523)	1
  (1, 2779)	1
  (1, 1927)	1
  (1, 2878)	1
  (1, 3015)	1
  (1, 450)	1
  (1, 1842)	1
  :	:
  (1020, 456)	1
  (1020, 386)	1
  (1020, 2466)	1
  (1020, 1520)	1
  (1020, 2293)	1
  (1020, 1983)	1
  (1020, 1863)	1
  (1020, 2036)	1
  (1020, 1387)	1
  (1020, 660)	1
  (1020, 1557)	1
  (1020, 2776)	1
  (1020, 1964)	1
  (1020, 1514)	1
  (1020, 2688)	1
  (1020, 242)	1
  (1020, 2669)	1
  (1020, 1763)	1
  (1020, 3091)	1
  (1020, 1237)	1
  (1020, 798)	1
  (1020, 1647)	1
  (1020, 556)	1
  (1020, 1704)	1
  (1020, 1537)	1


In [17]:
from sklearn.feature_extraction.text import TfidfTransformer

In [18]:
# TF-IDF transformer with default settings; fitted on the count matrix below.
tf_idf = TfidfTransformer()

In [19]:
# Fit the IDF weights on the review counts and produce the TF-IDF matrix that
# search() compares queries against.
train_tfidf = tf_idf.fit_transform(count_matrix)

In [20]:
# Shape is (number of reviews, number of vocabulary terms).
print(train_tfidf.shape)

(1021, 3233)


In [53]:
# Return the row indexes of the reviews most relevant to a query
from sklearn.metrics.pairwise import cosine_similarity
def search(query, top_n=3):
    """Find the reviews most similar to ``query``.

    The query goes through the same pipeline as the reviews (``text_process``,
    the fitted ``count`` vectorizer, the fitted ``tf_idf`` transformer) and is
    compared to every row of ``train_tfidf`` by cosine similarity.

    Parameters
    ----------
    query : str
        Free-text search query.
    top_n : int, optional
        Number of review indexes to return (default 3, the original behavior).

    Returns
    -------
    list of int
        Row indexes into ``train_tfidf``, ordered by increasing similarity,
        so the best match is the LAST element. Empty when ``top_n`` is not
        positive.
    """
    # Guard: a slice of [-0:] would return every index instead of none.
    if top_n <= 0:
        return []
    query = text_process(query)
    query_matrix = count.transform([query])
    query_tfidf = tf_idf.transform(query_matrix)
    # A single query yields a one-row similarity matrix; take that row.
    sim_score = cosine_similarity(query_tfidf, train_tfidf)[0]
    # argsort is ascending, so the highest-scoring rows sit at the end.
    sorted_indexes = np.argsort(sim_score).tolist()
    return sorted_indexes[-top_n:]
    
    
    

In [54]:
# Example query: prints the row indexes returned by search().
print(search("sweet aroma"))

[984, 170, 501]
