In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel
from fuzzywuzzy import process

ModuleNotFoundError: No module named 'fuzzywuzzy'

In [2]:
df = pd.read_csv('cleaned_user_posts.csv')

In [3]:
def clean_text(text):
    # Check if the text is a string
    if not isinstance(text, str):
        return ""  # Return empty string if not, or handle as needed
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'), '', text)  # Remove punctuation
    text = re.sub(r'\n', '', text)  # Remove line breaks
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing numbers
    return text


In [4]:
# Apply cleaning function to comments column (adjusting for potential non-string values)
df['cleaned_comments'] = df['comments'].apply(lambda x: clean_text(x))

In [5]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)

# Fit and transform the cleaned reviews
X_tfidf = tfidf.fit_transform(df['cleaned_comments'])

In [6]:
# KMeans
kmeans = KMeans(n_clusters=5, random_state=2)
kmeans.fit(X_tfidf)

# Assign the cluster labels back to the dataframe
df['cluster'] = kmeans.labels_



In [7]:
def recommend_drinks(review):
    # Clean and transform the input review
    cleaned_review = clean_text(review)
    transformed_review = tfidf.transform([cleaned_review])
    
    # Predict the cluster for the input review
    cluster = kmeans.predict(transformed_review)[0]
    
    # Use the 'title' column as it contains the names/titles of the drinks
    recommendations = df[df['cluster'] == cluster]['title'].unique()
    
    return recommendations

# Example usage
example_review = "A rich nose with fruit and vanilla, smooth palate, and a spicy finish."
print(recommend_drinks(example_review))


['the whisky club referral code  get a free dram'
 '107th whisky review 52nd scotch whisky review  glengoyne 12'
 '105th whisky review 6th japanese whisky review  nikka coffey grain'
 '100th whisky review 21st xmas23 review  roe  co blended irish whiskey'
 '99th whiskey review 20th xmas23 review  few spirits straight bourbon whiskey'
 '95th whisky review 16th xmas23 review  kyrö malt rye'
 '87th whiskey review 8th xmas23 review  black and gold 11 year old bourbon whiskey'
 '86th whisky review 7th xmas23 review  burnt ends blended whiskey'
 '81st whisky review 2nd xmas23 review  ledaig sinclair rioja finish'
 '78th whisky review 4th japanese whisky review  fuji 2022 masterpiece'
 '75th whiskey review 1st american whiskey review  buffalo trace'
 '70th whiskey review 3rd indian whisky review  neidhal single malt'
 '68th whiskey review 5th irish whisky review  puca small batch'
 'review cut hill keystone release batch 1'
 'review 13 tasman whisky pinot cask australian review 12'
 'review 1

In [8]:
# Ensure this list is correctly populated from your DataFrame
titles = df['title'].tolist()

# Initialize and transform titles with TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(titles)

# Compute cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

def recommend_drinks_fuzzy(title, titles, cosine_sim=cosine_sim):
    # Use fuzzy matching to find the closest title
    closest_title, _ = process.extractOne(title, titles)
    
    # Get the index of the drink that matches the closest title
    idx = titles.index(closest_title)
    
    # Get the pairwise similarity scores of all drinks with that drink
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the drinks based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 5 most similar drinks
    sim_scores = sim_scores[1:6]
    
    # Get the drink indices
    drink_indices = [i[0] for i in sim_scores]
    
    # Return the top 5 most similar drinks
    return [titles[i] for i in drink_indices]

# Make sure to replace `example_title` with a title that exists in your `titles` list
example_title = '9th whisky review 1st irish whiskey review redbreast 12'
print(recommend_drinks_fuzzy(example_title, titles))

['9th whisky review 1st irish whisky review  redbreast 12', '9th whisky review 1st irish whisky review  redbreast 12', '68th whiskey review 4th irish whisky review  teeling single malt', '68th whiskey review 4th irish whisky review  teeling single malt', '68th whiskey review 4th irish whisky review  teeling single malt']
