### Importing the dataset

In [123]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import joblib

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [124]:
df = pd.read_csv('../dataset/XWines_with_Ratings.csv', low_memory=False)

In [125]:
df.head()

Unnamed: 0,RatingID,UserID,Rating,WineName,Vintage,ABV,Body,Acidity,Country,RegionName,WineryName,Harmonize1,Harmonize2,Harmonize3,Harmonize4,Harmonize5,Harmonize6,Harmonize7,Harmonize8,Harmonize9,Harmonize10,Harmonize11,Harmonize12,Grapes1,Grapes2,Grapes3,Grapes4,Grapes5,Grapes6,Grapes7,Grapes8,Grapes9,Elaborate1,Elaborate2,Type1,Type2
0,326545,1756594,4.0,Espumante Moscatel,1999,8.0,Mediumbodied,High,Brazil,Serra Gaúcha,Casa Perini,Pork,Rich Fish,Shellfish,,,,,,,,,,Muscat/Moscato,,,,,,,,,Varietal,100%,Sparkling,
1,1314107,1219305,2.5,Espumante Moscatel,2007,8.0,Mediumbodied,High,Brazil,Serra Gaúcha,Casa Perini,Pork,Rich Fish,Shellfish,,,,,,,,,,Muscat/Moscato,,,,,,,,,Varietal,100%,Sparkling,
2,1446366,2047929,3.5,Espumante Moscatel,2008,8.0,Mediumbodied,High,Brazil,Serra Gaúcha,Casa Perini,Pork,Rich Fish,Shellfish,,,,,,,,,,Muscat/Moscato,,,,,,,,,Varietal,100%,Sparkling,
3,1448872,1006545,5.0,Espumante Moscatel,2008,8.0,Mediumbodied,High,Brazil,Serra Gaúcha,Casa Perini,Pork,Rich Fish,Shellfish,,,,,,,,,,Muscat/Moscato,,,,,,,,,Varietal,100%,Sparkling,
4,1657104,1400823,2.0,Espumante Moscatel,2008,8.0,Mediumbodied,High,Brazil,Serra Gaúcha,Casa Perini,Pork,Rich Fish,Shellfish,,,,,,,,,,Muscat/Moscato,,,,,,,,,Varietal,100%,Sparkling,


In [126]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 36 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   RatingID     150000 non-null  int64  
 1   UserID       150000 non-null  int64  
 2   Rating       150000 non-null  float64
 3   WineName     150000 non-null  object 
 4   Vintage      150000 non-null  object 
 5   ABV          150000 non-null  float64
 6   Body         150000 non-null  object 
 7   Acidity      150000 non-null  object 
 8   Country      150000 non-null  object 
 9   RegionName   150000 non-null  object 
 10  WineryName   150000 non-null  object 
 11  Harmonize1   150000 non-null  object 
 12  Harmonize2   149778 non-null  object 
 13  Harmonize3   146425 non-null  object 
 14  Harmonize4   108107 non-null  object 
 15  Harmonize5   30055 non-null   object 
 16  Harmonize6   12094 non-null   object 
 17  Harmonize7   1109 non-null    object 
 18  Harmonize8   670 non-nul

### Combined the variables into one column

In [127]:
df = df.fillna("")

In [128]:
common_columns = ['WineName', 'Body', 'Acidity', 'Country', 'RegionName', 'WineryName']
grapes_columns = [f'Grapes{i}' for i in range(1, 10)]
harmonize_columns = [f'Harmonize{i}' for i in range(1, 13)]
type_columns = [f'Type{i}' for i in range(1, 2)]
elaborate_columns = [f'Elaborate{i}' for i in range(1, 2)]
all_columns = common_columns + grapes_columns + harmonize_columns + type_columns + elaborate_columns
df['Attributes'] = df[all_columns].astype(str).agg(' '.join, axis=1)

In [129]:
df['Attributes'].head()

0    Espumante Moscatel Mediumbodied High Brazil Serra Gaúcha Casa Perini Muscat/Moscato         Pork Rich Fish Shellfish          Sparkling Varietal
1    Espumante Moscatel Mediumbodied High Brazil Serra Gaúcha Casa Perini Muscat/Moscato         Pork Rich Fish Shellfish          Sparkling Varietal
2    Espumante Moscatel Mediumbodied High Brazil Serra Gaúcha Casa Perini Muscat/Moscato         Pork Rich Fish Shellfish          Sparkling Varietal
3    Espumante Moscatel Mediumbodied High Brazil Serra Gaúcha Casa Perini Muscat/Moscato         Pork Rich Fish Shellfish          Sparkling Varietal
4    Espumante Moscatel Mediumbodied High Brazil Serra Gaúcha Casa Perini Muscat/Moscato         Pork Rich Fish Shellfish          Sparkling Varietal
Name: Attributes, dtype: object

### Remove duplicated rows in Attributes and WineName

In [130]:
display(f'Duplicated in Attributes: {df["Attributes"].duplicated().sum()}')
display(f'Duplicated in WineName: {df["WineName"].duplicated().sum()}')
display(f'Shape: {df.shape}')

'Duplicated in Attributes: 148993'

'Duplicated in WineName: 149197'

'Shape: (150000, 37)'

In [131]:
df.drop_duplicates(subset=['Attributes'], inplace=True)
df.drop_duplicates(subset=['WineName'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [132]:
display(f'Duplicated in Attributes: {df["Attributes"].duplicated().sum()}')
display(f'Duplicated in WineName: {df["WineName"].duplicated().sum()}')
display(f'Shape: {df.shape}')

'Duplicated in Attributes: 0'

'Duplicated in WineName: 0'

'Shape: (803, 37)'

### Converting 'Attributes' into a TF-IDF Matrix

In [133]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['Attributes'])
tfidf_matrix.shape

(803, 2167)

### Computing Cosine Similarity between Wines based on TF-IDF Matrix

In [134]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [135]:
def get_recommendations(WineName):

    recommendation = {}

    idx = df.index[df['WineName'] == WineName].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    wine_indices = [i[0] for i in sim_scores]

    recommendation = {
        'WineName': df['WineName'].iloc[wine_indices],
        'Score': [i[1] for i in sim_scores]
    }

    return pd.DataFrame(recommendation)

In [136]:
name = df['WineName'].sample(1).values[0]

In [137]:
display(f'Recommendation for: {name}')
get_recommendations(name)

'Recommendation for: Vintage Reserve Chardonnay'

Unnamed: 0,WineName,Score
660,Heritage Reserve Chardonnay,0.588803
673,Reserve Chardonnay,0.570574
646,Vintner s Reserve Chardonnay,0.520297
665,Brut Vintage,0.41073
674,Private Collection Chardonnay,0.384664
46,Brut Classic,0.358627
659,California Champagne Brut,0.35665
693,Estate Chardonnay,0.346031
549,Chardonnay,0.330054
688,Aquarius Ranch Vineyard Chardonnay,0.313624


### Exporting the model

In [138]:
joblib.dump((cosine_sim, df), '../models/content_based_recommender_model.pkl')

['../models/content_based_recommender_model.pkl']