In [2]:
# Load Python libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import random

In [104]:
beer = pd.read_csv('beers-cleaned.csv',encoding="ISO-8859-1", low_memory = "False")
br = pd.read_csv('breweries.csv', encoding="ISO-8859-1", low_memory = "False")
reviews = pd.read_csv('beer_reviews.csv', encoding="ISO-8859-1", low_memory = "False")

  interactivity=interactivity, compiler=compiler, result=result)


In [105]:
reviews.head()

Unnamed: 0,review_profilename,beer_name,review_overall
0,stcules,Sausa Weizen,1.5
1,stcules,Red Moon,3.0
2,stcules,Black Horse Black Beer,3.0
3,stcules,Sausa Pils,3.0
4,johnmichaelsen,Cauldron DIPA,4.0


In [108]:
list(reviews.columns)

['review_profilename', 'beer_name', 'review_overall']

In [109]:
ratings = reviews[["review_profilename","beer_name","review_overall"]]
ratings.head()

Unnamed: 0,review_profilename,beer_name,review_overall
0,stcules,Sausa Weizen,1.5
1,stcules,Red Moon,3.0
2,stcules,Black Horse Black Beer,3.0
3,stcules,Sausa Pils,3.0
4,johnmichaelsen,Cauldron DIPA,4.0


In [110]:
ratings["User_ID"] = pd.Categorical((pd.factorize(ratings.review_profilename)[0] + 1))
ratings["Item_ID"] = pd.Categorical((pd.factorize(ratings.beer_name)[0] + 1))
ratings = ratings.drop(columns="review_profilename")
ratings = ratings.drop(columns="beer_name")
ratings.head()

Unnamed: 0,review_overall,User_ID,Item_ID
0,1.5,1,1
1,3.0,1,2
2,3.0,1,3
3,3.0,1,4
4,4.0,2,5


In [8]:
rating_col, user_col, item_col = ratings.columns

In [9]:
def list_2_dict(id_list:list):
    d={}
    for id, index in zip(id_list, range(len(id_list))):
        d[id] = index
    return d

In [10]:
# splits ratings dataframe to training and validation dataframes
def get_data(ratings, valid_pct:float = 0.2):
    # shuffle the indexes
    ln = random.sample(range(0, len(ratings)), len(ratings))
    
    # split based on the given validation set percentage 
    part = int(len(ln)*valid_pct)
    
    valid_index = ln[0:part]
    train_index = ln[part:]
    valid = ratings.iloc[valid_index]
    train = ratings.iloc[train_index]
    return [train,valid]

In [11]:
def get_batch(ratings, start:int, end:int):
    return ratings[user_col][start:end].values, ratings[item_col][start:end].values, ratings[rating_col][start:end].values


In [26]:
# Reference: https://towardsdatascience.com/make-your-own-recommendation-system-b596d847296d
# neural net based on Embedding matrices
# model reference -> https://github.com/fastai/fastai/
class EmbeddingModel(nn.Module):
    def __init__(self, n_factors, n_users, n_items, y_range, initialise = 0.01):
        super().__init__()
        self.y_range = y_range
        self.u_weight = nn.Embedding(n_users, n_factors)
        self.i_weight = nn.Embedding(n_items, n_factors)
#         self.u_bias = nn.Embedding(n_users, 1)
        self.i_bias = nn.Embedding(n_items, 1)
        
        # initialise the weights of the embeddings
        self.u_weight.weight.data.uniform_(-initialise, initialise)
        self.i_weight.weight.data.uniform_(-initialise, initialise)
#         self.u_bias.weight.data.uniform_(-initialise, initialise)
        self.i_bias.weight.data.uniform_(-initialise, initialise)

    def forward(self, users, items):
        # dot multiply the weights for the given user_id and item_id
        dot = self.u_weight(users)* self.i_weight(items)
        
        # sum the result of dot multiplication above and add both the bias terms
        res = dot.sum(1)+ self.i_bias(items).squeeze()# + self.u_bias(users).squeeze() + self.i_bias(items).squeeze()
        
        # return the output in the given range
        return torch.sigmoid(res) * (self.y_range[1]-self.y_range[0]) + self.y_range[0]

In [27]:
def train(epochs = 5, bs = 64):
    for epoch in range(epochs):
        
        # training the model
        i=0
        total_loss = 0.0
        ct = 0
        while i < len(data[0]):
            x1,x2,y = get_batch(data[0],i,i+bs)
            i+=bs
            ct+=1
            user_ids = torch.LongTensor([user2idx[u] for u in x1]).cuda()
            item_ids = torch.LongTensor([item2idx[b] for b in x2]).cuda()
            y = torch.Tensor(y).cuda()
            # disregard/zero the gradients from previous computation
            model.zero_grad() 
            preds = model(user_ids,item_ids)
            loss = loss_function(preds, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        total_loss /= ct
        
        # getting the loss on validation set
        i = 0
        total_val_loss = 0.0
        cv=0
        m = model.eval() # setting the model to evaluation mode
        while i < len(data[1]):
            x11,x21,y1 = get_batch(data[1],i,i+bs)
            i+=bs
            cv+=1
            user_ids = torch.LongTensor([user2idx[u] for u in x11]).cuda()
            item_ids = torch.LongTensor([item2idx[b] for b in x21]).cuda()
            y1 = torch.Tensor(y1).cuda()
            preds = m(user_ids,item_ids)
            loss = loss_function(preds, y1)
            total_val_loss += loss.item()
        total_val_loss /= cv
        
        print('epoch', epoch+1, '   train loss', "%.3f" % total_loss, 
              '   valid loss', "%.3f" % total_val_loss)

In [28]:
def recommend_item_for_user(model, user_id):
    m = model.eval().cpu()
    user_ids = torch.LongTensor([user2idx[u] for u in [user_id]*len(items)])
    item_ids = torch.LongTensor([item2idx[b] for b in items])
    remove = set(ratings[ratings[user_col] == user_id][item_col].values)
    preds = m(user_ids,item_ids).detach().numpy()
    pred_item = [(p,b) for p,b in sorted(zip(preds,items), reverse = True) if b not in remove]
    return pred_item

def recommend_user_for_item(model, item_id):
    m = model.eval().cpu()
    user_ids = torch.LongTensor([user2idx[u] for u in users])
    item_ids = torch.LongTensor([item2idx[b] for b in [item_id]*len(users)])
    remove = set(ratings[ratings[item_col] == item_ids][user_col].values)
    preds = m(user_ids,item_ids).detach().numpy()
    pred_user = [(p,u) 
                 for p,u in sorted(zip(preds,users), reverse = True) if u not in remove]
    return pred_user

In [71]:
# get list of unique user ids
users = sorted(list(set(ratings[user_col].values)))

# get list of unique item ids
items = sorted(list(set(ratings[item_col].values)))

# generate dict of correponding indexes for the user ids
user2idx = list_2_dict(users)

# generate dict of correponding indexes for the item ids
item2idx = list_2_dict(items)

In [72]:
# create a model object
# y_range has been extended(0-11) than required(1-10) to make the
# values lie in the linear region of the sigmoid function
model = EmbeddingModel(5, len(users), len(items), [0,6], initialise = 0.01).cuda()

# split the data, returns a list [train, valid]
data = get_data(ratings, 0.25)

# loss = mean((target_rating - predicted_rating)**2)
loss_function = nn.MSELoss()

# optimizer function will update the weights of the Neural Net
optimizer = optim.SGD(model.parameters(), lr=0.003, momentum=0.9)

# batch size for each input
bs = 128

In [31]:
train(1, bs)

epoch 1    train loss 0.933    valid loss 0.787


In [58]:
train(5, bs)

epoch 1    train loss 0.932    valid loss 0.789
epoch 2    train loss 0.723    valid loss 0.678
epoch 3    train loss 0.645    valid loss 0.623
epoch 4    train loss 0.601    valid loss 0.589
epoch 5    train loss 0.572    valid loss 0.565


In [73]:
train(15, bs)

epoch 1    train loss 0.932    valid loss 0.789
epoch 2    train loss 0.723    valid loss 0.678
epoch 3    train loss 0.645    valid loss 0.622
epoch 4    train loss 0.601    valid loss 0.588
epoch 5    train loss 0.572    valid loss 0.565
epoch 6    train loss 0.551    valid loss 0.548
epoch 7    train loss 0.536    valid loss 0.534
epoch 8    train loss 0.523    valid loss 0.524
epoch 9    train loss 0.513    valid loss 0.515
epoch 10    train loss 0.505    valid loss 0.507
epoch 11    train loss 0.498    valid loss 0.501
epoch 12    train loss 0.491    valid loss 0.495
epoch 13    train loss 0.486    valid loss 0.490
epoch 14    train loss 0.481    valid loss 0.486
epoch 15    train loss 0.477    valid loss 0.482


In [47]:
#pd.DataFrame(recommend_user_for_item(model, 2),columns=["Ranking","User_ID"])

In [32]:
#pd.DataFrame(recommend_item_for_user(model, item_id_df.loc[item_id_df['beer_name'] == "Old Cherry"]
                                     ["Item_ID"].values[0]),columns=["Ranking","Item_ID"]).head(10)

NameError: name 'item_id_df' is not defined

In [49]:
#pd.unique(ratings["Item_ID"])

In [33]:
ratings2 = reviews[["review_profilename","beer_name","review_overall"]]
ratings2["Item_ID"] = pd.Categorical((pd.factorize(ratings2.beer_name)[0] + 1))
ratings2 = ratings2[["beer_name","Item_ID"]]
ratings2["Item_ID"] = ratings2["Item_ID"].subtract(len(ratings2["Item_ID"])*[1])
ratings2 = ratings2.drop_duplicates(keep='last')
Item_Name=list(ratings2["beer_name"].values)
ItemNameDict = list_2_dict(Item_Name)
df = pd.DataFrame.from_dict(ItemNameDict,orient='index')
df["beer_name"] = df.index
item_id_df = df.reset_index(drop=True)
item_id_df = item_id_df.rename(columns={0:"Item_ID"})
item_id_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Item_ID,beer_name
0,0,Sausa Weizen
1,1,Red Moon
2,2,Black Horse Black Beer
3,3,Sausa Pils
4,4,Cauldron DIPA


In [34]:
#len(ratings[ratings["Item_ID"] == 10])/len(ratings)

In [99]:
Model_df = pd.DataFrame(recommend_user_for_item(model, 
           item_id_df.loc[item_id_df['beer_name'] == "Old Cherry"]
           ["Item_ID"].values[0]),columns=["Ranking","Item_ID"]).tail(10)

In [100]:
Model_df = Model_df.merge(item_id_df, how="inner")
Model_df.sort_values(by="Ranking")

Unnamed: 0,Ranking,Item_ID,beer_name
9,3.039425,28550,2010 Reserve Old Boardhead Barley Wine Ale
8,3.03943,5196,Batch #69 Double Cream Ale - Tangerine
7,3.039436,7711,Ysta FÃ¤rskÃ¶l
6,3.039439,21725,Piwo Twierdzowe
5,3.039441,7639,Greene's Gargolye Amber
4,3.039454,10352,Berliner BÃ¼rgerbrÃ¤u Maibock
3,3.039456,22292,Beck's Green Lemon
2,3.039462,6574,Green Flash Super Freak
1,3.03947,14177,Sean's Super Nugget IPA
0,3.039471,21950,Sun Woo Kong Imperial Monkey IPA


In [53]:
#item_id_df.loc[item_id_df['Item_ID'] == 31944]

In [54]:
Check_df = pd.DataFrame(columns = ["Item_ID","beer_name"])
for x in Model_df["Item_ID"]:
    Check_df = Check_df.append(item_id_df.loc[item_id_df['Item_ID'] == x])

In [55]:
Check_df

Unnamed: 0,Item_ID,beer_name
53522,53522,Winter Nip Porter
23747,23747,Highland Kashmir IPA
1704,1704,The Dominator Dopplebock
14306,14306,Braugold Premium Lager
13305,13305,Hiroyuki Yuzu Marma-ale
53937,53937,Southport Pequot IPA
34113,34113,Barrel-Aged Orbiter Strong Ale
33350,33350,Sapporo Fuyumonogatari - The Winter's Tale
47703,47703,White Cap Wheat
54012,54012,El Toro William Jones Wheat Beer


In [205]:
#Extract Value from a row in a dtaframe
#item_id_df.loc[item_id_df['beer_name'] == "Sausa Weizen"]["Item_ID"].values[0]