In [0]:
import pandas as pd
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
import itertools
import pickle

# Load the small English pipeline by its full package name: the 'en' shortcut
# was removed in spaCy v3, where spacy.load('en') raises an error.
nlp = spacy.load('en_core_web_sm')

In [0]:
# Colab only: mount Google Drive at /content/drive so the pickles read and
# written by the cells below are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Loading the Nevada restaurant review data.
# NOTE: pickle.load executes arbitrary code from the file; only use it on files you trust.
with open('/content/drive/My Drive/Dataset/nv_restaurants_reviews.pkl', 'rb') as f:
  nv_restaurants_reviews = pickle.load(f)

In [0]:
# Keep only the columns used by the filtering steps and show how many rows there are
required_columns = ['business_id', 'user_id', 'stars', 'text']
review_df_restaurant = nv_restaurants_reviews.loc[:, required_columns]
len(review_df_restaurant)

In [0]:
# One row per unique (business, user) combination, carrying the mean star rating of that pair
review_df_restaurant_avg = (
    review_df_restaurant
    .groupby(['business_id', 'user_id'], as_index=False)
    .agg({"stars": "mean"})
)

In [0]:
# Keep the rows of businesses that have more than 50 (business, user) rows,
# then show how many such businesses there are
rows_per_business = review_df_restaurant_avg.groupby('business_id')['business_id'].transform('count')
BusinessGreaterThan50Reviews = review_df_restaurant_avg[rows_per_business > 50]
BusinessGreaterThan50Reviews.business_id.nunique()

In [0]:
# Among the retained businesses, keep the rows of users that have more than 20 rows,
# then show how many such users there are
rows_per_user = BusinessGreaterThan50Reviews.groupby('user_id')['user_id'].transform('count')
UsersGreaterThan20Reviews = BusinessGreaterThan50Reviews[rows_per_user > 20]
UsersGreaterThan20Reviews.user_id.nunique()

In [0]:
# From here on the name holds only the array of qualifying user ids (no longer a DataFrame)
UsersGreaterThan20Reviews = UsersGreaterThan20Reviews['user_id'].unique()
UsersGreaterThan20Reviews

In [0]:
# From here on the name holds only the array of qualifying business ids (no longer a DataFrame)
BusinessGreaterThan50Reviews = BusinessGreaterThan50Reviews['business_id'].unique()
BusinessGreaterThan50Reviews

In [0]:
# Restrict the full review table to qualifying users AND qualifying businesses
from_kept_user = nv_restaurants_reviews['user_id'].isin(UsersGreaterThan20Reviews)
of_kept_business = nv_restaurants_reviews['business_id'].isin(BusinessGreaterThan50Reviews)
nv_restaurants_reviews = nv_restaurants_reviews[from_kept_user & of_kept_business]

In [0]:
# Drop the columns that are not required for this task
unused_columns = ['cool','date','funny','useful','review_id']
nv_restaurants_reviews = nv_restaurants_reviews.drop(columns=unused_columns)

In [0]:
# Preview the first rows after filtering and dropping columns
nv_restaurants_reviews.head()

In [0]:
# Distinct user ids and business ids that remain after filtering
unique_user_id = nv_restaurants_reviews.user_id.unique()
unique_business_id = nv_restaurants_reviews.business_id.unique()

In [0]:
# Map every original id to a consecutive integer, numbering from 1
unique_user_id_dict = {raw_id: number for number, raw_id in enumerate(unique_user_id, 1)}
unique_business_id_dict = {raw_id: number for number, raw_id in enumerate(unique_business_id, 1)}

In [0]:
# Swap the alphanumeric ids for their integer codes.
# Not re-runnable: a second run looks up integers in the dicts and raises KeyError.
nv_restaurants_reviews['user_id'] = [
    unique_user_id_dict[raw_id] for raw_id in nv_restaurants_reviews['user_id']
]
nv_restaurants_reviews['business_id'] = [
    unique_business_id_dict[raw_id] for raw_id in nv_restaurants_reviews['business_id']
]
nv_restaurants_reviews.head(10)

In [0]:
# Fix the column order; later cells index rows by position (0=user_id, 1=business_id, 2=stars, 3=text)
ordered_columns = ['user_id','business_id','stars','text']
nv_restaurants_reviews = nv_restaurants_reviews.loc[:, ordered_columns]
nv_restaurants_reviews.head()

In [0]:
# Tokenization and cleaning of reviews
def clean(s):
  """Tokenize ``s`` with the notebook-level spaCy pipeline ``nlp`` and filter the tokens.

  A token is kept only if it is alphabetic, is not a stop word and is not
  punctuation. Returns the text of the kept tokens as a list, in document order.
  """
  kept_words = []
  for tok in nlp(s):
    if tok.is_stop or tok.is_punct or not tok.is_alpha:
      continue
    kept_words.append(tok.text)
  return kept_words

In [0]:
# Main tokenization function
def tokenization (data):
  """Run ``clean`` on the value at position 3 of every row of ``data``.

  Prints the number of rows still to process whenever that number is a
  multiple of 1000, and returns one token list per row, in row order.
  """
  total_rows = data.shape[0]
  tokenized = []
  # ``remaining`` counts down from the row count to 1, one step per row
  for remaining, row in zip(range(total_rows, 0, -1), data.values):
    if remaining % 1000 == 0:
      print(remaining)
    tokenized.append(clean(row[3]))
  return tokenized

In [0]:
# Tokenize every review and store the token lists as a new "text_token" column.
# This makes one spaCy call per row, so it is the slow step of the notebook.
text_token = tokenization(nv_restaurants_reviews)
nv_restaurants_reviews["text_token"] = text_token
nv_restaurants_reviews

In [0]:
# (rows, columns) of the tokenized review table
nv_restaurants_reviews.shape

In [0]:
# Checkpoint: save the filtered, tokenized reviews so the tokenization need not be repeated
with open('/content/drive/My Drive/nv_restaurants_reviews_filtered.pkl', 'wb') as f:
  pickle.dump(nv_restaurants_reviews,f)

In [0]:
# Reload the checkpoint written above.
# NOTE: resuming from this cell on a fresh kernel restores only the DataFrame; the id
# dictionaries used a few cells below are not saved and must be rebuilt by running from the top.
with open('/content/drive/My Drive/nv_restaurants_reviews_filtered.pkl', 'rb') as f:
  nv_restaurants_reviews = pickle.load(f)

In [0]:
# Business ids and user ids used for test in task1 part A.
# The cells below read element 0 of each entry as the user id and element 1 as the business id.
with open('/content/drive/My Drive/Dataset/user_business_id_test.pkl', 'rb') as f: # Business ids and user ids used for test in task1 part A
  user_business_id_test = pickle.load(f)

In [0]:
# Number of held-out (user, business) entries loaded from task 1 part A
len(user_business_id_test)

In [0]:
# Converting the alphanumeric business ids and user ids from task1 part A to their respective numeric representation
business_test_ids = [i[1] for i in user_business_id_test]
users_test_ids = [i[0] for i in user_business_id_test]

business_test =  []
user_test = []

# Translate each (user, business) pair as a unit and keep it only when BOTH ids
# are known. Filtering the two lists independently drops entries at different
# positions whenever just one id of a pair is unknown; the lists then go out of
# step and the later zip(user_test, business_test) pairs users with the wrong
# businesses.
for raw_user_id, raw_business_id in zip(users_test_ids, business_test_ids):
  if raw_user_id in unique_user_id_dict and raw_business_id in unique_business_id_dict:
    user_test.append(unique_user_id_dict[raw_user_id])
    business_test.append(unique_business_id_dict[raw_business_id])

In [0]:
# Tag every review with its (user_id, business_id) pair, then split on membership in the
# held-out pairs so that train/validation mirror task 1 part A
nv_restaurants_reviews['tuples'] = list(
    zip(nv_restaurants_reviews['user_id'], nv_restaurants_reviews['business_id'])
)
user_business_test = list(zip(user_test, business_test))
in_held_out_pairs = nv_restaurants_reviews['tuples'].isin(user_business_test)
validation = nv_restaurants_reviews[in_held_out_pairs]
train = nv_restaurants_reviews[~in_held_out_pairs]

In [0]:
# Stack the train rows on top of the validation rows (row labels are kept as they are).
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported equivalent and also works on older versions.
train_validation = pd.concat([train, validation])
train_validation.head()

In [0]:
# Free the id lookup structures. After this cell the id-translation cells above
# cannot be re-run without rebuilding them from the top.
del unique_business_id, unique_user_id, unique_business_id_dict, unique_user_id_dict

In [0]:
#Collating all the reviews and user/business ids for user and business
user_reviews={}
business_reviews={}
user_business_id={}
business_user_id={}

for row in train_validation.values:
    # Positional access: 0 = user_id, 1 = business_id, 4 = text_token
    # (assumes the column order set up by the earlier cells)
    uid, bid, tokens = row[0], row[1], row[4]

    # Per user: the token lists of their reviews and the businesses they reviewed
    user_reviews.setdefault(uid, []).append(tokens)
    user_business_id.setdefault(uid, []).append(bid)

    # Per business: the token lists of its reviews and the users who wrote them
    business_reviews.setdefault(bid, []).append(tokens)
    business_user_id.setdefault(bid, []).append(uid)

In [0]:
# The combined frame is no longer needed once the lookup dicts are built; release it
del train_validation

In [0]:
#Collating all the reviews and user/business ids for user and business for train data
user_business_id_train = []
business_user_id_train = []
user_id_train = []
business_id_train = []
y_train = []

u_text = {}
b_text = {}

for row in train.values:
  # Positional access: 0 = user_id, 1 = business_id, 2 = stars
  uid, bid, rating = row[0], row[1], row[2]

  user_id_train.append(uid)
  business_id_train.append(bid)

  # First time a user is seen: take a shallow copy of their collected reviews
  if uid not in u_text:
    u_text[uid] = list(user_reviews[uid])
  # The appended list is the same object stored in user_business_id (not a copy)
  user_business_id_train.append(user_business_id[uid])

  # Same two steps for the business side
  if bid not in b_text:
    b_text[bid] = list(business_reviews[bid])
  business_user_id_train.append(business_user_id[bid])

  y_train.append(rating)

In [0]:
#Collating all the reviews and user/business ids for user and business for validation data
user_business_id_valid = []
business_user_id_valid = []
user_id_valid = []
business_id_valid = []
y_valid = []

for j in validation.values:
  # Positional access: 0 = user_id, 1 = business_id, 2 = stars
  user_id_valid.append(j[0])
  business_id_valid.append(j[1])

  if j[0] in u_text:
    user_business_id_valid.append(user_business_id[j[0]])
  else:
    # User never seen in the train data: use a single padding review ...
    u_text[j[0]] = [['<PAD/>']]
    # ... and a placeholder business-id list. This entry belongs to the VALIDATION
    # list; appending it to user_business_id_train (as before) left the validation
    # list one entry short and the train list one entry long, so the per-row lists
    # no longer lined up with user_id_valid / y_valid.
    user_business_id_valid.append([0])

  if j[1] in b_text:
    business_user_id_valid.append(business_user_id[j[1]])
  else:
    # Business never seen in the train data: padding review and placeholder user-id list.
    # The placeholder is 0, matching the user branch; ids are numbered from 1, so the
    # previous value 1 was a real user id rather than padding.
    b_text[j[1]] = [['<PAD/>']]
    business_user_id_valid.append([0])

  y_valid.append(j[2])

In [0]:
# Everything needed from the two frames now lives in the lists/dicts built above; release them
del train, validation

In [0]:
# Taking the 90th percentile as the max length for reviews and for the collated business/user id lists
def _percentile_90(values):
  """Return the element at index int(0.9 * n) - 1 of ``values`` sorted ascending."""
  return np.sort(values)[int(0.9 * len(values)) - 1]

# Users: reviews per user, and tokens per review
user_reviews_num = np.array([len(reviews) for reviews in u_text.values()])
user_reviews_len = np.array([len(review) for reviews in u_text.values() for review in reviews])
user_reviews_num_max = _percentile_90(user_reviews_num)
user_reviews_len_max = _percentile_90(user_reviews_len)

# Businesses: reviews per business, and tokens per review
business_reviews_num = np.array([len(reviews) for reviews in b_text.values()])
business_reviews_len = np.array([len(review) for reviews in b_text.values() for review in reviews])
business_reviews_num_max = _percentile_90(business_reviews_num)
business_reviews_len_max = _percentile_90(business_reviews_len)

# Largest numeric id on each side
user_num = max(u_text)
business_num = max(b_text)

for label, value in (
    ('user_reviews_num_max: ', user_reviews_num_max),
    ('user_reviews_len_max: ', user_reviews_len_max),
    ('business_reviews_num_max: ', business_reviews_num_max),
    ('business_reviews_len_max: ', business_reviews_len_max),
):
  print(label, value)


In [0]:
# To pad reviews if they are smaller than max length and cut reviews if they are bigger than max length

def pad_sentences(text,max_reviews_num, max_review_len):
  """Give every key exactly ``max_reviews_num`` reviews of ``max_review_len`` tokens.

  ``text`` maps a key to a list of reviews, each review a list of tokens.
  Reviews beyond ``max_reviews_num`` are dropped; shorter reviews are
  right-padded with "<PAD/>" and longer ones are cut; missing reviews are
  filled with all-padding reviews. Returns a new dict; ``text`` is not modified.
  """
  pad_token = "<PAD/>"
  padded_text = {}
  for key, reviews in text.items():
      rows = []
      # max(..., 0) keeps a non-positive limit from turning into a from-the-end slice
      for review in reviews[:max(max_reviews_num, 0)]:
          shortfall = max_review_len - len(review)
          if shortfall > 0:
              rows.append(review + [pad_token] * shortfall)
          else:
              rows.append(review[:max_review_len])
      # Top up with all-padding reviews until the review count is reached
      for _ in range(max_reviews_num - len(rows)):
          rows.append([pad_token] * max_review_len)
      padded_text[key] = rows

  return padded_text



In [0]:
# Pad each collated id list up to the max length, and truncate it if it is longer than the max length

def pad_reviewid(u_train, u_valid, u_len, num):
    """Bring every id list in ``u_train`` and ``u_valid`` to exactly ``u_len`` entries.

    Lists shorter than ``u_len`` are right-padded with ``num``; longer lists are
    truncated. The input lists are never modified: callers may pass the same
    list object many times (one reference per row), and padding it in place
    would silently alter every other holder of that list.

    Args:
        u_train: sequence of id lists for the training rows.
        u_valid: sequence of id lists for the validation rows.
        u_len: target length of every returned list.
        num: fill value used for padding.

    Returns:
        ``(pad_u_train, pad_u_valid)``: two lists of new lists, in input order.
    """
    pad_u_train = [_fit_to_length(ids, u_len, num) for ids in u_train]
    pad_u_valid = [_fit_to_length(ids, u_len, num) for ids in u_valid]
    return pad_u_train, pad_u_valid


def _fit_to_length(ids, length, fill):
    """Return a new list: ``ids`` cut to ``length`` items, or right-padded with ``fill``."""
    return list(ids[:length]) + [fill] * (length - len(ids))

In [0]:
# Padding for users: review texts per user, and the business-id list of each row.
# The fill id is business_num + 1, one past the largest business id, so it cannot
# collide with a real business id.
u_text1 = pad_sentences(u_text, user_reviews_num_max, user_reviews_len_max)
reid_user_train, reid_user_valid = pad_reviewid(user_business_id_train, user_business_id_valid, user_reviews_num_max, business_num + 1)

In [0]:
# Padding for business: review texts per business, and the user-id list of each row.
# The fill id is user_num + 1, one past the largest user id, so it cannot collide
# with a real user id.
b_text1 = pad_sentences(b_text, business_reviews_num_max, business_reviews_len_max)
reid_business_train, reid_business_valid = pad_reviewid(business_user_id_train, business_user_id_valid, business_reviews_num_max, user_num + 1)

In [0]:
# Flatten one level: every padded review (a token list) of every user / business, in dict order
user_voc = list(itertools.chain.from_iterable(u_text1.values()))
business_voc = list(itertools.chain.from_iterable(b_text1.values()))

In [0]:
# Building the vocabulary and the word-to-index mapping
def build_vocab(sentences1, sentences2):
    """Build an alphabetically ordered vocabulary for each of two corpora.

    Args:
        sentences1: iterable of token lists (first corpus).
        sentences2: iterable of token lists (second corpus).

    Returns:
        ``[vocabulary1, vocabulary_inv1, vocabulary2, vocabulary_inv2]`` where
        each ``vocabulary_inv`` is the sorted list of distinct tokens
        (index -> word) and each ``vocabulary`` is the matching word -> index dict.
    """
    vocabulary_inv1 = _sorted_vocabulary(sentences1)
    vocabulary1 = {word: index for index, word in enumerate(vocabulary_inv1)}

    vocabulary_inv2 = _sorted_vocabulary(sentences2)
    vocabulary2 = {word: index for index, word in enumerate(vocabulary_inv2)}
    return [vocabulary1, vocabulary_inv1, vocabulary2, vocabulary_inv2]


def _sorted_vocabulary(sentences):
    """Return the distinct tokens of ``sentences`` in sorted order."""
    # chain.from_iterable streams the sentences instead of unpacking the whole
    # corpus into call arguments; counting frequencies is unnecessary because
    # the final order is alphabetical.
    return sorted(set(itertools.chain.from_iterable(sentences)))

In [0]:
# Building the vocabulary and the word-to-index mapping, separately for user and business reviews
vocabulary_user, vocabulary_inv_user, vocabulary_business, vocabulary_inv_business = build_vocab(user_voc, business_voc)

In [0]:
# The flattened review lists were only needed to build the vocabularies; release them
del user_voc, business_voc

In [0]:
# Drop the unpadded review dicts; the names u_text / b_text are reused below for the encoded versions
del u_text, b_text

In [0]:
# Building data for input
def build_input_data(u_text, i_text, vocabulary_u, vocabulary_i):
    """Replace every token by its vocabulary index.

    ``u_text`` / ``i_text`` map a key to a list of reviews (token lists);
    ``vocabulary_u`` / ``vocabulary_i`` map a token to its index. Returns two
    dicts with the same keys whose values are NumPy arrays built from the
    index lists. A token missing from its vocabulary raises ``KeyError``.
    """
    def encode(reviews_by_key, vocabulary):
        return {
            key: np.array([[vocabulary[token] for token in review] for review in reviews])
            for key, reviews in reviews_by_key.items()
        }

    return encode(u_text, vocabulary_u), encode(i_text, vocabulary_i)

In [0]:
# Encode the padded reviews as arrays of vocabulary indices (one array per user / business)
u_text, b_text = build_input_data(u_text1, b_text1, vocabulary_user, vocabulary_business)

In [0]:
# The padded token lists are superseded by the encoded arrays; release them
del u_text1, b_text1

In [0]:
# Convert the collected Python lists to NumPy arrays
# Ratings
y_train, y_valid = np.array(y_train), np.array(y_valid)

# User ids and business (item) ids of each row
uid_train, uid_valid = np.array(user_id_train), np.array(user_id_valid)
iid_train, iid_valid = np.array(business_id_train), np.array(business_id_valid)

# Padded id lists of each row
reid_user_train, reid_user_valid = np.array(reid_user_train), np.array(reid_user_valid)
reid_item_train, reid_item_valid = np.array(reid_business_train), np.array(reid_business_valid)


In [0]:
#Shuffling the training data with a fixed seed; validation keeps its order
np.random.seed(42)

shuffle_indices = np.random.permutation(np.arange(len(y_train)))

# Apply the same permutation to every training array; the 1-D arrays also
# get a trailing axis of length 1
userid_train = uid_train[shuffle_indices][:, np.newaxis]
itemid_train = iid_train[shuffle_indices][:, np.newaxis]
y_train = y_train[shuffle_indices][:, np.newaxis]
reid_user_train = reid_user_train[shuffle_indices]
reid_item_train = reid_item_train[shuffle_indices]

# Validation arrays only get the trailing axis
userid_valid = uid_valid[:, np.newaxis]
itemid_valid = iid_valid[:, np.newaxis]
y_valid = y_valid[:, np.newaxis]

In [0]:
# One tuple per row: (user id, item id, padded id list of the user, padded id list of the item, rating).
# zip stops at the shortest input, so all five arrays of a split must have the same length.
batches_train = list(zip(userid_train, itemid_train, reid_user_train, reid_item_train, y_train))
batches_valid = list(zip(userid_valid, itemid_valid, reid_user_valid, reid_item_valid, y_valid))


# Persist both splits to Drive
with open('/content/drive/My Drive/yelp_NARRE_train_filtered.pkl', 'wb') as f:
  pickle.dump(batches_train, f)

with open('/content/drive/My Drive/yelp_NARRE_valid_filtered.pkl', 'wb') as f:
  pickle.dump(batches_valid, f)



In [0]:
#Storing parameters required
# The entries for id 1 are used to read off the padded array shape
first_user_reviews = u_text[1]
first_business_reviews = b_text[1]

para = {
    'user_num': user_num,
    'item_num': business_num,
    'review_num_u': first_user_reviews.shape[0],
    'review_num_i': first_business_reviews.shape[0],
    'review_len_u': first_user_reviews.shape[1],
    'review_len_i': first_business_reviews.shape[1],
    'user_vocab': vocabulary_user,
    'item_vocab': vocabulary_business,
    'train_length': len(y_train),
    'test_length': len(y_valid),
    'u_text': u_text,
    'i_text': b_text,
}

In [0]:
# Persist the parameter dict (sizes, vocabularies and encoded review texts) to Drive
with open('/content/drive/My Drive/NARRE_para_filtered.pkl', 'wb') as f:
  pickle.dump(para,f)