# Fit the CB models based on optimized parameters

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import os
import re, string, unicodedata
import nltk
import contractions
import inflect

import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer

import re

from wordcloud import WordCloud, STOPWORDS

import nltk
# import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk import SnowballStemmer
from nltk.stem import LancasterStemmer, WordNetLemmatizer


from sklearn import metrics
from sklearn.model_selection import train_test_split
# from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder

import scipy

from gensim import corpora
from gensim import corpora
from gensim.similarities.docsim import Similarity
from gensim import corpora, models, similarities

import pickle
import time
import Utils as util

## Load all functions

In [2]:
### Text Prepocessing
def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
            
    return new_words

def remove_special(words):
    """Remove special signs like &*"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[-,$()#+&*]', '', word)
        if new_word != '':
            new_words.append(new_word)
            
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
            
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""  
    stopwords = nltk.corpus.stopwords.words('english')
    myStopWords = []
    stopwords.extend(myStopWords)
    new_words = []
    for word in words:
        if word not in stopwords:
            new_words.append(word)
            
    return new_words

def to_lowercase(words):
    """Convert words to lowercase"""
    new_words=[]
    for word in words:
        new_words.append(word.lower())
        
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stemmer = SnowballStemmer('english')
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
        
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
        
    return lemmas

def normalize_lemmatize(words):
    words = remove_special(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    words = stem_words(words)
    words = lemmatize_verbs(words)
    return words

def get_processed(sample):
    processed = pd.DataFrame(data=[],columns = ['business_id', 'text'])
    new_texts = []

    for i in range(0, len(sample)):
        business_id = sample['business_id'].iloc[i]
        words = nltk.word_tokenize(sample['text'].iloc[i])
        text = ' '.join(normalize_lemmatize(words))
        dfnew = pd.DataFrame([[business_id, text]], columns=['business_id', 'text'])
        new_texts.append(text)
        processed = processed.append(dfnew,ignore_index = True)
    return processed

## Similarity matrix

def get_tfidf_matrix(processed):
    '''
    get the Tf-Idf matrix of processed texts for business reviews
    
    '''
    TV = TfidfVectorizer(stop_words = "english")
    processed["text"] = processed["text"].fillna('')
    tfidf_matrix = TV.fit_transform((processed["text"]))
    
    return tfidf_matrix

def get_cos_sim_matrix(tfidf_matrix, n):
    '''
    use truncated SVD to reduce dimensions to n 
    @n: the dimensions to keep
    '''
    SVD = TruncatedSVD(n_components = n , random_state = 42) # 42 is the ultimate answer to everything
    tfidf_truncated = SVD.fit_transform(tfidf_matrix)
    cosine_sim = cosine_similarity(tfidf_truncated, tfidf_truncated)
    
    return cosine_sim

def get_euclidean_sim(business, n_components):
    
    SVD = TruncatedSVD(n_components = n_components , random_state = 42) # 42 is the ultimate answer to everything
    bus_truncated = SVD.fit_transform(business)
    
    eucl_dist = euclidean_distances(bus_truncated)
    eucl_sim = 1/np.exp(eucl_dist)
    return eucl_sim
    
def get_buscosine_sim(business,n_components):
    SVD = TruncatedSVD(n_components = n_components , random_state = 42) # 42 is the ultimate answer to everything
    bus_truncated = SVD.fit_transform(business)
    
    cosine_sim = cosine_similarity(bus_truncated, bus_truncated)
    return cosine_sim

def get_mix_sim_matrix(cosine_sim, bus_cos_sim, lmbda = 0.5, ):
    mixed_sim = np.add(cosine_sim*lmbda, bus_cos_sim*(1-lmbda)) # assume equally weighted

    return mixed_sim

def get_mix_sim_df(df_tfidf_sim, df_bus_sim, lmbda = 0.5):
    df_tfidf_pivot = pd.melt(df_tfidf_sim.reset_index(), id_vars = "index" , value_vars = df_tfidf_sim.columns.values)
    df_bus_pivot = pd.melt(df_bus_sim.reset_index(), id_vars = "index" , value_vars = df_bus_sim.columns.values)
    
    df_merge = pd.merge(df_tfidf_pivot, df_bus_pivot, on = ["index", "variable"])
    df_merge["value"] = (lmbda) * df_merge["value_x"] + (1-lmbda) * df_merge["value_y"]
    
    df_mixed_sim = pd.pivot(df_merge, index="index", columns="variable", values ="value")
    return df_mixed_sim
## Get recommendations and prediction
def get_recommendation_cos(reviews, business_id, user_id, df_sim, k):
    '''get the business_id_array that shows top_k greatest similarity to the specific business_id'''
    user_bids = reviews[reviews['user_id']==user_id]['business_id'].values
    df_user = df_sim.loc[df_sim.index.isin(user_bids), df_sim.columns == business_id]
    df_user_topk = df_user.sort_values(df_user.columns[0], ascending = False).iloc[:k]
    
    return np.array(df_user_topk.index.values)


def predict_rating(reviews, user_id, business_ids):
    '''predict the avg of the user's rating on business in business_ids'''
    scores = reviews.loc[(reviews.user_id == user_id) & (reviews.business_id.isin(business_ids))]["stars"].values
    
    return np.mean(scores)


def get_results_cos(reviews, reviews_test, business_id, user_id, df_sim, k):
    '''
    prediction on the business_id：avg the ratings on top_k business that shows similarity to the business_id
    actual on the business_id: the true rating 
    '''
    actual = reviews_test.loc[(reviews_test.user_id==user_id) & (reviews_test.business_id==business_id)]['stars'].values[0]
    business_ids = get_recommendation_cos(reviews, business_id, user_id, df_sim, k)
    prediction = predict_rating(reviews, user_id, business_ids)
    
    return actual, prediction

def get_review_processed(processed, reviews):
    reviews_processed = reviews.loc[reviews.business_id.isin(processed.business_id)]\
                                                       .reset_index()\
                                                       .drop(columns=['index'])
    return reviews_processed

def CB_predict(reviews, reviews_test, df_sim, k = 5):
    '''
    based on test_df 
    get a dataframe with each user on each business's true ratings and prediction ratings
    @df_sim, n*n DataFrame for business similarities
    @k: int, top k similar businesses
    '''
    user_id_sample = reviews_test['user_id'].values
    busi_id_sample = reviews_test['business_id'].values
    
    actual = []
    predictions = []
    
    for i in range(len(reviews_test)):
        try:
            act, pred = get_results_cos(reviews, reviews_test, busi_id_sample[i], user_id_sample[i], df_sim, k)
            actual.append(act)
            predictions.append(pred)
            
        except:
            actual.append(np.nan)
            predictions.append(np.nan)
            
    return pd.DataFrame({"user_id": user_id_sample,
                         "business_id": busi_id_sample,
                         "true_ratings": actual,
                         "prediction_ratings":  predictions                        
                        })

## LSI model
def get_lsi(processed, reviews, user_id, n_topics):
    '''
    get the lsi model for user_id
    '''
    user_bids = reviews[reviews['user_id']==user_id]['business_id'].values
    processed_user = processed.loc[processed.business_id.isin(user_bids)]
    documents = list(processed_user['text'].values)
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts] 

    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=n_topics)
    
    return lsi, dictionary, corpus

def get_recommendation_lsi(processed, reviews, df_lsi, business_id, user_id, k, n_topics):
    lsi = df_lsi[(df_lsi["n_topic"] == n_topics) & (df_lsi["user_id"] == user_id)]["lsi"][0]
    dictionary = df_lsi[(df_lsi["n_topic"] == n_topics) & (df_lsi["user_id"] == user_id)]["dictionary"][0]
    user_bids = reviews[reviews['user_id']==user_id]['business_id'].values
    processed_user = processed.loc[processed.business_id.isin(user_bids)]
    documents = list(processed_user['text'].values)
    texts = [[word for word in document.split(' ')] for document in documents]
    corpus = [dictionary.doc2bow(text) for text in texts] 
    
    doc = processed['text'].loc[processed.business_id==business_id].values[0]
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lsi = lsi[vec_bow]
    index = similarities.MatrixSimilarity(lsi[corpus])
    sims = list(index[vec_lsi])
    results = list(zip(user_bids, sims))
    results_ordered = np.array(sorted(results, key=lambda x: x[1], reverse=True))
    results_topk = results_ordered[:k]
    
    return results_topk[:,0]

def get_results_lsi(processed,reviews,reviews_test, df_lsi ,business_id,user_id,k,n_topics):
    actual = reviews_test.loc[(reviews_test.user_id==user_id) & (reviews_test.business_id==business_id)]['stars'].values[0]
    business_ids = get_recommendation_lsi(processed,reviews,df_lsi ,business_id,user_id,k,n_topics)
    prediction = predict_rating(reviews, user_id, business_ids)
    return actual, prediction

def get_recommendation_lsi(processed,reviews,business_id,user_id,k,n_topics):
    user_bids = reviews[reviews['user_id']==user_id]['business_id'].values
    processed_user = processed.loc[processed.business_id.isin(user_bids)]
    documents = list(processed_user['text'].values)
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts] 

    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=n_topics)
    doc = processed['text'].loc[processed.business_id==business_id].values[0]
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lsi = lsi[vec_bow]
    index = similarities.MatrixSimilarity(lsi[corpus])
    sims = list(index[vec_lsi])
    results = list(zip(user_bids, sims))
    results_ordered = np.array(sorted(results, key=lambda x: x[1], reverse=True))
    results_topk = results_ordered[:k]
    
    return results_topk[:,0]

def get_results_lsi(processed,reviews,reviews_test ,business_id,user_id,k,n_topics):
    actual = reviews_test.loc[(reviews_test.user_id==user_id) & (reviews_test.business_id==business_id)]['stars'].values[0]
    business_ids = get_recommendation_lsi(processed,reviews,business_id,user_id,k,n_topics)
    prediction = predict_rating(reviews, user_id, business_ids)
    return actual, prediction

def CB_LSI_predict(df_texts_train, reviews, reviews_test, k = 5, n_topics = 100):
    uid_sample = reviews_test['user_id'].values
    bid_sample = reviews_test['business_id'].values
    actual_lsi = []
    predictions_lsi = []
    for i in range(len(reviews_test)):
        try:
            act, pred = get_results_lsi(df_texts_train, reviews, reviews_test, bid_sample[i],uid_sample[i], k, n_topics)
            predictions_lsi.append(pred)
            actual_lsi.append(act)

        except:
            predictions_lsi.append(np.nan)
            actual_lsi.append(np.nan)
            
    return pd.DataFrame({"user_id": uid_sample,
                         "business_id": bid_sample,
                         "ratings": actual_lsi,
                         "pred_lsi":  predictions_lsi})

def get_recommendation_cos_full(reviews, user_id, df_sim, k, busi_id_lst):
    '''get the business_id_array that shows top_k greatest similarity to the specific business_id'''
    
    df_user_rating = reviews[reviews.user_id == user_id]
#     df_sim = df_sim.loc[busi_id_lst, busi_id_lst]
    user_bids = df_user_rating['business_id'].values
    df_user = df_sim.loc[df_sim.index.isin(user_bids)]
    df_user_rank = df_user.rank(ascending = False, axis = 0)
    df_user_rank[df_user_rank <= k] = 1
    df_user_rank[df_user_rank > k] = 0
    df_user_rank = df_user_rank/ np.min([k, len(user_bids)])
  
    user_rating_matrix = np.array(df_user_rating[["business_id", "stars"]].set_index(["business_id"]).loc[df_user_rank.index.values])
    pred = user_rating_matrix.T @ np.array(df_user_rank)
    return pred

def CB_sim_fit_full_matrix(train_valid_df, df_sim, k, user_id_lst, busi_id_lst):
    rating_pred_matrix = np.zeros((len(user_id_lst), len(busi_id_lst)))
    for i,user_id in enumerate(user_id_lst):
        rating_pred_matrix[i,] = get_recommendation_cos_full(train_valid_df, user_id, df_sim, k, busi_id_lst)
    return(rating_pred_matrix)

def get_mse(pred, actual):
    # Ignore zero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)


## Load Data

In [3]:
# cand_users = pickle.load(open('user_mapping', "rb"))
# # cand_rest = pickle.load(open('business_mapping', "rb"))
# list_user = list(cand_users.values())
# # list_rest = list(cand_rest.values())

In [4]:
## Read data
rev_busi = pd.read_csv('filtered_reviews.csv', parse_dates=["date"])

user_id_lst = rev_busi['user_id'].unique().tolist() # rows of sparse matrix
# user1 = set(user_id)
# user2 = set(list_user)
# user_id_lst = list(user1.intersection(user2))
busi_id_lst = rev_busi['business_id'].unique().tolist() # columns of sparse matrix

# train_valid_df  = pickle.load(open('../data/train_valid_df.pkl', "rb"))
test_df = pickle.load(open('test_df.pkl', "rb"))
train_df  = pickle.load(open('train_df.pkl', "rb"))
valid_df = pickle.load(open('valid_df.pkl', "rb"))
train_valid_df = pickle.load(open('train_valid_df.pkl', "rb"))

train_sparse_matrix = np.load('train_sparse_matrix.npy')
test_sparse_matrix = np.load('test_sparse_matrix.npy')
valid_sparse_matrix = np.load('valid_sparse_matrix.npy')
train_valid_sparse_matrix = np.load('train_valid_sparse_matrix.npy')

bus_df_subset = pd.read_csv("business_subset_cleaned.csv", index_col= "business_id")
bus_df_subset.head(1)

Unnamed: 0_level_0,latitude,longitude,stars,review_count,is_open,RestaurantsGoodForGroups,GoodForKids,RestaurantsReservations,RestaurantsTableService,HappyHour,...,Ambience_upscale,Parking,music,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MTSW4McQd7CbVtyjqoe9mw,0.637723,0.925908,0.382081,-0.469568,1,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,1


In [5]:
l = pd.DataFrame(user_id_lst)
l.to_csv('user_list.csv')

In [6]:
l2 = pd.DataFrame(busi_id_lst)
l2.to_csv('rest_list.csv')

In [7]:
l2.shape

(4312, 1)

## 1. non-NLP CB Model (CB_Bus)

In [8]:
k1 = 15
similarity = "cos" # or "eucl"
n_components1 = 10

if similarity == "cos":
    bus_sim = get_buscosine_sim(bus_df_subset, n_components1)
else:
    bus_sim = get_euclidean_sim(bus_df_subset, n_components1)
    
df_bus_sim  = pd.DataFrame(bus_sim, index= bus_df_subset.index.values, columns=bus_df_subset.index.values)
df_bus_sim.shape


(4312, 4312)

In [9]:
rev_busi.head(2)

Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars,text,date,name,city,state,category_1,category_2,category_3
0,2,S6pQZQocMB1WHMjTRbt77A,ejFxLGqQcWNLdNByJlIhnQ,XQfwVwDr-v0ZS3_CbbE5Xw,4.0,The place is cute and the staff was very frien...,2017-08-08 00:58:18,Turning Point of North Wales,North Wales,PA,Restaurants,Breakfast & Brunch,Food
1,3,WqgTKVqWVHDHjnjEsBvUgg,f7xa0p_1V9lx53iIGN5Sug,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,We came on a Saturday morning after waiting a ...,2017-11-19 02:20:23,Turning Point of North Wales,North Wales,PA,Restaurants,Breakfast & Brunch,Food


In [10]:
def get_recommendation_cos_full(reviews, user_id, df_sim, k, busi_id_lst):
    '''get the business_id_array that shows top_k greatest similarity to the specific business_id'''
    
    df_user_rating = reviews[reviews.user_id == user_id]
#     df_sim = df_sim.loc[busi_id_lst, busi_id_lst]
    user_bids = df_user_rating['business_id'].values
    df_user = df_sim.loc[df_sim.index.isin(user_bids)]
    df_user_rank = df_user.rank(ascending = False, axis = 0)
    df_user_rank[df_user_rank <= k] = 1
    df_user_rank[df_user_rank > k] = 0
    df_user_rank = df_user_rank/ np.min([k, len(user_bids)])
  
    user_rating_matrix = np.array(df_user_rating[["business_id", "stars"]].set_index(["business_id"]).loc[df_user_rank.index.values])
    pred = user_rating_matrix.T @ np.array(df_user_rank)
    return pred

def CB_sim_fit_full_matrix(train_valid_df, df_sim, k, user_id_lst, busi_id_lst):
    rating_pred_matrix = np.zeros((len(user_id_lst), len(busi_id_lst)))
    for i,user_id in enumerate(user_id_lst):
        rating_pred_matrix[i,] = get_recommendation_cos_full(train_valid_df, user_id, df_sim, k, busi_id_lst)
    return(rating_pred_matrix)

def get_mse(pred, actual):
    # Ignore zero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [11]:
#### Generate model fit:
t0 = time.time()
pred_matrix = CB_sim_fit_full_matrix(train_df, df_bus_sim.loc[busi_id_lst, busi_id_lst],k1, user_id_lst, busi_id_lst)
t1 = time.time()
print("time elapsed: (seconds)")
print(np.round(t1 - t0,3))

print("MSE on test set:", get_mse(pred_matrix, valid_sparse_matrix))

time elapsed: (seconds)
1040.896
MSE on test set: 1.429582187631011


In [12]:
# np.save('pred_matrix.npy',     pred_matrix)
# print("MSE on test set:", get_mse(pred_matrix, valid_sparse_matrix))

In [13]:
b = get_mse(pred_matrix, valid_sparse_matrix)
b

1.429582187631011

In [14]:
pred_matrix

array([[3.8       , 3.93333333, 3.8       , ..., 3.73333333, 3.53333333,
        3.8       ],
       [3.66666667, 3.8       , 3.6       , ..., 3.86666667, 3.66666667,
        3.6       ],
       [3.85714286, 3.85714286, 3.85714286, ..., 3.85714286, 3.85714286,
        3.85714286],
       ...,
       [2.        , 2.        , 2.        , ..., 2.        , 2.        ,
        2.        ],
       [4.        , 4.        , 4.        , ..., 4.        , 4.        ,
        4.        ],
       [4.        , 4.        , 4.        , ..., 4.        , 4.        ,
        4.        ]])

In [15]:
pred_matrix.shape

(25420, 4312)

## 2. CB - similarity with NLP (CB_NLP)

In [16]:
# df_texts = pickle.load(open('../data/text_train_valid_df.pkl', "rb"))

df_texts = pickle.load(open('text_train_df.pkl', "rb"))


df_texts.head(3)



Unnamed: 0,business_id,text
0,-0G_6-KFGpCpxTUlVXCMYQ,taco place good get straightfrommexico varieti...
1,-1oygVebK81K8JEPI6H6Lw,awesom find good soup sammi bagel downtown sur...
2,-2VYztMXVorktljCdQNPmQ,amaz indian food great flavor alway come chick...


In [17]:
# Parameters 
n_components2 = 60 #n singular values for reviews' vector space
k2 = 45

In [18]:
n_components2 = 60 #n singular values for reviews' vector space
k2 = 45
tfidf_matrix = get_tfidf_matrix(df_texts)
cosine_sim = get_cos_sim_matrix(tfidf_matrix, n_components2)
df_tfidf_sim = pd.DataFrame(cosine_sim, index=df_texts['business_id'].values, columns=df_texts['business_id'].values)
df_tfidf_sim.head(1)

Unnamed: 0,-0G_6-KFGpCpxTUlVXCMYQ,-1oygVebK81K8JEPI6H6Lw,-2VYztMXVorktljCdQNPmQ,-3Mc8R5c23FxrxlJn3ivww,-3xX_IfttKjPJ792BOBJ-Q,-4x3pVUUsfWmKEilWKsOZQ,-8luB5pJ7d9UOoiF7wikkw,-AWclhh1_2VnqPylPgBU3g,-Dg2pJigtydnPUUW5XBamQ,-DsC3cvCBEy_FNJ7U-i9zg,...,zl-LqKy7snBF4LKmiVKmNA,zm9aNqkHHtp1f-VT3ZlOZw,zmpRwOqxaajeh2YvAKxLaQ,znlTHyGTSWyqZFYuS0KpmQ,zobY3ws9R2pgbcP0gOImcg,ztnYPn2w0R4lEJL266apzg,ztppjLmFE25wZOQdJKXAuA,zvGNZF827KyzLupKiG4Xtw,zwGvMQ7Nw0VwqLUYDuxRXQ,zz0l4dUf28wzPAaTdGqsSw
-0G_6-KFGpCpxTUlVXCMYQ,1.0,0.146432,0.254958,0.303581,0.372595,0.304384,0.257288,0.257924,0.424976,0.360424,...,0.174277,0.340718,0.361616,0.350934,0.32853,0.357429,0.250153,0.38829,0.511297,0.962684


In [19]:
#### Generate model fit:
t0 = time.time()
pred_matrix_nlp = CB_sim_fit_full_matrix(train_df, df_tfidf_sim.loc[busi_id_lst, busi_id_lst],k1, user_id_lst, busi_id_lst)
t1 = time.time()
print("time elapsed: (seconds)")
print(np.round(t1 - t0,3))

# print("MSE on test set:", get_mse(pred_matrix_nlp, valid_sparse_matrix))


time elapsed: (seconds)
963.432


In [20]:
a = get_mse(pred_matrix_nlp, valid_sparse_matrix)
a

1.435387685657315

In [21]:
pred_matrix_nlp

array([[3.8       , 3.66666667, 3.86666667, ..., 3.66666667, 3.86666667,
        3.8       ],
       [3.8       , 3.66666667, 3.86666667, ..., 3.66666667, 3.66666667,
        3.8       ],
       [3.85714286, 3.85714286, 3.85714286, ..., 3.85714286, 3.85714286,
        3.85714286],
       ...,
       [2.        , 2.        , 2.        , ..., 2.        , 2.        ,
        2.        ],
       [4.        , 4.        , 4.        , ..., 4.        , 4.        ,
        4.        ],
       [4.        , 4.        , 4.        , ..., 4.        , 4.        ,
        4.        ]])

In [22]:
np.save('pred_matrix_nlp.npy',     pred_matrix_nlp)
# print("MSE on test set:", get_mse(pred_matrix_nlp, valid_sparse_matrix))

In [23]:
pred_matrix_nlp.shape

(25420, 4312)

In [24]:
final_matrix = pred_matrix*(1/b) + pred_matrix_nlp*(1/a)
final_matrix

array([[5.3054877 , 5.30586492, 5.35193276, ..., 5.16596391, 5.16539807,
        5.3054877 ],
       [5.21222035, 5.21259758, 5.21203174, ..., 5.25923125, 5.11933023,
        5.16558668],
       [5.38526947, 5.38526947, 5.38526947, ..., 5.38526947, 5.38526947,
        5.38526947],
       ...,
       [2.79236195, 2.79236195, 2.79236195, ..., 2.79236195, 2.79236195,
        2.79236195],
       [5.58472389, 5.58472389, 5.58472389, ..., 5.58472389, 5.58472389,
        5.58472389],
       [5.58472389, 5.58472389, 5.58472389, ..., 5.58472389, 5.58472389,
        5.58472389]])

In [25]:
np.save('final_cb_matrix.npy', final_matrix)

In [26]:
import torch
recom_matrix = torch.tensor(final_matrix)
top_values, top_idx = torch.topk(recom_matrix, 20, 1)

In [27]:
top_idx
torch.save(top_idx, 'cb_recom.pt')

tensor([[3657, 1002,    5,  ..., 2699, 2712,  257],
        [ 635, 2465,   26,  ..., 3649,  362,  770],
        [  18,   10,    4,  ...,    2,    6,   14],
        ...,
        [  18,   10,    4,  ...,    2,    6,   14],
        [  18,   10,    4,  ...,    2,    6,   14],
        [  18,   10,    4,  ...,    2,    6,   14]])

In [28]:
for i, li in enumerate(top_idx):
    lis = li.tolist()
    print('User:{}, Top 20 recommendations: {}...'.format(i,lis[:5]))

User:0, Top 20 recommendations: [3657, 1002, 5, 503, 2339]...
User:1, Top 20 recommendations: [635, 2465, 26, 554, 923]...
User:2, Top 20 recommendations: [18, 10, 4, 16, 12]...
User:3, Top 20 recommendations: [18, 10, 4, 16, 12]...
User:4, Top 20 recommendations: [18, 10, 4, 16, 12]...
User:5, Top 20 recommendations: [18, 10, 4, 16, 12]...
User:6, Top 20 recommendations: [18, 10, 4, 16, 12]...
User:7, Top 20 recommendations: [18, 10, 4, 16, 12]...
User:8, Top 20 recommendations: [18, 10, 4, 16, 12]...
User:9, Top 20 recommendations: [93, 2382, 2761, 4052, 3491]...
User:10, Top 20 recommendations: [18, 10, 4, 16, 12]...
User:11, Top 20 recommendations: [18, 10, 4, 16, 12]...
User:12, Top 20 recommendations: [2195, 528, 1641, 520, 958]...
User:13, Top 20 recommendations: [18, 10, 4, 16, 12]...
User:14, Top 20 recommendations: [18, 10, 4, 16, 12]...
User:15, Top 20 recommendations: [18, 10, 4, 16, 12]...
User:16, Top 20 recommendations: [18, 10, 4, 16, 12]...
User:17, Top 20 recommendati

In [29]:
# top_idx_list = top_idx.tolist()
# top_idx_list.shape

## 4. LSI Model

In [30]:
k4 = 50
n_topics = 50
df_texts = pickle.load(open('text_train_valid_df.pkl', "rb"))

In [31]:
#### Generate model fit:
t0 = time.time()
df_pred_lsi = util.CB_LSI_predict(df_texts,train_valid_df, test_df, k = k4, n_topics = n_topics)
t1 = time.time()
# print("time elapsed: (seconds)")
# print(np.round(t1 - t0,3))


In [32]:
# print("MSE on test set:", mean_squared_error(df_pred_lsi.pred_lsi, df_pred_lsi.ratings ))

In [33]:
# df_pred_lsi