In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, RobustScaler, StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error 
import tensorflow.compat.v1 as tf
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank
from lightfm import cross_validation
import csv
import pickle
from scipy import sparse
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity


In [None]:
def get_data():
    return csv.DictReader(
            (x for x in open('Data/Cleaned_Data/book_user_explicit_rating_cleaned.csv','r'))
        )

In [None]:
dataset = Dataset()

In [None]:
dataset.fit(users=(x['User_ID'] for x in get_data()),
            items=(x['Unique_ISBN'] for x in get_data()),
            item_features=(x['Book_Author'] for x in get_data()),
            user_features=(x['Age_Range'] for x in get_data())
            )

In [None]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

In [None]:
(interactions, weights) = dataset.build_interactions(((x['User_ID'], x['Unique_ISBN'])
                                                      for x in get_data()))

In [None]:
print(repr(interactions))

In [None]:
item_features = dataset.build_item_features(((x['Unique_ISBN'], [x['Book_Author']])
                                              for x in get_data()))
print(repr(item_features))

In [None]:
user_features = dataset.build_user_features(((x['User_ID'], [x['Age_Range']])
                                              for x in get_data()))
print(repr(user_features))

In [None]:
train_interactions, test_interactions = cross_validation.random_train_test_split(interactions, random_state=np.random.RandomState(seed=11232))

train_weights, test_weights = cross_validation.random_train_test_split(weights, random_state=np.random.RandomState(seed=11232))

In [None]:
import itertools

def sample_hyperparameters():
    """
    Yield possible hyperparameter choices.
    """

    while True:
        yield {
            "no_components": np.random.randint(16, 64),
            "loss": np.random.choice(["warp"]),
            "learning_rate": np.random.exponential(0.05),
            "num_epochs": np.random.randint(5, 50),
            "random_state":np.random.RandomState(seed=11232)
        }


def random_search(train, test, num_samples=10):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it
    on the test set.

    Parameters
    ----------

    train: np.float32 coo_matrix of shape [n_users, n_items]
        Training data.
    test: np.float32 coo_matrix of shape [n_users, n_items]
        Test data.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.


    Returns
    -------

    generator of (auc_score, hyperparameter dict, fitted model)

    """

    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        num_epochs = hyperparams.pop("num_epochs")

        model_tune = LightFM(**hyperparams)
        model_tune.fit(interactions=train, epochs=num_epochs, item_features=item_features, user_features=user_features,sample_weight=train_weights)

        score = auc_score(model_tune, train, item_features=item_features, user_features=user_features).mean()
        auc_test = auc_score(model_tune, test, item_features=item_features, user_features=user_features).mean()

        hyperparams["num_epochs"] = num_epochs

        yield (score, auc_test, hyperparams, model_tune)

In [None]:
(score, auc_test, hyperparams, model_tune) = max(random_search(train_interactions, test_interactions), key=lambda x: x[0])

print("Best score {} at {}".format(score, hyperparams))
print("Best test {} at {}".format(auc_test, hyperparams))

In [None]:
best_model = LightFM(loss='warp',learning_rate=0.07855246734493881,no_components=30)

In [None]:
best_model.fit(
    interactions=train_interactions,
    item_features=item_features,
    user_features=user_features, sample_weight=train_weights,
    epochs=5, verbose=True)

In [None]:
auc_train = auc_score( 
        best_model, train_interactions, 
        item_features=item_features, 
        user_features=user_features).mean()
auc_test = auc_score( 
        best_model, test_interactions, 
        item_features=item_features, 
        user_features=user_features).mean()
print(auc_train)
print(auc_test)

In [None]:
score = auc_score( 
        best_model, interactions, 
        item_features=item_features, 
        user_features=user_features).mean()
print(score)

In [None]:
train_precision = precision_at_k(best_model, train_interactions, k=10,item_features=item_features,user_features=user_features).mean()
test_precision = precision_at_k(best_model, test_interactions, k=10,item_features=item_features,user_features=user_features).mean()

print(train_precision)
print(test_precision)

# Save and Load model

In [None]:
with open('Hybrid.pickle', 'wb') as fle:
    pickle.dump(best_model, fle, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
with open('Hybrid.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    model_from_pickle = pickle.load(f)

In [None]:
score_from_pkl = auc_score( 
        model_from_pickle, interactions, 
        item_features=item_features, 
        user_features=user_features).mean()
print(score_from_pkl)

# Apply

In [3]:
df = pd.read_csv('Data/Cleaned_Data/book_user_explicit_rating_cleaned.csv',encoding='UTF-8')
df = df.drop(columns=['index'])
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.drop(columns=['Image_URL','ISBN'])
df = df.sample(n=8000, replace=False, random_state=1)

In [4]:
# Function to create an interaction matrix dataframe from transactional type interactions
interactions_mtx = df.groupby(['User_ID', 'Unique_ISBN'])['Book_Rating'].sum().unstack().reset_index().fillna(0).set_index('User_ID')
    
interactions_mtx.head()
interactions_mtx.shape

(4613, 4858)

In [5]:
_books = df.drop(columns=['User_ID','Age','Age_Range','Country'])

In [6]:
# Function to create a user dictionary based on their index and number in interaction dataset
user_id = list(interactions_mtx.index)
user_dict = {}
counter = 0 
for i in user_id:
    user_dict[i] = counter
    counter += 1

In [7]:
# Function to create an item dictionary based on their item_id and item name
_books = _books.reset_index()
item_dict ={}
for i in range(_books.shape[0]):
    item_dict[(_books.loc[i,'Unique_ISBN'])] = _books.loc[i,'Book_Title']

In [31]:
# Function to run matrix-factorization algorithm
x = sparse.csr_matrix(interactions_mtx.values)
model_new = LightFM(loss='warp',learning_rate=0.07855246734493881,no_components=30)
model_new.fit(x,epochs=100)

<lightfm.lightfm.LightFM at 0x7fabbbaf6b80>

In [None]:
# print('Train precision at k={}:\t{:.4f}'.format(10, precision_at_k(model_new, x, k=10).mean()))

In [8]:
def sample_recommendation_user(model, interactions, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 10, show = True):
    '''
    Function to produce user recommendations
    Required Input - 
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - user_id = user ID for which we need to generate recommendation
        - user_dict = Dictionary type input containing interaction_index as key and user_id as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - threshold = value above which the rating is favorable in new interaction matrix
        - nrec_items = Number of output recommendation needed
    Expected Output - 
        - Prints list of items the given user has already bought
        - Prints list of N recommended items  which user hopefully will be interested in
    '''
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x,np.arange(n_items)))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    
    known_items = list(pd.Series(interactions.loc[user_id,:] \
                                 [interactions.loc[user_id,:] > threshold].index) \
                                .sort_values(ascending=False))
    #print(known_items)
    
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    if show == True:
        print("Known Likes:")
        counter = 1
        for i in known_items:
            #print(i)
            print(str(counter) + '- ' + i)
            counter+=1

        print("\n Recommended Items:")
        counter = 1
        for i in scores:
            #print(i)
            print(str(counter) + '- ' + i)
            counter+=1
    return return_score_list

In [27]:
rec_list = sample_recommendation_user(model = model_from_pickle, 
                                      interactions = interactions_mtx, 
                                      user_id = 1733, 
                                      user_dict = user_dict,
                                      item_dict = item_dict, 
                                      threshold = 4,
                                      nrec_items = 10,
                                      show = True)

Known Likes:
1- SUDDENLY
2- WHO GETS TO MARRY MAX? (HARLEQUIN AMERICAN ROMANCE, NO. 843)
3- FAKING IT

 Recommended Items:
1- MARY, BLOODY MARY: A YOUNG ROYALS BOOK
2- SILENCE OF THE LAMBS
3- THE WITCHING HOUR (LIVES OF THE MAYFAIR WITCHES)
4- AND THEN THERE WERE NONE : A NOVEL
5- 253: THE PRINT REMIX
6- WE WERE SOLDIERS ONCE... AND YOUNG: IA DRANG--THE BATTLE THAT CHANGED THE WAR IN VIETNAM
7- SAILING ALONE AROUND THE ROOM: NEW AND SELECTED POEMS
8- PYGMALION : A ROMANCE IN FIVE ACTS
9- THE FATAL SHORE
10- THE HOT ZONE


In [10]:
def sample_recommendation_item(model,interactions,item_id,user_dict,item_dict,number_of_user):
    '''
    Funnction to produce a list of top N interested users for a given item
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - item_id = item ID for which we need to generate recommended users
        - user_dict =  Dictionary type input containing interaction_index as key and user_id as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - number_of_user = Number of users needed as an output
    Expected Output -
        - user_list = List of recommended users 
    '''
    n_users, n_items = interactions.shape
    x = np.array(interactions.columns)
    scores = pd.Series(model.predict(np.arange(n_users), np.repeat(x.searchsorted(item_id),n_users)))
    user_list = list(interactions.index[scores.sort_values(ascending=False).head(number_of_user).index])
    return user_list

In [28]:
sample_recommendation_item(model = model_from_pickle,
                           interactions = interactions_mtx,
                           item_id = '0385504209',
                           user_dict = user_dict,
                           item_dict = item_dict,
                           number_of_user = 10)

[38176, 214903, 24960, 179444, 132930, 130215, 182154, 33818, 16876, 123433]

In [32]:
def create_item_emdedding_distance_matrix(model,interactions):
    '''
    Function to create item-item distance embedding matrix
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
    Expected Output -
        - item_emdedding_distance_matrix = Pandas dataframe containing cosine distance matrix b/w items
    '''
    df_item_norm_sparse = sparse.csr_matrix(model.item_embeddings)
    similarities = cosine_similarity(df_item_norm_sparse)
    item_emdedding_distance_matrix = pd.DataFrame(similarities)
    item_emdedding_distance_matrix.columns = interactions.columns
    item_emdedding_distance_matrix.index = interactions.columns
    return item_emdedding_distance_matrix

## Creating item-item distance matrix
item_item_dist = create_item_emdedding_distance_matrix(model = model_new,
                                                       interactions = interactions_mtx)
## Checking item embedding distance matrix
#item_item_dist.head()

In [35]:
def item_item_recommendation(item_emdedding_distance_matrix, item_id, 
                             item_dict, n_items = 10, show = True):
    '''
    Function to create item-item recommendation
    Required Input - 
        - item_emdedding_distance_matrix = Pandas dataframe containing cosine distance matrix b/w items
        - item_id  = item ID for which we need to generate recommended items
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - n_items = Number of items needed as an output
    Expected Output -
        - recommended_items = List of recommended items
    '''
    recommended_items = list(pd.Series(item_emdedding_distance_matrix.loc[item_id,:]. \
                                  sort_values(ascending = False).head(n_items+1). \
                                  index[1:n_items+1]))
    if show == True:
        print("Item of interest :{0}".format(item_dict[item_id]))
        print("Item similar to the above item:")
        counter = 1
        for i in recommended_items:
            print(str(counter) + '- ' +  item_dict[i])
            counter+=1
    return recommended_items

## Calling 5 recommended items for item id 
rec_list = item_item_recommendation(item_emdedding_distance_matrix = item_item_dist,
                                    item_id = '0385504209',
                                    item_dict = item_dict,
                                    n_items = 5)

Item of interest :THE DA VINCI CODE
Item similar to the above item:
1- HUSH MONEY (SPENSER MYSTERIES)
2- BALL FOUR
3- THE SEVENTH COMMANDMENT
4- THE GREAT SANTINI
5- THE TWENTIETH WIFE: A NOVEL
