In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, RobustScaler, StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error 
import tensorflow.compat.v1 as tf
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank
from lightfm import cross_validation
import csv
import pickle
from scipy import sparse
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity




# About the LightFM library

[LightFM](https://making.lyst.com/lightfm/docs/index.html) is a Python implementation of a number of popular recommendation algorithms for both implicit and explicit feedback. It also makes it possible to incorporate both item and user metadata into the traditional matrix factorization algorithms. It represents each user and item as the sum of the latent representations of their features, thus allowing recommendations to generalise to new items (via item features) and to new users (via user features).

# Data retrieving

Since LightFM builds its matrices through its own `Dataset` class, we need to retrieve the data row by row, in the form of a `csv.DictReader`.

In [2]:
def get_data(path='Data/Cleaned_Data/book_user_explicit_rating_cleaned.csv'):
    """
    Return a fresh ``csv.DictReader`` over the cleaned ratings CSV.

    Every call re-reads the file, so each caller gets an independent,
    single-pass reader starting at the first data row.

    Parameters
    ----------
    path: str, optional
        Location of the CSV file. Defaults to the cleaned explicit-rating file
        used throughout this notebook.

    Returns
    -------
    csv.DictReader yielding one dict per CSV row, keyed by the header names.
    """
    # The lines are read eagerly inside the `with` block so the file handle is
    # closed before returning (the previous version left it open until garbage
    # collection). utf-8 matches the encoding used for pd.read_csv on the same
    # file, and newline='' is what the csv module recommends for its input.
    with open(path, 'r', encoding='utf-8', newline='') as csv_file:
        lines = csv_file.readlines()
    return csv.DictReader(lines)

In [3]:
# Empty LightFM Dataset object; it is fitted and used to build the matrices in the cells below.
dataset = Dataset()

# Data preparation

We will prepare our interactions matrix with LightFM library

Thanks to the flexibility of LightFM, the model automatically becomes a hybrid recommender if we include item or user features. On the other hand, if we do not specify item or user features, it falls back to a basic collaborative-filtering matrix factorization algorithm.

In this project, we will include the item feature `Book_Author` for our model.

In [4]:
# Register every user id, item id and author name (as an item feature) with the dataset.
# Each argument is a generator over its own get_data() reader, so the CSV is read three times.
dataset.fit(users=(x['User_ID'] for x in get_data()),
            items=(x['Unique_ISBN'] for x in get_data()),
            item_features=(x['Book_Author'] for x in get_data())
            )

## Build interaction matrices

In [5]:
# Report the (users, items) shape the fitted dataset will use for the interaction matrix.
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 17511, num_items 13607.


In [6]:
# Build the interaction matrix and the matching weight matrix from (user, item) pairs.
# NOTE(review): no rating is included in the tuples, so `weights` presumably ends up as 1
# for every pair (see LightFM's build_interactions docs) — confirm that ignoring the
# rating column of this "explicit rating" file is intended.
(interactions, weights) = dataset.build_interactions(((x['User_ID'], x['Unique_ISBN'])
                                                      for x in get_data()))

In [7]:
# Show the summary repr of the interaction matrix (shape, dtype, stored elements).
print(repr(interactions))

<17511x13607 sparse matrix of type '<class 'numpy.int32'>'
	with 151324 stored elements in COOrdinate format>


In [8]:
# Build the item-feature matrix: each row of the CSV contributes (item id, [author]).
item_features = dataset.build_item_features(((x['Unique_ISBN'], [x['Book_Author']])
                                              for x in get_data()))
# Summary repr only; the matrix itself is too large to print.
print(repr(item_features))

<13607x18702 sparse matrix of type '<class 'numpy.float32'>'
	with 27407 stored elements in Compressed Sparse Row format>


## Split train and test interactions

In [9]:
# Split the interaction matrix first, then the weight matrix, each with its own
# RandomState created from the same seed.
# NOTE(review): this presumably yields the same partition for both matrices because
# they were built from the same rows — confirm.
(train_interactions, test_interactions), (train_weights, test_weights) = (
    cross_validation.random_train_test_split(
        matrix, random_state=np.random.RandomState(seed=11232)
    )
    for matrix in (interactions, weights)
)

# Hyperparameters tuning

We will perform a random search to tune the hyperparameters.

In [69]:
import itertools

def sample_hyperparameters(rng=None):
    """
    Yield an endless stream of randomly sampled hyperparameter choices.

    Parameters
    ----------

    rng: np.random.RandomState, optional
        Source of randomness for the sampled values. When omitted, the global
        ``np.random`` state is used (the previous behaviour); pass a seeded
        ``RandomState`` to make the search reproducible.

    Yields
    ------

    dict with the keys ``no_components``, ``loss``, ``learning_rate``,
    ``num_epochs`` and ``random_state`` (a freshly seeded RandomState that is
    the same for every sample, so models differ only by their hyperparameters).
    """

    # The np.random module exposes the same randint/choice/exponential functions
    # as a RandomState instance, so either can serve as the sampler.
    sampler = np.random if rng is None else rng

    while True:
        yield {
            # Cast to built-in types so the dict prints cleanly and does not
            # carry numpy scalar types (np.random.choice returns np.str_).
            "no_components": int(sampler.randint(16, 64)),
            "loss": str(sampler.choice(["warp", "bpr"])),
            "learning_rate": float(sampler.exponential(0.05)),
            "num_epochs": int(sampler.randint(5, 50)),
            "random_state": np.random.RandomState(seed=11232),
        }


def random_search(train, test, num_samples=10):
    """
    Run a random hyperparameter search over LightFM models.

    For every sampled configuration a model is fitted on ``train`` and its
    mean AUC is computed on both ``train`` and ``test``. The item features and
    the training sample weights are not parameters: they are read from the
    notebook-level names ``item_features`` and ``train_weights``.

    Parameters
    ----------

    train: sparse matrix of shape [n_users, n_items]
        Training interactions.
    test: sparse matrix of shape [n_users, n_items]
        Test interactions.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.


    Yields
    ------

    tuple of (train AUC, test AUC, hyperparameter dict, fitted model)

    """

    candidates = itertools.islice(sample_hyperparameters(), num_samples)

    for params in candidates:
        # The epoch count is an argument of fit(), not of the LightFM constructor,
        # so it is taken out of the dict before the model is created.
        epochs = params.pop("num_epochs")

        model_tune = LightFM(**params)
        model_tune.fit(
            interactions=train,
            epochs=epochs,
            item_features=item_features,
            sample_weight=train_weights,
        )

        train_auc = auc_score(model_tune, train, item_features=item_features).mean()
        test_auc = auc_score(model_tune, test, item_features=item_features).mean()

        # Put the epoch count back so the yielded dict describes the full configuration.
        params["num_epochs"] = epochs

        yield (train_auc, test_auc, params, model_tune)

In [12]:
# Keep the configuration with the highest *test* AUC (tuple position 1).
# Ranking by the training AUC (position 0) rewards whichever model memorised the
# training data best, i.e. it selects the most over-fitted candidate.
# NOTE(review): there is no separate validation split, so the test AUC reported for
# the winner is optimistic.
(score, auc_test, hyperparams, model_tune) = max(
    random_search(train_interactions, test_interactions),
    key=lambda result: result[1],
)

# `score` is the training AUC of the selected configuration, `auc_test` its test AUC.
print("Best score {} at {}".format(score, hyperparams))
print("Best test {} at {}".format(auc_test, hyperparams))

Best score 0.9985889792442322 at {'no_components': 55, 'loss': 'warp', 'learning_rate': 0.09824398527330847, 'random_state': RandomState(MT19937) at 0x7FAB29861440, 'num_epochs': 35}
Best test 0.7623966336250305 at {'no_components': 55, 'loss': 'warp', 'learning_rate': 0.09824398527330847, 'random_state': RandomState(MT19937) at 0x7FAB29861440, 'num_epochs': 35}


> The gap between the train and test scores above shows that the model is prone to overfitting. Therefore, we will use fewer components, a lower learning rate and fewer epochs to reduce the difference between the AUC scores of the train and test sets.

In [31]:
# Final hybrid model with hand-picked, more conservative settings than the random
# search winner. The model is seeded with the same seed used elsewhere in this
# notebook so that re-running the cell reproduces the same fit.
best_model = LightFM(
    loss='warp',
    learning_rate=0.01824398527330847,
    no_components=45,
    random_state=np.random.RandomState(seed=11232),
)
# fit() is the last expression of the cell, so the fitted model's repr is displayed.
best_model.fit(
    interactions=train_interactions,
    item_features=item_features, sample_weight=train_weights,
    epochs=10, verbose=True)

Epoch: 100%|██████████| 10/10 [00:03<00:00,  2.72it/s]


<lightfm.lightfm.LightFM at 0x7fab2a7c7910>

# Evaluation (AUC score, Precision, and Recall)

In [32]:
# AUC of the final model on each split, averaged with .mean().
auc_train = auc_score(best_model, train_interactions, item_features=item_features).mean()
auc_test = auc_score(best_model, test_interactions, item_features=item_features).mean()
print(auc_train, auc_test, sep='\n')

0.86399215
0.77531123


The difference is 0.09 which is acceptable

In [33]:
# precision@10 of the final model on each split, averaged with .mean().
train_precision = precision_at_k(
    best_model, train_interactions, item_features=item_features, k=10
).mean()
test_precision = precision_at_k(
    best_model, test_interactions, item_features=item_features, k=10
).mean()

print(train_precision, test_precision, sep='\n')

0.038148835
0.009532729


In [65]:
# recall@10 of the final model on each split, averaged with .mean().
train_recall = recall_at_k(
    best_model, train_interactions, item_features=item_features, k=10
).mean()
test_recall = recall_at_k(
    best_model, test_interactions, item_features=item_features, k=10
).mean()

print(train_recall, test_recall, sep='\n')

0.0820300077883452
0.03666967454034619


# Save and Load model

In [34]:
# Serialise the fitted model to disk using the newest pickle protocol available.
with open('Hybrid_new2.pickle', 'wb') as fle:
    pickle.dump(best_model, fle, protocol=pickle.HIGHEST_PROTOCOL)

In [35]:
# Reload the model written by the previous cell.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created
# yourself or otherwise trust.
with open('Hybrid_new2.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    model_from_pickle = pickle.load(f)

In [36]:
# Sanity check of the reloaded model: AUC on the full (unsplit) interaction matrix,
# averaged with .mean().
score_from_pkl = auc_score(model_from_pickle, interactions, item_features=item_features).mean()
print(score_from_pkl)

0.84072757


# Apply

We will use the `recsys` predefined functions from the Recommender System cookbook to get some recommendations

In [37]:
# Load the cleaned ratings and drop, in order: the saved 'index' column, any
# 'Unnamed...' columns, and the image-URL / raw ISBN columns.
df = (
    pd.read_csv('Data/Cleaned_Data/book_user_explicit_rating_cleaned.csv', encoding='UTF-8')
    .drop(columns=['index'])
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .drop(columns=['Image_URL', 'ISBN'])
)

In [39]:
# Pivot the transactional ratings into a dense user x item frame: ratings are summed
# per (User_ID, Unique_ISBN) pair and pairs without a rating are filled with 0.
interactions_mtx = (
    df.groupby(['User_ID', 'Unique_ISBN'])['Book_Rating']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('User_ID')
)
interactions_mtx.head()

Unique_ISBN,0002005018,0002251760,0002550563,0003300277,000617616X,0006471641,0006480608,0006492347,0006551971,0006742939,...,9580464162,958704049X,9681500555,9681500830,9681500954,9684068573,9722105248,9726101794,9871138148,B00009ANY9
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# (n_users, n_items) of the pivoted rating matrix.
interactions_mtx.shape

(17511, 13607)

In [41]:
# Keep only the book-related columns by dropping the user-related ones.
_books = df.drop(columns=['User_ID','Age','Age_Range','Country'])

In [42]:
# Map every User_ID to its row position in interactions_mtx.
user_id = list(interactions_mtx.index)
user_dict = {uid: position for position, uid in enumerate(user_id)}

In [43]:
# Build the Unique_ISBN -> Book_Title lookup used when printing recommendations.
# reset_index(drop=True) keeps this cell re-runnable: without drop=True every re-run
# would insert one more index column into _books and eventually raise.
_books = _books.reset_index(drop=True)
# zip walks both columns once; if an ISBN occurs on several rows the last title wins,
# exactly as with row-by-row assignment.
item_dict = dict(zip(_books['Unique_ISBN'], _books['Book_Title']))

In [46]:
def sample_recommendation_user(model, interactions, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 10, show = True,
                               item_features = None):
    '''
    Function to produce user recommendations
    Required Input - 
        - model = Trained model exposing predict(user_ids, item_ids, ...)
        - interactions = DataFrame with user IDs as index and item IDs as columns;
                         column positions are used as the model's item indices
        - user_id = user ID for which we need to generate recommendation
        - user_dict = Dictionary mapping user_id (key) to the user's index in the model (value)
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - threshold = value above which the rating is favorable in new interaction matrix
        - nrec_items = Number of output recommendation needed
        - show = When truthy, print the known likes and the recommendations
        - item_features = Optional item feature matrix forwarded to model.predict;
                          leave as None to call predict without features (previous behaviour)
    Expected Output - 
        - Prints list of items the given user has already rated above the threshold
        - Prints list of N recommended items which user hopefully will be interested in
        - Returns the list of recommended item IDs
    '''
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]

    # Only forward item_features when supplied, so models whose predict() does not
    # accept that keyword keep working.
    predict_kwargs = {} if item_features is None else {'item_features': item_features}
    raw_scores = model.predict(user_x, np.arange(n_items), **predict_kwargs)

    # Item IDs ordered from highest to lowest predicted score.
    ranked_items = list(
        pd.Series(raw_scores, index=interactions.columns)
        .sort_values(ascending=False)
        .index
    )

    # Items the user already rated above the threshold, in descending ID order.
    user_row = interactions.loc[user_id, :]
    known_items = sorted(user_row[user_row > threshold].index, reverse=True)

    # Set membership keeps the filter linear in the number of items.
    known_lookup = set(known_items)
    return_score_list = [item for item in ranked_items
                         if item not in known_lookup][:nrec_items]

    if show:
        print("Known Likes:")
        for counter, item in enumerate(known_items, start=1):
            print(str(counter) + '- ' + item_dict[item])

        print("\n Recommended Items:")
        for counter, item in enumerate(return_score_list, start=1):
            print(str(counter) + '- ' + item_dict[item])
    return return_score_list

In [47]:
# Recommend 10 books for user 900, treating ratings above 4 as known likes.
# NOTE(review): model_from_pickle was fitted on matrices built through the LightFM
# `dataset` object (with item_features), whereas user_dict and interactions_mtx take
# their positions from the pandas pivot, and the function's predict call is made
# without item features here. Confirm that both index orderings agree and whether
# item features should be supplied.
rec_list = sample_recommendation_user(model = model_from_pickle, 
                                      interactions = interactions_mtx, 
                                      user_id = 900, 
                                      user_dict = user_dict,
                                      item_dict = item_dict, 
                                      threshold = 4,
                                      nrec_items = 10,
                                      show = True)

Known Likes:
1- WEST OF DODGE
2- VICE
3- MURDER AT THE KENNEDY CENTER (CAPITAL CRIME MYSTERIES)
4- DIVINE SECRETS OF THE YA-YA SISTERHOOD: A NOVEL

 Recommended Items:
1- GIRLFRIEND IN A COMA
2- A THIEF OF TIME: A NOVEL (HARPER NOVEL OF SUSPENSE)
3- TRIXIE BELDEN AND THE GATEHOUSE MYSTERY (GATEHOUSE MYSTERY)
4- PATRON SAINT OF LIARS : A NOVEL
5- ICE STATION
6- LINDA GOODMAN'S LOVE SIGNS : A NEW APPROACH TO THE HUMAN HEART
7- VITTORIO THE VAMPIRE: NEW TALES OF THE VAMPIRES
8- MONSTROUS REGIMENT (PRATCHETT, TERRY)
9- NO ONE WRITES TO THE COLONEL
10- MEN, WOMEN AND RELATIONSHIPS


In [48]:
def sample_recommendation_item(model,interactions,item_id,user_dict,item_dict,number_of_user,
                               item_features=None):
    '''
    Function to produce a list of top N interested users for a given item
    Required Input -
        - model = Trained model exposing predict(user_ids, item_ids, ...)
        - interactions = DataFrame with user IDs as index and (unique) item IDs as columns;
                         row/column positions are used as the model's user/item indices
        - item_id = item ID for which we need to generate recommended users
        - user_dict = Dictionary mapping user_id to user index (not used by this function)
        - item_dict = Dictionary mapping item_id to item_name (not used by this function)
        - number_of_user = Number of users needed as an output
        - item_features = Optional item feature matrix forwarded to model.predict;
                          leave as None to call predict without features (previous behaviour)
    Expected Output -
        - user_list = List of recommended users 
    Raises -
        - KeyError if item_id is not one of the columns of interactions
    '''
    n_users, n_items = interactions.shape

    # get_loc returns the exact column position and raises KeyError for an unknown
    # item. searchsorted, used previously, silently returned an insertion point for
    # unknown items and a wrong position whenever the columns were not sorted.
    item_x = interactions.columns.get_loc(item_id)

    # Only forward item_features when supplied, so models whose predict() does not
    # accept that keyword keep working.
    predict_kwargs = {} if item_features is None else {'item_features': item_features}
    scores = pd.Series(model.predict(np.arange(n_users),
                                     np.repeat(item_x, n_users),
                                     **predict_kwargs))

    # Row positions of the highest-scoring users, translated back to user IDs.
    top_positions = scores.sort_values(ascending=False).head(number_of_user).index
    user_list = list(interactions.index[top_positions])
    return user_list

In [49]:
# List the 10 users with the highest predicted score for item '0385504209'.
# NOTE(review): model_from_pickle was fitted on matrices built through the LightFM
# `dataset` object, whereas the user/item positions used here come from
# interactions_mtx — confirm that both index orderings agree.
sample_recommendation_item(model = model_from_pickle,
                           interactions = interactions_mtx,
                           item_id = '0385504209',
                           user_dict = user_dict,
                           item_dict = item_dict,
                           number_of_user = 10)

[128601, 18095, 136117, 80954, 36441, 69684, 134630, 47929, 167562, 132572]

# Basic Factorization Machine for basic CF to get item-item recommendations

In [44]:
# Fit a plain collaborative-filtering LightFM model (no item or user features)
# directly on the pivoted rating matrix, so its item indices follow the column
# order of interactions_mtx.
x = sparse.csr_matrix(interactions_mtx.values)
# Seeded with the same seed used elsewhere in this notebook so that re-running the
# cell reproduces the same embeddings.
model_new = LightFM(
    loss='warp',
    learning_rate=0.01824398527330847,
    no_components=45,
    random_state=np.random.RandomState(seed=11232),
)
# fit() is the last expression of the cell, so the fitted model's repr is displayed.
model_new.fit(x,epochs=100)

<lightfm.lightfm.LightFM at 0x7fab1a6d1a60>

In [63]:
def create_item_emdedding_distance_matrix(model,interactions):
    '''
    Function to build the item-item matrix from a model's item embeddings
    Required Input -
        - model = Trained model exposing an item_embeddings array
        - interactions = DataFrame whose columns are the item IDs, in the same
                         order as the rows of model.item_embeddings
    Expected Output -
        - Pandas dataframe indexed and labelled by item ID. Despite the name, the
          values are cosine similarities (as returned by cosine_similarity).
    '''
    embeddings_sparse = sparse.csr_matrix(model.item_embeddings)
    item_labels = interactions.columns
    return pd.DataFrame(cosine_similarity(embeddings_sparse),
                        index=item_labels,
                        columns=item_labels)

## Creating the item-item matrix from the embeddings of the plain CF model (model_new),
## whose item order follows the columns of interactions_mtx
item_item_dist = create_item_emdedding_distance_matrix(model = model_new,
                                                       interactions = interactions_mtx)
## Optional check of the item embedding matrix (left disabled)
#item_item_dist.head()

In [64]:
def item_item_recommendation(item_emdedding_distance_matrix, item_id, 
                             item_dict, n_items = 10, show = True):
    '''
    Function to create item-item recommendation
    Required Input - 
        - item_emdedding_distance_matrix = Pandas dataframe indexed and labelled by item ID
                                           where a higher value means a more similar item
        - item_id  = item ID for which we need to generate recommended items
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - n_items = Number of items needed as an output
        - show = When truthy, print the item of interest and the recommendations
    Expected Output -
        - recommended_items = List of recommended item IDs, most similar first
    '''
    similarity_row = item_emdedding_distance_matrix.loc[item_id, :]
    # Remove the query item by label instead of assuming it sorts first: with ties or
    # floating-point rounding another item can outrank it, in which case skipping
    # position 0 would drop a real neighbour and return the query item itself.
    recommended_items = list(
        similarity_row.drop(labels=item_id)
        .sort_values(ascending=False)
        .head(n_items)
        .index
    )
    if show:
        print("Item of interest :{0}".format(item_dict[item_id]))
        print("Item similar to the above item:")
        for counter, item in enumerate(recommended_items, start=1):
            print(str(counter) + '- ' +  item_dict[item])
    return recommended_items

## Print the 10 items most similar to item id '0385504209'
## (this rebinds rec_list, which previously held the user recommendations)
rec_list = item_item_recommendation(item_emdedding_distance_matrix = item_item_dist,
                                    item_id = '0385504209',
                                    item_dict = item_dict,
                                    n_items = 10)

Item of interest :THE DA VINCI CODE
Item similar to the above item:
1- ANGELS &AMP; DEMONS
2- THE DOGS OF BABEL (TODAY SHOW BOOK CLUB #12)
3- THE LOVELY BONES: A NOVEL
4- BLEACHERS
5- DIGITAL FORTRESS : A THRILLER
6- THE SECRET LIFE OF BEES
7- I DO (BUT I DON'T)
8- THE HOURS: A NOVEL
9- THE FIVE PEOPLE YOU MEET IN HEAVEN
10- NICE
