In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, RobustScaler, StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error 
import tensorflow.compat.v1 as tf
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank
from lightfm import cross_validation
import csv
import pickle



In [2]:
def get_data(path='Data/Cleaned_Data/book_user_explicit_rating_cleaned.csv'):
    """
    Stream the rows of the cleaned explicit-rating CSV, one dict per row.

    Parameters
    ----------

    path: str, optional
        CSV file to read. Defaults to the cleaned book/user rating file
        that the rest of this notebook reads.

    Yields
    ------

    dict
        One mapping per data row, keyed by the column names found in the
        file's header line.

    Notes
    -----

    This is a generator function: the file is opened on the first
    iteration, not when ``get_data()`` is called.
    """
    # newline='' is what the csv module asks for, so that line endings inside
    # quoted fields reach the parser untouched.
    with open(path, 'r', newline='') as handle:
        # The ``with`` block closes the file once the rows are exhausted or
        # the generator is discarded; previously the handle was never closed.
        yield from csv.DictReader(handle)

In [3]:
# Empty LightFM Dataset; it is populated by the dataset.fit(...) call that follows.
dataset = Dataset()

In [4]:
# Register the user IDs, item IDs and both kinds of feature labels found in
# the CSV. get_data() is invoked once per argument, so the file is read four
# times here.
dataset.fit(
    users=(row['User_ID'] for row in get_data()),
    items=(row['Unique_ISBN'] for row in get_data()),
    item_features=(row['Book_Author'] for row in get_data()),
    user_features=(row['Age_Range'] for row in get_data()),
)

In [5]:
# Report the (users, items) dimensions the dataset will use for the
# interaction matrix.
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 17511, num_items 13607.


In [6]:
# Build the user x item interaction matrix and its companion weight matrix.
# Only (user, item) pairs are supplied; no rating value is passed along.
# NOTE(review): the source file is named "explicit_rating" -- confirm whether
# the rating should be given as a third tuple element so `weights` carries it.
interactions, weights = dataset.build_interactions(
    (row['User_ID'], row['Unique_ISBN']) for row in get_data()
)

In [7]:
# Show the sparse matrix summary (shape, dtype, number of stored entries).
print(repr(interactions))

<17511x13607 sparse matrix of type '<class 'numpy.int32'>'
	with 151324 stored elements in COOrdinate format>


In [8]:
# One (item ID, [author]) pair per CSV row; the author is the only item
# feature passed in explicitly.
item_features = dataset.build_item_features(
    (row['Unique_ISBN'], [row['Book_Author']]) for row in get_data()
)
print(repr(item_features))

<13607x18702 sparse matrix of type '<class 'numpy.float32'>'
	with 27407 stored elements in Compressed Sparse Row format>


In [9]:
# One (user ID, [age range]) pair per CSV row; the age range is the only
# user feature passed in explicitly.
user_features = dataset.build_user_features(
    (row['User_ID'], [row['Age_Range']]) for row in get_data()
)
print(repr(user_features))

<17511x17516 sparse matrix of type '<class 'numpy.float32'>'
	with 35022 stored elements in Compressed Sparse Row format>


In [10]:
# Split interactions and weights with two generators seeded identically --
# presumably so both matrices are partitioned the same way and train_weights
# lines up with train_interactions.
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions,
    random_state=np.random.RandomState(11232),
)

train_weights, test_weights = cross_validation.random_train_test_split(
    weights,
    random_state=np.random.RandomState(11232),
)

In [50]:
import itertools

def sample_hyperparameters(rng=None):
    """
    Yield an endless stream of randomly drawn hyperparameter choices.

    Parameters
    ----------

    rng: np.random.RandomState, optional
        Source of randomness for the draws. Pass a seeded ``RandomState``
        to make the search reproducible. When omitted, NumPy's global
        random state is used, exactly as before.

    Yields
    ------

    dict with keys ``no_components``, ``loss``, ``learning_rate``,
    ``num_epochs`` and ``random_state``. The first four values are plain
    Python ``int``/``str``/``float`` objects; ``random_state`` is a freshly
    seeded ``np.random.RandomState`` for the model itself.
    """

    # The np.random module exposes the same randint/choice/exponential
    # functions as a RandomState instance, so either can serve as the source.
    source = np.random if rng is None else rng

    while True:
        yield {
            "no_components": int(source.randint(16, 64)),
            # choice() returns a NumPy string scalar; keep a built-in str.
            "loss": str(source.choice(["warp"])),
            "learning_rate": float(source.exponential(0.05)),
            "num_epochs": int(source.randint(5, 50)),
            # New generator per candidate so every model starts from the
            # same seed regardless of how many candidates came before it.
            "random_state": np.random.RandomState(seed=11232),
        }


def random_search(train, test, num_samples=10):
    """
    Fit and score one LightFM model per randomly sampled hyperparameter set.

    Every model is fitted on ``train`` using the notebook-level
    ``item_features``, ``user_features`` and ``train_weights``, then scored
    with mean AUC on both ``train`` and ``test``.

    Parameters
    ----------

    train: sparse matrix of shape [n_users, n_items]
        Training interactions.
    test: sparse matrix of shape [n_users, n_items]
        Held-out interactions.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.


    Yields
    ------

    tuple of (train AUC, test AUC, hyperparameter dict, fitted model)

    """

    candidates = itertools.islice(sample_hyperparameters(), num_samples)

    for candidate in candidates:
        # The epoch count is an argument of fit(), not of the constructor,
        # so it is kept apart from the constructor keyword arguments.
        epochs = candidate["num_epochs"]
        model_kwargs = {
            name: value for name, value in candidate.items() if name != "num_epochs"
        }

        fitted = LightFM(**model_kwargs)
        fitted.fit(
            interactions=train,
            epochs=epochs,
            item_features=item_features,
            user_features=user_features,
            sample_weight=train_weights,
        )

        train_auc = auc_score(
            fitted, train, item_features=item_features, user_features=user_features
        ).mean()
        test_auc = auc_score(
            fitted, test, item_features=item_features, user_features=user_features
        ).mean()

        # Report the full configuration, with the epoch count as the last key.
        reported = dict(model_kwargs, num_epochs=epochs)

        yield (train_auc, test_auc, reported, fitted)

In [51]:
# Pick the candidate with the highest held-out AUC (tuple index 1). Index 0
# is the AUC on the training interactions; ranking by it favours the model
# that memorises the training data best rather than the one that generalises.
(score, auc_test, hyperparams, model_tune) = max(
    random_search(train_interactions, test_interactions),
    key=lambda result: result[1],
)

# `score` is the training AUC and `auc_test` the held-out AUC of that model.
print("Best score {} at {}".format(score, hyperparams))
print("Best test {} at {}".format(auc_test, hyperparams))

Best score 0.8279528617858887 at {'no_components': 33, 'loss': 'warp', 'learning_rate': 0.07855246734493881, 'random_state': RandomState(MT19937) at 0x7F92E68F6940, 'num_epochs': 9}
Best test 0.7305173277854919 at {'no_components': 33, 'loss': 'warp', 'learning_rate': 0.07855246734493881, 'random_state': RandomState(MT19937) at 0x7F92E68F6940, 'num_epochs': 9}


In [58]:
# NOTE(review): the search output above reports no_components=33 and
# num_epochs=9 for the best run, whereas this model uses 30 components and
# the fit that follows uses 5 epochs -- confirm the deviation is intentional.
# The model is seeded with the same value as the splits and the search so
# that refitting it reproduces the same metrics.
best_model = LightFM(
    loss='warp',
    learning_rate=0.07855246734493881,
    no_components=30,
    random_state=np.random.RandomState(seed=11232),
)

In [59]:
# Train the final model on the training split, using both feature matrices
# and the training weights.
best_model.fit(
    interactions=train_interactions,
    item_features=item_features,
    user_features=user_features,
    sample_weight=train_weights,
    epochs=5,
    verbose=True,
)

Epoch: 100%|██████████| 5/5 [00:01<00:00,  3.04it/s]


<lightfm.lightfm.LightFM at 0x7f92e6897e50>

In [60]:
# Mean per-user AUC of the final model, first on the training split and then
# on the held-out split.
auc_train, auc_test = (
    auc_score(
        best_model,
        split,
        item_features=item_features,
        user_features=user_features,
    ).mean()
    for split in (train_interactions, test_interactions)
)
print(auc_train)
print(auc_test)

0.7897221
0.7258238


In [61]:
# Mean AUC over the full interaction matrix (before the train/test split).
score = auc_score(
    best_model,
    interactions,
    item_features=item_features,
    user_features=user_features,
).mean()
print(score)

0.77702516


In [62]:
# Mean precision@10 of the final model, training split first, then test.
train_precision, test_precision = (
    precision_at_k(
        best_model,
        split,
        k=10,
        item_features=item_features,
        user_features=user_features,
    ).mean()
    for split in (train_interactions, test_interactions)
)

print(train_precision)
print(test_precision)

0.011165659
0.0043868297


# Save and Load model

In [63]:
# Serialise the fitted model to disk with the newest pickle protocol
# available in this interpreter.
with open('Hybrid.pickle', mode='wb') as fle:
    pickle.dump(best_model, fle, pickle.HIGHEST_PROTOCOL)

In [64]:
# Read the model back. pickle.load works out the protocol version from the
# file, so none is given here. Only unpickle files you trust: loading a
# pickle can execute arbitrary code.
with open('Hybrid.pickle', mode='rb') as f:
    model_from_pickle = pickle.load(f)

In [65]:
# Score the reloaded model on the full interaction matrix so the figure can
# be compared with the one printed for best_model on the same matrix.
score_from_pkl = auc_score(
    model_from_pickle,
    interactions,
    item_features=item_features,
    user_features=user_features,
).mean()
print(score_from_pkl)

0.77702516


In [None]:
def sample_recommendation_user(model, interactions, user_id, user_dict,
                               item_dict, threshold=0, nrec_items=10, show=True,
                               item_features=None, user_features=None):
    '''
    Function to produce user recommendations
    Required Input -
        - model = Trained model exposing predict(user_index, item_indices)
        - interactions = pandas DataFrame with one row per user ID and one
          column per item ID (it is accessed through .loc and .columns)
        - user_id = user ID for which we need to generate recommendation
        - user_dict = Dictionary type input containing user_id as key and the
          user's index in the model as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - threshold = value above which the rating is favorable in new interaction matrix
        - nrec_items = Number of output recommendation needed
        - show = when truthy, print the known and the recommended items
        - item_features / user_features = optional feature matrices forwarded
          to model.predict; a model fitted with features needs the same
          matrices here. Omitted (None) arguments are not forwarded at all.
    Expected Output -
        - Prints list of items the given user has already bought
        - Prints list of N recommended items  which user hopefully will be interested in
        - Returns the list of at most nrec_items recommended item IDs, best first
    Raises -
        - KeyError if user_id is missing from user_dict / interactions, or an
          item ID is missing from item_dict
    '''
    n_items = interactions.shape[1]
    user_x = user_dict[user_id]

    # Forward only the feature matrices the caller actually supplied, so
    # models whose predict() takes no feature arguments keep working.
    predict_kwargs = {}
    if item_features is not None:
        predict_kwargs['item_features'] = item_features
    if user_features is not None:
        predict_kwargs['user_features'] = user_features

    # Score every item for this user and label each score with its item ID.
    predicted = pd.Series(model.predict(user_x, np.arange(n_items), **predict_kwargs))
    predicted.index = interactions.columns
    ranked_items = predicted.sort_values(ascending=False).index.tolist()

    # Items the user already rated above the threshold, in descending ID order.
    user_row = interactions.loc[user_id, :]
    known_items = list(pd.Series(user_row[user_row > threshold].index)
                       .sort_values(ascending=False))

    # Set membership keeps the filter linear in the number of items.
    already_known = set(known_items)
    return_score_list = [item for item in ranked_items
                         if item not in already_known][:nrec_items]

    # Resolve names up front so an unknown item ID fails even when show is off.
    known_names = [item_dict[item] for item in known_items]
    recommended_names = [item_dict[item] for item in return_score_list]

    if show:
        print("Known Likes:")
        for counter, name in enumerate(known_names, start=1):
            # format() also copes with item names that are not strings.
            print('{}- {}'.format(counter, name))

        print("\n Recommended Items:")
        for counter, name in enumerate(recommended_names, start=1):
            print('{}- {}'.format(counter, name))
    return return_score_list