In [None]:
import pandas as pd 
import numpy as np
from lightfm import LightFM, cross_validation
from lightfm.evaluation import precision_at_k, auc_score
from lightfm.data import Dataset
import pickle 




In [None]:
# Load the raw ratings and the movie catalogue from the working directory.
df1 = pd.read_csv('ratings.csv')
df2 = pd.read_csv('movies.csv')

# Attach every rating to its movie row (inner join on movieId) and drop incomplete rows.
df3 = pd.merge(df2, df1, on='movieId').dropna()

# df  : the (user, item, rating) triples used to build the interaction matrices.
# df4 : the same triples plus the movie title and genres, used later to derive features.
df = df3.loc[:, ['userId', 'movieId', 'rating']]
df4 = df3.loc[:, ['userId', 'movieId', 'rating', 'title', 'genres']]


In [None]:
# Ratings joined with movie metadata; used later to look up titles and a user's best-rated movies.
rating_df = pd.merge(df2, df1, on='movieId').dropna()
rating_df.head(n=5)


In [None]:
# Rating triples together with the movie title and genres.
df4 = df3.loc[:, ['userId', 'movieId', 'rating', 'title', 'genres']]
df4.head(n=5)

In [None]:
# Register every user id and movie id so LightFM can assign them internal indices.
data = Dataset()
data.fit(
    users=df['userId'].unique(),
    items=df['movieId'].unique(),
)

In [None]:
# Feed one (userId, movieId, rating) tuple per row; build_interactions returns
# both an interactions matrix and a weights matrix.
interactions_matrix, weights_matrix = data.build_interactions(map(tuple, df.values))

In [None]:
# Hold out 25% of the interactions for evaluation; the seed keeps the split repeatable.
train, test = cross_validation.random_train_test_split(
    interactions_matrix,
    test_percentage=0.25,
    random_state=2022,
)

In [None]:
# Split the rating weights with the same percentage and seed as the interactions.
# NOTE(review): presumably this yields the same train/test entries as the split above - confirm.
train_weight, test_weight = cross_validation.random_train_test_split(
    weights_matrix,
    test_percentage=0.25,
    random_state=2022,
)

In [None]:
# Collaborative-filtering model trained with the WARP loss on the training split,
# using the ratings as per-interaction sample weights.
# random_state pins LightFM's random initialisation and sampling so that a
# Restart & Run All reproduces the metrics reported below (same seed as the splits).
# NOTE(review): `k` is documented for the k-OS ("warp-kos") loss only; if the intent
# was the embedding size, that parameter is `no_components` - confirm.
model_wrap = LightFM(loss='warp', learning_rate=0.01, k=10, random_state=2022)
model_wrap.fit(train, sample_weight=train_weight, epochs=150)

In [None]:
# Mean precision@10 over users. For the test split, the training interactions are
# passed as `train_interactions`.
train_precision = precision_at_k(model_wrap, train, k=10).mean()
test_precision = precision_at_k(
    model_wrap,
    test,
    train_interactions=train,
    k=10,
).mean()

# Mean AUC over users, computed the same way.
train_auc = auc_score(model_wrap, train).mean()
test_auc = auc_score(model_wrap, test, train_interactions=train).mean()

precision_report = 'Precision: train %.2f, test %.2f.' % (train_precision, test_precision)
auc_report = 'AUC: train %.2f, test %.2f.' % (train_auc, test_auc)
print(precision_report)
print(auc_report)

In [None]:
# Disabled experiment: the same model as above but trained for 50 epochs.
# It is kept as a string literal, so nothing is trained here; the cell only
# evaluates (and displays) the string.
'''model_wrap = LightFM(loss='warp', learning_rate=0.01, k=10)
model_wrap.fit(train, sample_weight = train_weight , epochs=50)'''

In [None]:
# Show the repr of the training matrix.
print(f"{train!r}")

In [None]:
# Enumerate every distinct "column:value" feature name, column by column, so the
# Dataset can register them as user features.
uf = [
    str(column) + ":" + str(value)
    for column in ('movieId', 'rating', 'title', 'genres')
    for value in df4[column].unique()
]
                   

In [None]:
# Rebuild the Dataset, this time also registering the "column:value" feature names.
data = Dataset()
data.fit(
    users=df4['userId'].unique(),
    items=df4['movieId'].unique(),  # all items
    user_features=uf,               # additional user features
)

In [None]:
# Rebuild the interaction and weight matrices against the feature-aware Dataset.
interactions_matrix, weights_matrix = data.build_interactions(map(tuple, df.values))

In [None]:
# Same 75/25 split and seed as before, applied to the rebuilt interaction matrix.
train, test = cross_validation.random_train_test_split(
    interactions_matrix,
    test_percentage=0.25,
    random_state=2022,
)

In [None]:
# Split the rebuilt weights with the same percentage and seed as the interactions.
# NOTE(review): presumably this selects the same entries as the interaction split - confirm.
train_weight, test_weight = cross_validation.random_train_test_split(
    weights_matrix,
    test_percentage=0.25,
    random_state=2022,
)

In [None]:
# Preview the columns the user features are derived from.
df4.head(n=5)

In [None]:
# Helper that converts one row of raw values into the "feature:value" strings
# used as user-feature names.
def feature_colon_value(my_list):
    """
    Prefix each value of ``my_list`` with its column name.

    Values are paired positionally with 'movieId', 'rating', 'title' and
    'genres'; pairing stops at the shorter of the two sequences.
    For example: if my_list = [1, 1, 0, 'del'],
    resultant output = ['movieId:1', 'rating:1', 'title:0', 'genres:del']
    """
    prefixes = ('movieId:', 'rating:', 'title:', 'genres:')
    return [prefix + str(value) for prefix, value in zip(prefixes, my_list)]
# Apply the helper to every rating row: each row yields the list of
# "feature:value" strings that will be attached to that row's user.
ad_subset = df4.loc[:, ['movieId', 'rating', 'title', 'genres']]
ad_list = list(map(list, ad_subset.values))
feature_list = [feature_colon_value(row_values) for row_values in ad_list]

In [None]:
# Pair each rating row's userId with that row's feature strings:
# one (user id, [feature names]) entry per rating.
user_tuple = [*zip(df['userId'], feature_list)]

In [None]:
# Build the user-feature matrix from the (user id, features) pairs,
# with normalisation switched off.
user_features = data.build_user_features(
    user_tuple,
    normalize=False,
)

In [None]:
# Unpack the four mappings the Dataset exposes (user ids, user features,
# movie ids, movie features), then display the user-feature mapping.
(
    user_id_map,
    user_feature_map,
    movie_id_map,
    movie_feature_map,
) = data.mapping()
user_feature_map

In [None]:
# Hybrid model: WARP loss, trained on the full interaction matrix with the user
# features and the ratings as sample weights.
# random_state pins LightFM's random initialisation and sampling so that a
# Restart & Run All reproduces the scores below (same seed as the splits).
model = LightFM(loss='warp', random_state=2022)
model.fit(interactions_matrix,
      user_features= user_features,
      sample_weight= weights_matrix,
      epochs=1)

In [None]:
# Mean AUC over users on the data the hybrid model was fitted on, scored with
# the same user features.
per_user_auc = auc_score(model, interactions_matrix, user_features=user_features)
train_auc = per_user_auc.mean()
print('Hybrid training set AUC: %s' % train_auc)


In [None]:
# Predict for an existing user.
# user_id_map translates the raw userId (7) into LightFM's internal user index.
user_x = user_id_map[7]
n_users, n_items = interactions_matrix.shape # no of users * no of items
# The model was fitted with `user_features`, so the same matrix must be supplied
# here; without it LightFM scores the user from identity features only and the
# learned side-feature embeddings are ignored.
model.predict(user_x, np.arange(n_items), user_features=user_features) # means predict for all

In [None]:
# Feature strings used as the profile of a user unseen during training.
user_feature_list = [
    'movieId:1',
    'rating:1',
    'title:Toy Story (1995)',
    'genres:Adventure|Animation|Children|Comedy|Fantasy',
]

In [None]:
from scipy import sparse
def format_newuser_input(user_feature_map, user_feature_list):
    """Encode a new user's features as a single-row sparse matrix.

    Parameters
    ----------
    user_feature_map : dict
        Maps a feature name to its column index. The indices are used to
        address a vector of length ``len(user_feature_map)``, so they are
        assumed to lie in ``0 .. len(user_feature_map) - 1``.
    user_feature_list : iterable of str
        Feature names describing the new user.

    Returns
    -------
    scipy.sparse.csr_matrix
        A ``1 x len(user_feature_map)`` matrix holding 1.0 in the column of
        every known feature and 0 elsewhere.

    Feature names missing from ``user_feature_map`` are reported on stdout
    and skipped.
    """
    normalised_val = 1.0
    target_indices = []
    for feature in user_feature_list:
        try:
            target_indices.append(user_feature_map[feature])
        except KeyError:
            # Unknown features have no column, so they cannot contribute.
            print("new user feature encountered '{}'".format(feature))

    new_user_features = np.zeros(len(user_feature_map))
    for i in target_indices:
        new_user_features[i] = normalised_val
    return sparse.csr_matrix(new_user_features)

In [None]:
# Encode the stand-in profile and score every item for it. The feature matrix
# has a single row, so the user index passed to predict is 0.
new_user_features = format_newuser_input(user_feature_map, user_feature_list)
all_item_indices = np.arange(n_items)
model.predict(0, all_item_indices, user_features=new_user_features)

In [None]:
user_id = 1

# movieIds this user rated 4.5 or higher.
best_rated = rating_df.loc[
    (rating_df['userId'] == user_id) & (rating_df['rating'] >= 4.5),
    'movieId',
].values

best_rated

In [None]:
# LightFM addresses users by internal index, not by raw userId, so translate the
# id through user_id_map before scoring.
scores = model.predict(user_id_map[user_id], np.arange(n_items), user_features=user_features)

# The scores are ordered by internal item index. Map each index back to its
# movieId (inverse of movie_id_map) and then to its title; indexing rating_df
# rows directly with these indices would return titles of unrelated rating rows.
index_to_movie = {index: movie for movie, index in movie_id_map.items()}
title_by_movie = rating_df.drop_duplicates('movieId').set_index('movieId')['title']
ranked_movie_ids = [index_to_movie[i] for i in np.argsort(-scores).tolist()]
top_items = title_by_movie.loc[ranked_movie_ids]

In [None]:
def recommend(model, user_id):
    """Print the ten highest-scoring movie titles for ``user_id``.

    Relies on notebook globals: ``train`` (for the item count), ``data`` (the
    fitted LightFM Dataset), ``rating_df`` (ratings joined with titles),
    ``user_features`` and ``format_newuser_input``.

    Parameters
    ----------
    model : LightFM
        A fitted model.
    user_id : int
        Raw ``userId`` as it appears in ``rating_df``.

    Behaviour
    ---------
    * If the user has no rating >= 4.5 (or is unknown to the Dataset), the
      scores are computed for a fixed stand-in feature profile.
    * Otherwise the titles the user rated >= 4.5 are printed first and the
      scores are computed for that user.

    Nothing is returned; the results are printed.
    """
    n_users, n_items = train.shape
    user_id_map, user_feature_map, movie_id_map, movie_feature_map = data.mapping()

    # Scores come back ordered by LightFM's internal item index. Resolve an index
    # to a title via its movieId; indexing rating_df rows with item indices would
    # return the titles of unrelated rating rows.
    index_to_movie = {index: movie for movie, index in movie_id_map.items()}
    title_by_movie = rating_df.drop_duplicates('movieId').set_index('movieId')['title']

    best_rated = rating_df[(rating_df.userId == user_id) & (rating_df.rating >= 4.5)].movieId.values
    is_cold_start = best_rated.shape[0] == 0 or user_id not in user_id_map

    if is_cold_start:
        user_feature_list = ['movieId:1', 'rating:1', 'title:Toy Story (1995)', 'genres:Adventure|Animation|Children|Comedy|Fantasy']
        new_user_features = format_newuser_input(user_feature_map, user_feature_list)
        # The feature matrix has one row, so the user index is 0.
        scores = model.predict(0, np.arange(n_items), user_features=new_user_features)
    else:
        known_positives = rating_df.loc[rating_df['movieId'].isin(best_rated)].title.values

        # A model fitted with side features has more user embeddings than users
        # and must be scored with the same feature matrix; a model fitted
        # without them must not receive it.
        fitted_with_features = model.user_embeddings.shape[0] > n_users
        scores = model.predict(
            user_id_map[user_id],  # internal index, not the raw userId
            np.arange(n_items),
            user_features=user_features if fitted_with_features else None,
        )

        print("User %s likes:" % user_id)
        # dict.fromkeys removes duplicate titles while keeping first-seen order.
        for k in dict.fromkeys(known_positives):
            print(k)

    print("\nRecommended:")
    for item_index in np.argsort(-scores)[:10].tolist():
        print(title_by_movie.loc[index_to_movie[item_index]])
    


In [None]:
# Print recommendations for raw userId 448 using the hybrid model.
recommend(model, user_id=448)

In [None]:
# Stand-in profile for a user without strong preferences, encoded as a feature row.
user_feature_list = [
    'movieId:1',
    'rating:1',
    'title:Toy Story (1995)',
    'genres:Adventure|Animation|Children|Comedy|Fantasy',
]
new_user_features = format_newuser_input(user_feature_map, user_feature_list)

# movieIds the current `user_id` rated 4.5 or higher.
best_rated = rating_df.loc[
    (rating_df['userId'] == user_id) & (rating_df['rating'] >= 4.5),
    'movieId',
].values

In [None]:
# True when the user has no rating of 4.5 or higher.
len(best_rated) == 0

In [None]:
# Score every item for the stand-in profile (row 0 of the one-row feature matrix).
scores = model.predict(
    0,
    np.arange(n_items),
    user_features=new_user_features,
)

In [None]:
# Display the predicted scores (one value per item index passed to predict).
scores

In [None]:
# The scores are ordered by LightFM's internal item index. Map each index back to
# its movieId (inverse of movie_id_map) and then to its title; indexing rating_df
# rows directly with these indices would return titles of unrelated rating rows.
index_to_movie = {index: movie for movie, index in movie_id_map.items()}
title_by_movie = rating_df.drop_duplicates('movieId').set_index('movieId')['title']
ranked_movie_ids = [index_to_movie[i] for i in np.argsort(-scores).tolist()]
top_items = title_by_movie.loc[ranked_movie_ids]

# Show only the head of the ranking instead of dumping every title.
print(top_items.head(20).to_string())

In [None]:
# Print the first ten entries of the ranking, one title per line.
for x in top_items.iloc[:10]:
    print(x)

In [None]:

n_users, n_items = train.shape
user_id_map, user_feature_map, movie_id_map, movie_feature_map = data.mapping()

# Stand-in profile used when the user has no rating of 4.5 or higher.
user_feature_list = ['movieId:1', 'rating:1', 'title:Toy Story (1995)', 'genres:Adventure|Animation|Children|Comedy|Fantasy']
new_user_features = format_newuser_input(user_feature_map, user_feature_list)


best_rated = rating_df[(rating_df.userId == 1) & (rating_df.rating >= 4.5)].movieId.values
if best_rated.shape[0] == 0 :
    scores = model.predict(0, np.arange(n_items), user_features=new_user_features)
    # Scores are ordered by internal item index: resolve each index to its movieId
    # and then to its title. Indexing rating_df rows with item indices would
    # return titles of unrelated rating rows.
    index_to_movie = {index: movie for movie, index in movie_id_map.items()}
    title_by_movie = rating_df.drop_duplicates('movieId').set_index('movieId')['title']
    top_items = title_by_movie.loc[[index_to_movie[i] for i in np.argsort(-scores).tolist()]]
    print("\nRecommended:")
    for x in top_items[:10]:
        print(x)

In [None]:
# movieIds that raw userId 7 rated 4.5 or higher.
best_rated = rating_df.loc[
    (rating_df['userId'] == 7) & (rating_df['rating'] >= 4.5),
    'movieId',
].values
best_rated

In [None]:
# Titles of every rating row whose movie is among the best-rated ids; a title
# repeats once per rating row of that movie, so duplicates are dropped while
# keeping first-seen order.
known_positives = rating_df.loc[rating_df['movieId'].isin(best_rated)].title.values
ls = list(dict.fromkeys(known_positives))
ls