In [1]:
import os

def list_all_files_in(dirpath):
    """Print the path of every file found anywhere beneath ``dirpath``.

    The tree is traversed with ``os.walk``; each file is reported as its
    containing directory joined with the file name, one path per line.
    Nothing is returned.
    """
    for folder, _subdirs, names in os.walk(dirpath):
        # Assemble the full paths for this folder, keeping the order in which
        # os.walk listed the file names, then emit them.
        for path in [os.path.join(folder, name) for name in names]:
            print(path)

# Print every file available under the Kaggle input directory.
list_all_files_in('/kaggle/input')

/kaggle/input/movielens100k/movies.csv
/kaggle/input/movielens100k/ratings.csv
/kaggle/input/movielens100k/u.data
/kaggle/input/movielens100k/tags.csv
/kaggle/input/movielens100k/links.csv


# Imports

In [2]:

import pandas as pd


import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns


from collections import deque


from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

# Recommender library
import surprise as sp
from surprise.model_selection import cross_validate, train_test_split

# Sparse matrices
from scipy.sparse import coo_matrix

# LightFM
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

# Stacking sparse matrices
from scipy.sparse import vstack

# Displaying stuff
from IPython.display import display

import warnings; warnings.simplefilter('ignore')

In [3]:
# Load the ratings table from CSV and preview its first 10 rows.
ratings = pd.read_csv('/kaggle/input/movielens100k/ratings.csv')
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [4]:
# Surprise's Reader assumes a 1-5 rating scale unless told otherwise. The
# ratings loaded here use half-star values (e.g. 2.5, 3.5), so take the scale
# bounds from the data itself rather than relying on that default.
reader = sp.Reader(
    rating_scale=(float(ratings['rating'].min()), float(ratings['rating'].max()))
)
# load_from_df expects the columns in (user, item, rating) order.
data = sp.Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [5]:
# Read from the same absolute input directory as the ratings file so this cell
# does not depend on the notebook's working directory.
movies_df = pd.read_csv('/kaggle/input/movielens100k/movies.csv')

# Pull the four-digit release year out of a "(YYYY)" marker in the title.
# Titles without exactly that pattern (e.g. "(2007-)" or no year at all) get
# NaN. The year is kept as a string.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)

# Remove the "(YYYY)" marker from the title and trim surrounding whitespace.
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the year in place.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    .str.strip()
)

# Turn the pipe-separated genre string into a list of genre names.
movies_df['genres'] = movies_df['genres'].str.split('|')

# Number of distinct movies, followed by a preview of the cleaned table.
print(movies_df['movieId'].nunique())
movies_df.head()

9125


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [6]:
# Count the missing values in each column of movies_df.
movies_df.isna().sum()

movieId    0
title      0
genres     0
year       6
dtype: int64

In [7]:
# Show every movie that has at least one missing value in any column.
has_missing_value = movies_df.isna().any(axis='columns')
movies_df.loc[has_missing_value]

Unnamed: 0,movieId,title,genres,year
8505,108548,"Big Bang Theory, The (2007-)",[Comedy],
8507,108583,Fawlty Towers (1975-1979),[Comedy],
9017,143410,Hyena Road,[(no genres listed)],
9063,151307,The Lovers and the Despot,[(no genres listed)],
9118,162376,Stranger Things,[Drama],
9124,164979,"Women of '69, Unboxed",[Documentary],


In [8]:
# Assign the filled column back rather than calling fillna(inplace=True) on
# the column selection: that chained in-place form is deprecated and, with
# pandas copy-on-write enabled, leaves movies_df itself unchanged.
# NOTE: the placeholder is the integer 0 while extracted years are strings,
# so the column holds mixed types afterwards.
movies_df['year'] = movies_df['year'].fillna(0)

# Show the rows that carry the placeholder, selected by value instead of by
# hard-coded row labels.
movies_df[movies_df['year'] == 0]

Unnamed: 0,movieId,title,genres,year
8505,108548,"Big Bang Theory, The (2007-)",[Comedy],0
8507,108583,Fawlty Towers (1975-1979),[Comedy],0
9017,143410,Hyena Road,[(no genres listed)],0
9063,151307,The Lovers and the Despot,[(no genres listed)],0
9118,162376,Stranger Things,[Drama],0
9124,164979,"Women of '69, Unboxed",[Documentary],0


In [9]:
# One-hot encode the genre lists: one float column per genre, 1.0 where the
# movie carries that genre and 0.0 otherwise.

# Genre names in order of first appearance, so the column order is stable.
genre_names = list(dict.fromkeys(
    genre for genres in movies_df['genres'] for genre in genres
))

# Build the whole indicator matrix in one go instead of growing the frame
# cell by cell inside an iterrows() loop.
genre_flags = pd.DataFrame(
    [[float(name in genres) for name in genre_names] for genres in movies_df['genres']],
    index=movies_df.index,
    columns=genre_names,
)

# Replace the list-valued 'genres' column with the indicator columns. Any
# value still missing in the remaining columns becomes 0.0.
movies_with_genres = pd.concat(
    [movies_df.drop('genres', axis=1), genre_flags], axis=1
).fillna(0.0)
movies_with_genres.head()

Unnamed: 0,movieId,title,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,Documentary,IMAX,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,1995,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,1995,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,1995,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,1995,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,1995,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
def get_user_movie_ratings(uid):
    """Return the movies rated by user ``uid`` together with their ratings.

    Reads the notebook-level ``ratings`` and ``movies_df`` frames and joins
    them on ``movieId``. The result has the columns ``movieId``, ``title``
    and ``rating``; only ratings whose movie appears in ``movies_df`` are kept.
    """
    rated_by_user = ratings.loc[ratings['userId'] == uid]
    joined = movies_df.merge(rated_by_user, on='movieId')
    return joined[['movieId', 'title', 'rating']]

In [11]:
def get_user_genres(uid):
    """Return the genre indicator rows for the movies user ``uid`` has rated.

    Rows are taken from the notebook-level ``movies_with_genres`` frame for
    every movie whose ``movieId`` appears in ``get_user_movie_ratings(uid)``.
    They keep the row order of ``movies_with_genres`` and are renumbered
    from 0. The ``movieId``, ``title`` and ``year`` columns are removed, so
    only the genre columns remain.
    """
    rated_ids = get_user_movie_ratings(uid)['movieId']
    is_rated = movies_with_genres['movieId'].isin(rated_ids)
    # Non-mutating calls only: dropping columns in place on a filtered slice
    # is the pattern pandas flags with SettingWithCopyWarning, and this keeps
    # the shared movies_with_genres frame untouched.
    return (
        movies_with_genres.loc[is_rated]
        .drop(['movieId', 'title', 'year'], axis=1)
        .reset_index(drop=True)
    )

In [12]:
def content_based_recommender(uid):
    """Score every movie for user ``uid`` from the genres of movies they rated.

    The user's genre profile is the rating-weighted sum of the genre
    indicator rows of the movies they rated. Each movie's score is the dot
    product of its genre indicators with that profile, divided by the sum of
    the profile weights.

    Parameters
    ----------
    uid
        User identifier, passed to ``get_user_movie_ratings``.

    Returns
    -------
    pandas.Series
        One score per row of the notebook-level ``movies_with_genres`` frame,
        indexed by ``movieId`` and left unsorted (sort descending to rank).
        If the profile weights sum to zero (for example, the user has no
        ratings), every score is 0.0 instead of the NaN that 0/0 would give.
    """
    user_movie_ratings = get_user_movie_ratings(uid)

    # Genre indicator matrix keyed by movieId. set_index/drop already return
    # new frames, so no defensive copy of the full table is needed.
    genres_df = movies_with_genres.set_index('movieId').drop(['title', 'year'], axis=1)

    # Look up each rated movie's genre row by movieId so every rating is
    # paired with its own movie, then sum the genre rows weighted by rating.
    rated_genres = genres_df.loc[user_movie_ratings['movieId'].to_numpy()]
    user_profile = rated_genres.T.dot(user_movie_ratings['rating'].to_numpy())

    profile_total = user_profile.sum()
    if profile_total == 0:
        # Nothing to base a preference on: avoid dividing zero by zero.
        return pd.Series(0.0, index=genres_df.index)

    return genres_df.dot(user_profile) / profile_total

In [13]:
# Score all movies for each distinct user: final_dict maps a userId to the
# Series returned by content_based_recommender for that user.
ab = ratings['userId'].unique()
final_dict = {user_id: content_based_recommender(user_id) for user_id in ab}
    

In [28]:
# Assemble the per-user score Series into one table: each dict key becomes a
# column, and rows follow the index of the Series.
df = pd.DataFrame(final_dict)


In [29]:
# Display the score table (rows: movieId, columns: userId).
df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.369811,0.318881,0.274081,0.439483,0.432843,0.382979,0.440090,0.241379,0.248227,0.240654,...,0.354049,0.456053,0.208887,0.424833,0.292835,0.287037,0.296875,0.262590,0.220000,0.416840
2,0.249057,0.149650,0.122145,0.245985,0.179902,0.239953,0.238522,0.126146,0.101655,0.140187,...,0.188324,0.237148,0.122040,0.202403,0.133956,0.106481,0.088542,0.079137,0.116667,0.215506
3,0.101887,0.248951,0.201589,0.198590,0.362745,0.144208,0.195969,0.180707,0.269504,0.163551,...,0.248588,0.210614,0.105174,0.291589,0.218069,0.259259,0.187500,0.230216,0.176667,0.233014
4,0.237736,0.446154,0.396226,0.296514,0.508333,0.312057,0.310190,0.371454,0.534279,0.306075,...,0.419962,0.308458,0.238161,0.460080,0.355140,0.493827,0.479167,0.377698,0.390000,0.396832
5,0.075472,0.138462,0.129096,0.149236,0.214706,0.109929,0.146697,0.106067,0.122931,0.100467,...,0.145009,0.175788,0.074603,0.176769,0.143302,0.143519,0.161458,0.183453,0.090000,0.158816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162672,0.309434,0.370629,0.332671,0.252644,0.368627,0.334515,0.294513,0.339590,0.456265,0.324766,...,0.365348,0.286899,0.243594,0.359680,0.308411,0.388889,0.348958,0.262590,0.356667,0.336807
163056,0.407547,0.225175,0.249255,0.347043,0.200980,0.395981,0.402016,0.261021,0.189125,0.392523,...,0.252354,0.417910,0.364175,0.232310,0.267913,0.171296,0.109375,0.284173,0.200000,0.275115
163949,0.000000,0.000000,0.021847,0.003134,0.010784,0.000000,0.000000,0.002619,0.000000,0.000000,...,0.000000,0.000000,0.000649,0.000000,0.015576,0.000000,0.000000,0.000000,0.000000,0.010004
164977,0.075472,0.138462,0.129096,0.149236,0.214706,0.109929,0.146697,0.106067,0.122931,0.100467,...,0.145009,0.175788,0.074603,0.176769,0.143302,0.143519,0.161458,0.183453,0.090000,0.158816


In [30]:
# Rescale each column of df independently onto the range 1-5: the column
# minimum maps to 1 and the column maximum to 5.
col_min = df.min()
col_span = df.max() - col_min

# A column whose values are all equal has zero span, and dividing by it would
# turn every entry into NaN. Use a divisor of 1 there so such a column lands
# on the lower bound (1.0).
col_span = col_span.replace(0, 1)

# Whole-frame arithmetic: the per-column Series align with df's columns, so
# no Python-level loop over columns is needed. The factor 4 is the width of
# the target range (5 - 1) and the offset 1 is its lower bound.
df_min_max_scaled = (df - col_min) / col_span * 4 + 1

# view normalized data
print(df_min_max_scaled)

              1         2         3         4         5         6         7    \
movieId                                                                         
1        3.118919  2.798817  2.565957  3.750000  3.253989  3.207836  3.757895   
2        2.427027  1.844181  1.697872  2.539216  1.936822  2.383305  2.494737   
3        1.583784  2.404339  2.151773  2.242647  2.888960  1.831346  2.228070   
4        2.362162  3.516765  3.263830  2.855392  3.647096  2.798978  2.943860   
5        1.432432  1.781065  1.737589  1.933824  2.118060  1.633731  1.919298   
...           ...       ...       ...       ...       ...       ...       ...   
162672   2.772973  3.090730  2.900709  2.580882  2.919592  2.928450  2.845614   
163056   3.335135  2.270217  2.424113  3.171569  2.046586  3.282794  3.519298   
163949   1.000000  1.000000  1.124823  1.019608  1.056158  1.000000  1.000000   
164977   1.432432  1.781065  1.737589  1.933824  2.118060  1.633731  1.919298   
164979   1.000000  1.000000 

In [31]:
# Flip the scaled table so that rows and columns swap places.
# NOTE: this rebinds `df`, which the scaling cell reads as its input, so
# re-running that cell after this one would scale the transposed table.
df = df_min_max_scaled.T

In [32]:
# Display the transposed, scaled table (columns: movieId).
df

movieId,1,2,3,4,5,6,7,8,9,10,...,161830,161918,161944,162376,162542,162672,163056,163949,164977,164979
1,3.118919,2.427027,1.583784,2.362162,1.432432,2.556757,1.583784,2.059459,1.605405,3.183784,...,2.772973,3.227027,1.778378,1.778378,1.886486,2.772973,3.335135,1.000000,1.432432,1.000000
2,2.798817,1.844181,2.404339,3.516765,1.781065,2.309665,2.404339,1.575937,1.497041,2.404339,...,2.775148,2.112426,2.112426,2.112426,2.175542,3.090730,2.270217,1.000000,1.781065,1.000000
3,2.565957,1.697872,2.151773,3.263830,1.737589,2.492199,2.151773,1.527660,1.629787,2.446809,...,2.707801,2.407092,2.112057,2.112057,1.856738,2.900709,2.424113,1.124823,1.737589,1.124823
4,3.750000,2.539216,2.242647,2.855392,1.933824,2.470588,2.242647,2.125000,1.649510,2.742647,...,2.220588,2.931373,1.612745,1.612745,1.742647,2.580882,3.171569,1.019608,1.933824,1.019608
5,3.253989,1.936822,2.888960,3.647096,2.118060,1.643267,2.888960,1.668794,1.245054,1.857690,...,2.059349,1.857690,1.758137,1.758137,1.992980,2.919592,2.046586,1.056158,2.118060,1.056158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2.467456,1.544379,2.325444,3.524655,1.733728,2.285996,2.325444,1.386588,1.386588,2.136095,...,2.751479,1.717949,2.199211,2.199211,2.143984,2.988166,1.875740,1.000000,1.733728,1.000000
668,2.443038,1.430380,1.911392,3.329114,1.784810,2.443038,1.911392,1.227848,1.177215,1.784810,...,3.000000,1.455696,2.417722,2.417722,1.582278,2.696203,1.531646,1.000000,1.784810,1.000000
669,2.221757,1.368201,2.071130,2.757322,1.853556,2.489540,2.071130,1.317992,1.569038,2.556485,...,2.656904,2.573222,1.686192,1.686192,1.887029,2.221757,2.322176,1.000000,1.853556,1.000000
670,2.194570,1.633484,1.959276,3.117647,1.488688,2.610860,1.959276,1.452489,1.325792,2.339367,...,2.954751,1.995475,2.158371,2.158371,2.176471,2.936652,2.085973,1.000000,1.488688,1.000000
