In [1]:
pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp39-cp39-linux_x86_64.whl size=3195812 sha256=8dced0afb08b44653b5c747346785c4a685e58bfacf28c461600697b9117c8f8
  Stored in directory: /root/.cache/pip/wheels/c6/3a/46/9b17b3512bdf283c6cb84f59929cdd5199d4e754d596d22784
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [2]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import accuracy
import os
from surprise import BaselineOnly, Dataset, Reader
from surprise.model_selection import cross_validate
import random
import pandas as pd
import numpy as np

In [3]:
filePath = os.path.expanduser("ratings.dat")

reader = 0
try:
    reader = Reader(line_format="user item rating timestamp", sep="::")
    data = Dataset.load_from_file(filePath, reader=reader)
except:
    reader = Reader(line_format="user item rating timestamp", sep=":")
    data = Dataset.load_from_file(filePath, reader=reader)

#random.shuffle(data)

train_set, test_set = train_test_split(data, test_size=.20)


#algo = SVD(n_factors=20, n_epochs=20, biased=False)
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(train_set)
predictions = algo.test(test_set)

In [4]:
def CFBased(userNum):
    userNum = int(userNum)
    ratings = pd.read_csv('ratings.csv')
    movies = pd.read_csv('movies.csv')
    movies.set_index('MovieID')
    ratings = pd.merge(movies,ratings)
    userRatings = ratings.pivot_table(index=['UserID'],columns=['Title'], values='Rating')
    userRatings = userRatings.fillna(0)

    if(len(userRatings) <= userNum-1):
        userRatings.loc[len(userRatings) + 1] = 0

    userRatings.loc[userNum] = userRatings.loc[userNum].mask(userRatings.loc[userNum]!=0)
    #print(userRatings.loc[userNum, '12 Angry Men (1957)'])
    
    for (columnName, columnData) in userRatings.items():
        if(userRatings.loc[userNum, columnName] == 0):
            movieID = movies.loc[movies['Title'] == columnName,'MovieID'].iloc[0]
            predRate = algo.predict(str(userNum), str(movieID))
            userRatings.loc[userNum, columnName] = predRate.est
    
    oneUser = userRatings
    oneUser = oneUser.T
    oneUser = oneUser[[userNum]]
    oneUser = oneUser.fillna(0)
    oneUser = oneUser.sort_values(by=[userNum], ascending=False)
    oneUser['index'] = range(1, len(oneUser) + 1)
    oneUser.set_index('index')
    
    return oneUser
        


In [5]:
def CBBased(userNum):
    userNum = int(userNum)
    movies = pd.read_csv('movies.csv')
    ratings = pd.read_csv('ratings.csv')
    movies['Genres'] = movies['Genres'].str.split('|')
    movies.head()
    
    moviesGenres = movies.copy(deep=True)

    genreList = [] 

    exist = 0
    for ind in movies.index:
        for eachGenres in movies['Genres'][ind]:
            moviesGenres.at[ind, eachGenres] = 1
            for genre in genreList:
                if eachGenres == genre:
                    exist = 1
            if(exist == 0):
                genreList.append(eachGenres)
            else:
                exist = 0

    moviesGenres = moviesGenres.fillna(0)    
    genreMatrix = moviesGenres[genreList].to_numpy()
    userRatings = ratings[ratings['UserID'] == userNum]
    if len(userRatings.index) > 10:
        userPref = userRatings.sample(frac=0.3, random_state=1).reset_index(drop=True)
    else:
        userPref = userRatings.sample(frac=1, random_state=1).reset_index(drop=True)
    
    movieRatings = pd.merge(userPref, moviesGenres)
    movieRatings = movieRatings[genreList]
    weight = userPref['Rating']/userPref['Rating'].sum()
    userProf = movieRatings.T.dot(weight)
    userProfNorm = (userProf / userProf.sum()).sort_values()

    moviesGenres = moviesGenres[genreList]

    userProfNorm_matrix = np.array([userProfNorm] * len(moviesGenres))
    results = np.multiply(userProfNorm_matrix, np.array(moviesGenres))
    results = results.sum(axis=1)

    recommendTable = movies[['MovieID','Title']]
    recommendTable['Rating'] = results

    recommendTable = recommendTable.sort_values(by=['Rating'], ascending = False).reset_index(drop=True)
    recommendTable.index = recommendTable.index + 1
    
    return recommendTable

In [6]:
def coldStart(selectGenres):
    movies = pd.read_csv('movies.csv')
    ratings = pd.read_csv('ratings.csv')
    movies['Genres'] = movies['Genres'].str.split('|')
    
    moviesGenres = movies.copy(deep=True)

    genreList = [] 

    exist = 0
    for ind in movies.index:
        for eachGenres in movies['Genres'][ind]:
            moviesGenres.at[ind, eachGenres] = 1
            for genre in genreList:
                if eachGenres == genre:
                    exist = 1
            if(exist == 0):
                genreList.append(eachGenres)
            else:
                exist = 0

    moviesGenres = moviesGenres.fillna(0)    
    genreMatrix = moviesGenres[genreList].to_numpy()

    genrePicks = pd.DataFrame(columns=genreList)
    genrePicks.loc[len(genrePicks.index)] = 0

    for elem in genreList:
        for genre in selectGenres:
            if(genre == elem):
               genrePicks.at[0,genre] = 1

    genrePicks = genrePicks.reset_index(drop=True)
    genrePicks = genrePicks.T
    
    userProfNorm = (genrePicks / genrePicks.sum()).sort_values(by=[0])
    userProfNorm = userProfNorm[0]
    
    moviesGenres = moviesGenres[genreList]
    
    userProfNorm_matrix = np.array([userProfNorm] * len(moviesGenres))
    results = np.multiply(userProfNorm_matrix, np.array(moviesGenres))
    results = results.sum(axis=1)

    recommendTable = movies[['MovieID','Title']]
    recommendTable['Rating'] = results

    recommendTable = recommendTable.sort_values(by=['Rating'], ascending = False).reset_index(drop=True)
    recommendTable.index = recommendTable.index + 1
    
    return recommendTable

In [7]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')
ratings.head(5)

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [16]:
movies.head(5)
print(type(movies))
newMovies = np.array(movies)
print(type(newMovies))

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>


In [9]:
movies.set_index('MovieID')
ratings = pd.merge(movies,ratings)
userRatings = ratings.pivot_table(index=['UserID'],columns=['Title'], values='Rating')
userRatings = userRatings.fillna(0)

user = 6041 #6041 will be the new user. Anything not 6041 is a predefined user and the maximum number of users are 6040

In [10]:
userGenres = ['Action', 'Adventure', 'Comedy', 'Romance']

In [11]:



if(len(userRatings.index) <= user-1): #cold start
    selectGenres = np.array(userGenres) #Example Genre Selected CHANGE THIS
    combinedRecommend = coldStart(selectGenres)
    display(combinedRecommend) #I displayed the recommendation here tweak this for exporting purposes
else: #not cold start
    CBrec = CBBased(user)
    CFrec = CFBased(user)
    
    display("CB",CBrec)
    display("CF",CFrec)
    
    combinedRecommend = CBrec.copy(deep=True)

    for title in combinedRecommend['Title']:
        contentB = CBrec.loc[CBrec['Title'] == title]
        rankCB = contentB['Rating'].values[0]
        indexCB = contentB['Rating'].index[0]
        try:
            rankCF = CFrec.loc[title].values[0]
        except:
            continue
        indexCF = CFrec.loc[title].values[1]
        compounded = (rankCB * 0.8) + (rankCF * 0.2)
        combinedRecommend.at[indexCB, 'Rating'] = compounded

    combinedRecommend = combinedRecommend.sort_values(by=['Rating'], ascending = False).reset_index(drop=True)
    
    display(combinedRecommend) #I displayed the recommendation here tweak this for exporting purposes
    

Unnamed: 0,MovieID,Title,Rating
1,913,"Maltese Falcon, The (1941)",0.5
2,1252,Chinatown (1974),0.5
3,1617,L.A. Confidential (1997),0.5
4,942,Laura (1944),0.5
5,164,Devil in a Blue Dress (1995),0.5
...,...,...,...
3879,1389,Jaws 3-D (1983),0.0
3880,1390,My Fellow Americans (1996),0.0
3881,1391,Mars Attacks! (1996),0.0
3882,1392,Citizen Ruth (1996),0.0


In [18]:
tmdb = pd.read_csv('links.csv')
movies['tmdbID'] = 0
for movieID in movies['MovieID']:
    rowData = movies.loc[movies['MovieID']
                              == movieID].index
    try:
        movieTmdb = tmdb['tmdbId'].loc[tmdb['movieId'] == movieID].values[0]
        movies.at[rowData[0], 'tmdbID'] = int(movieTmdb)
    except:
        movies = movies.drop([rowData[0]])
        continue
        
movies.head(5)

Unnamed: 0,MovieID,Title,Genres,tmdbID
0,1,Toy Story (1995),Animation|Children's|Comedy,862
1,2,Jumanji (1995),Adventure|Children's|Fantasy,8844
2,3,Grumpier Old Men (1995),Comedy|Romance,15602
3,4,Waiting to Exhale (1995),Comedy|Drama,31357
4,5,Father of the Bride Part II (1995),Comedy,11862


In [19]:
movies.to_csv('movies.csv', index=False)

In [20]:
new_movies = pd.read_csv('movies.csv')
new_movies.head()

Unnamed: 0,MovieID,Title,Genres,tmdbID
0,1,Toy Story (1995),Animation|Children's|Comedy,862
1,2,Jumanji (1995),Adventure|Children's|Fantasy,8844
2,3,Grumpier Old Men (1995),Comedy|Romance,15602
3,4,Waiting to Exhale (1995),Comedy|Drama,31357
4,5,Father of the Bride Part II (1995),Comedy,11862


In [12]:
newCombined = combinedRecommend.copy(deep=True)
tmdb = pd.read_csv('links.csv')
newCombined['tmdbID'] = 0
for movieID in newCombined['MovieID']:
    rowData = newCombined.loc[combinedRecommend['MovieID']
                              == movieID].index
    try:
        movieTmdb = tmdb['tmdbId'].loc[tmdb['movieId'] == movieID].values[0]
        newCombined.at[rowData[0], 'tmdbID'] = int(movieTmdb)
    except:
        newCombined = newCombined.drop([rowData[0]])
        continue
        
newCombined


Unnamed: 0,MovieID,Title,Rating,tmdbID
1,913,"Maltese Falcon, The (1941)",0.5,963
2,1252,Chinatown (1974),0.5,829
3,1617,L.A. Confidential (1997),0.5,2118
4,942,Laura (1944),0.5,1939
5,164,Devil in a Blue Dress (1995),0.5,8512
...,...,...,...,...
3879,1389,Jaws 3-D (1983),0.0,17692
3880,1390,My Fellow Americans (1996),0.0,17795
3881,1391,Mars Attacks! (1996),0.0,75
3882,1392,Citizen Ruth (1996),0.0,13891


In [13]:
newCombined.head(5).to_dict(orient='records')

[{'MovieID': 913,
  'Title': 'Maltese Falcon, The (1941)',
  'Rating': 0.5,
  'tmdbID': 963},
 {'MovieID': 1252, 'Title': 'Chinatown (1974)', 'Rating': 0.5, 'tmdbID': 829},
 {'MovieID': 1617,
  'Title': 'L.A. Confidential (1997)',
  'Rating': 0.5,
  'tmdbID': 2118},
 {'MovieID': 942, 'Title': 'Laura (1944)', 'Rating': 0.5, 'tmdbID': 1939},
 {'MovieID': 164,
  'Title': 'Devil in a Blue Dress (1995)',
  'Rating': 0.5,
  'tmdbID': 8512}]