# Učitavanje podataka i biblioteka

In [None]:
import pandas as pd
import seaborn as sns
import math
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from scipy.stats.stats import pearsonr
from scipy import spatial

from IPython.display import Image

In [None]:
!pip install scikit-surprise



In [None]:
from surprise import SVD, Reader, accuracy
from surprise import Dataset
from surprise.model_selection import cross_validate

from surprise.model_selection import train_test_split

from collections import defaultdict

In [None]:
# Mount Google Drive (Colab helper) so the CSV files under drive/MyDrive can be read
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Movie table; later cells read its 'movieId', 'title' and 'genres' columns
dm = pd.read_csv('drive/MyDrive/SAIS/PROJEKAT/ml-latest-small/movies.csv')

In [None]:
# Ratings table; later cells read its 'userId', 'movieId' and 'rating' columns
dr = pd.read_csv('drive/MyDrive/SAIS/PROJEKAT/ml-latest-small/ratings.csv')

# Data wrangling

In [None]:
# Turn the pipe-separated 'genres' string of every movie into a list of genre names
dm['genres'] = dm['genres'].apply(lambda x: x.split('|'))

# All distinct genres. Sorted so that the list (and therefore the order of the
# indicator columns) is the same on every run instead of following set iteration order.
genres_flat_list = sorted({item for sublist in dm['genres'] for item in sublist})

# One 0/1 indicator column per genre. Each column is assigned as a whole, so there is
# no chained indexing (dm[col][i] = ...) and pandas raises no SettingWithCopyWarning.
for item in genres_flat_list:
  dm[item] = dm['genres'].apply(lambda genres, item=item: int(item in genres))

# The list-valued 'genres' column is no longer needed
dm.drop(columns=['genres'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [None]:
# Quick size summary of the loaded tables
print('Number of ratings: ', len(dr))
print('Number of users: ', dr['userId'].unique().size)
print('Number of movies: ', len(dm))

Number of ratings:  100836
Number of users:  610
Number of movies:  9742


In [None]:
# Table sizes reused by later cells: rows of the movie table and distinct rating users
NUM_OF_MOVIES = len(dm)
NUM_OF_USERS = dr['userId'].unique().size

In [None]:
# Remove ratings that refer to movies which are not in the movie table.
# Membership is tested against the actual movieId values: the ids are not row
# positions (e.g. the row labelled 314 holds movieId 356), so comparing an id
# with the number of movies does not answer "is this movie in the table?".
dr = dr[dr['movieId'].isin(dm['movieId'])]

# Content Based

Weighted Rating (WR) = $(\frac{v}{v+m}R) + (\frac{m}{v+m}C)$

Gde je:


*   m - prag (threshold): minimalan broj rejtinga koji film mora da ima
*   C - srednja vrednost svih rejtinga
*   v - broj rejtinga tog filma
*   R - srednja vrednost rejtinga tog filma   



In [None]:
# Movie Rating Count (MRC): number of ratings of every movie, in the row order of dm.
# Counts are looked up by movieId rather than written to position movieId - 1,
# because movie ids are not contiguous and do not match row positions.
# Movies without any rating get 0.
mrc = dm['movieId'].map(dr['movieId'].value_counts()).fillna(0).astype(int).tolist()

In [None]:
# m: rating-count threshold of the weighted-rating formula, taken as the
# average number of ratings per movie rounded down to an integer
MIN_REVIEW = sum(mrc) // len(mrc)
m = MIN_REVIEW

# C: mean of all ratings
C = dr['rating'].mean()

In [None]:
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating WR = v/(v+m) * R + m/(v+m) * C of one movie.

    Parameters:
        x: movieId of the movie to score.
        m: rating-count threshold used as the weight of the global mean.
        C: mean of all ratings.

    Returns:
        The weighted rating as a float. A movie without any rating gets C,
        the value the formula approaches as v goes to 0, instead of NaN.
    """
    # Select this movie's ratings once and reuse them for both v and R
    movie_ratings = dr.loc[dr['movieId'] == x, 'rating']
    v = len(movie_ratings)
    if v == 0:
        # R is undefined here (mean of nothing); also avoids 0/0 when m is 0
        return C

    R = movie_ratings.mean()
    return (v / (m + v)) * R + (m / (m + v)) * C

In [None]:
def CB_suggest(genre_list):
    """Content-based suggestion: movies that carry every genre in `genre_list`.

    Parameters:
        genre_list: list of genre column names of `dm` (e.g. ['Comedy', 'Drama']).

    Returns:
        DataFrame with columns movieId, title, the requested genres, 'rating'
        (weighted rating) and 'count' (number of ratings), restricted to movies
        with more than MIN_REVIEW ratings and sorted by 'rating', best first.
    """
    genre_list = list(genre_list)

    # A movie qualifies when all requested genre flags are 1
    has_all_genres = dm[genre_list].sum(axis=1) == len(genre_list)
    # Explicit copy so that the column assignments below never target a slice of dm
    dmr = dm.loc[has_all_genres, ['movieId', 'title'] + genre_list].copy()

    # Count ratings for all movies in one pass instead of filtering dr once per movie
    rating_counts = dr['movieId'].value_counts()
    dmr['count'] = dmr['movieId'].map(rating_counts).fillna(0).astype(int)

    # Apply the threshold first so weighted ratings are only computed for rows that are returned
    dmr = dmr[dmr['count'] > MIN_REVIEW].copy()
    dmr['rating'] = dmr['movieId'].apply(weighted_rating)

    # Keep the established column order: ..., rating, count
    suggestion = dmr[['movieId', 'title'] + genre_list + ['rating', 'count']]
    return suggestion.sort_values('rating', ascending=False)

In [None]:
# Genre names are typed on one line, separated by whitespace
genre_list = input().split()

suggestion = CB_suggest(genre_list)

suggestion.head()

Comedy Drama Romance
['Comedy', 'Drama', 'Romance']


Unnamed: 0,movieId,title,Comedy,Drama,Romance,rating,count
314,356,Forrest Gump (1994),1,1,1,4.148444,329
680,898,"Philadelphia Story, The (1940)",1,1,1,4.135825,29
935,1235,Harold and Maude (1971),1,1,1,4.103691,26
1730,2324,Life Is Beautiful (La Vita è bella) (1997),1,1,1,4.094016,88
946,1247,"Graduate, The (1967)",1,1,1,4.011787,79


# Collaborative Filtering

In [None]:
# Build the user x movie rating matrix: one row per userId, one column per movieId,
# 0 where the user has not rated the movie.
# A single pivot replaces one .loc write per rating and yields a float matrix
# instead of an object-dtype one. Rows/columns keep the order of first appearance
# in dr; if a (user, movie) pair occurs more than once the last rating wins.
rating_matrix = (
    dr.drop_duplicates(subset=['userId', 'movieId'], keep='last')
      .pivot(index='userId', columns='movieId', values='rating')
      .reindex(index=dr['userId'].unique(), columns=dr['movieId'].unique())
      .fillna(0)
)

In [None]:
def CF_suggest(USER_ID):
  """Collaborative-filtering suggestion based on the single most similar user.

  The user whose row of `rating_matrix` has the highest cosine similarity to
  the row of USER_ID is selected; the movies that this neighbour rated above
  their own average and that USER_ID has not rated (0 in the matrix) are returned.

  Parameters:
      USER_ID: row label (userId) in `rating_matrix`.

  Returns:
      Rows of `dm` for the recommended movies plus a 'rating' column holding the
      neighbour's rating, sorted by that rating, best first.

  Raises:
      KeyError: if USER_ID has no row in `rating_matrix`.
  """
  if USER_ID not in rating_matrix.index:
    raise KeyError(f"userId {USER_ID} has no row in rating_matrix")

  # The user's row is looked up by label everywhere (never by position USER_ID - 1)
  ratings = rating_matrix.to_numpy(dtype=float)
  target = rating_matrix.loc[USER_ID].to_numpy(dtype=float)

  # Cosine similarity of the target row against all rows at once;
  # rows without any rating (zero norm) get similarity 0
  dots = ratings @ target
  norms = np.linalg.norm(ratings, axis=1) * np.linalg.norm(target)
  sim = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

  # Drop the user themself explicitly instead of assuming they sort first
  df_sim = pd.Series(sim, index=rating_matrix.index).drop(USER_ID)
  top_sim_user = df_sim.idxmax()

  neighbour = rating_matrix.loc[top_sim_user].astype(float)
  own = rating_matrix.loc[USER_ID].astype(float)

  # Neighbour's average over the movies they actually rated
  top_sim_user_avg = neighbour[neighbour > 0].mean()
  recommend = (neighbour > top_sim_user_avg) & (own == 0)
  neighbour_ratings = neighbour[recommend]

  result = dm[dm['movieId'].isin(neighbour_ratings.index)].copy()
  # Attach ratings by movieId, not by row position, so every movie gets its own rating
  result['rating'] = result['movieId'].map(neighbour_ratings)

  return result.sort_values('rating', ascending=False)

In [None]:
# Read the target user id from stdin; USER_ID is also reused by the model cells further down
USER_ID = int(input())
cf_suggestion = CF_suggest(USER_ID)

# Show the first rows of the collaborative-filtering suggestion
cf_suggestion.head()

19


Unnamed: 0,movieId,title,(no genres listed),Mystery,Drama,Comedy,Musical,Romance,Fantasy,Documentary,...,Children,Crime,War,IMAX,Action,Adventure,Film-Noir,Thriller,Sci-Fi,rating
2410,3200,"Last Detail, The (1973)",0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0
520,608,Fargo (1996),0,0,1,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,5.0
1232,1641,"Full Monty, The (1997)",0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0
2543,3404,Titanic (1953),0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,5.0
742,969,"African Queen, The (1951)",0,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,0,4.0


# Postava za korišćenje modela

In [None]:
# Rating scale is taken from the data itself instead of relying on Reader's default scale
reader = Reader(rating_scale=(dr['rating'].min(), dr['rating'].max()))
data = Dataset.load_from_df(dr[['userId', 'movieId', 'rating']], reader)

# Split the data into a training part and a test part (25 %).
# The seed is fixed so that the split, and every result derived from it, is reproducible.
train_set, test_set = train_test_split(data, test_size=.25, random_state=42)

In [None]:
def MODEL_suggest(model, USER_ID, top_n=10):
  """Return the `top_n` movies with the highest predicted rating for one user.

  Parameters:
      model: fitted model exposing predict(user_id, movie_id), whose result
          has an `est` attribute (the estimated rating).
      USER_ID: id of the user to predict for.
      top_n: number of movies to return (default 10).

  Returns:
      Copy of the rows of `dm` for the selected movies, in the row order of `dm`.
  """
  # Score every movie of the catalogue by its real movieId, for the requested user
  scored = [(model.predict(USER_ID, movie_id).est, movie_id) for movie_id in dm['movieId']]

  # Highest estimates first; the sort is stable, so ties keep catalogue order
  scored.sort(key=lambda pair: pair[0], reverse=True)
  top_ids = [movie_id for _, movie_id in scored[:top_n]]

  new_dm = dm[dm['movieId'].isin(top_ids)].copy()

  return new_dm

In [None]:
def MODEL_suggest_wo_table(model, test_set):
  """Run `model.test` on `test_set` and return its predictions directly, without building a movie table."""
  return model.test(test_set)

# SVD model

In [None]:
from surprise import SVD

In [None]:
model_SVD = SVD() #the method is written so that any model can be plugged in

In [None]:
%%time

# Train the model on the training split
model_SVD.fit(train_set)

CPU times: user 3.63 s, sys: 7.28 ms, total: 3.64 s
Wall time: 3.66 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff508274210>

In [None]:
# SVD-based suggestion for the user id entered earlier
test = MODEL_suggest(model_SVD, USER_ID)

In [None]:
test

Unnamed: 0,movieId,title,(no genres listed),Mystery,Drama,Comedy,Musical,Romance,Fantasy,Documentary,...,Animation,Children,Crime,War,IMAX,Action,Adventure,Film-Noir,Thriller,Sci-Fi
685,903,Vertigo (1958),0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
704,922,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
898,1196,Star Wars: Episode V - The Empire Strikes Back...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1
899,1197,"Princess Bride, The (1987)",0,0,0,1,0,1,1,0,...,0,0,0,0,0,1,1,0,0,0
900,1198,Raiders of the Lost Ark (Indiana Jones and the...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
901,1199,Brazil (1985),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
906,1204,Lawrence of Arabia (1962),0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
1939,2571,"Matrix, The (1999)",0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
2195,2918,Ferris Bueller's Day Off (1986),0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2462,3275,"Boondock Saints, The (2000)",0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0


# Normal Predictor model

In [None]:
from surprise import NormalPredictor

In [None]:
# NormalPredictor with its default settings
model_NP = NormalPredictor()

In [None]:
%%time

# Train the model on the training split
model_NP.fit(train_set)

CPU times: user 79.2 ms, sys: 828 µs, total: 80.1 ms
Wall time: 85.6 ms


<surprise.prediction_algorithms.random_pred.NormalPredictor at 0x7ff508dc6350>

In [None]:
# NormalPredictor-based suggestion for the user id entered earlier
test_NP = MODEL_suggest(model_NP, USER_ID)

In [None]:
# Show the Normal Predictor suggestion (stored in test_NP; `test` holds the SVD result)
test_NP

Unnamed: 0,movieId,title,(no genres listed),Mystery,Drama,Comedy,Musical,Romance,Fantasy,Documentary,...,Animation,Children,Crime,War,IMAX,Action,Adventure,Film-Noir,Thriller,Sci-Fi
602,750,Dr. Strangelove or: How I Learned to Stop Worr...,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
685,903,Vertigo (1958),0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
686,904,Rear Window (1954),0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
899,1197,"Princess Bride, The (1987)",0,0,0,1,0,1,1,0,...,0,0,0,0,0,1,1,0,0,0
943,1244,Manhattan (1979),0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
990,1291,Indiana Jones and the Last Crusade (1989),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1057,1374,Star Trek II: The Wrath of Khan (1982),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,1
2226,2959,Fight Club (1999),0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
4137,5952,"Lord of the Rings: The Two Towers, The (2002)",0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4360,6377,Finding Nemo (2003),0,0,0,1,0,0,0,0,...,1,1,0,0,0,0,1,0,0,0


# kNN model

In [None]:
from surprise import KNNBasic

In [None]:
# KNNBasic with its default settings
model_KNN = KNNBasic()

In [None]:
# Train on the training split; the recorded output shows that fitting computes an msd similarity matrix
model_KNN.fit(train_set)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7ff5070679d0>

In [None]:
# kNN-based suggestion for the user id entered earlier
test_KNN = MODEL_suggest(model_KNN, USER_ID)

In [None]:
test_KNN

Unnamed: 0,movieId,title,(no genres listed),Mystery,Drama,Comedy,Musical,Romance,Fantasy,Documentary,...,Animation,Children,Crime,War,IMAX,Action,Adventure,Film-Noir,Thriller,Sci-Fi
531,626,"Thin Line Between Love and Hate, A (1996)",0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536,633,Denise Calls Up (1995),0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
666,876,Supercop 2 (Project S) (Chao ji ji hua) (1993),0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
865,1140,Entertaining Angels: The Dorothy Day Story (1996),0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
870,1151,Lesson Faust (1994),0,0,1,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1746,2342,Hard Core Logo (1996),0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2597,3473,Jonah Who Will Be 25 in the Year 2000 (Jonas q...,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2611,3496,Madame Sousatzka (1988),0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3031,4055,Panic (2000),0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3908,5490,The Big Bus (1976),0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


# Testiranje vremena

In [None]:
genres_flat_list 

['(no genres listed)',
 'Mystery',
 'Drama',
 'Comedy',
 'Musical',
 'Romance',
 'Fantasy',
 'Documentary',
 'Western',
 'Horror',
 'Animation',
 'Children',
 'Crime',
 'War',
 'IMAX',
 'Action',
 'Adventure',
 'Film-Noir',
 'Thriller',
 'Sci-Fi']

In [None]:
from itertools import combinations

In [None]:
# Small demonstration of itertools.combinations: every 2-element subset, printed as a list
demo_items = ['aa', 'bb', 'cc', 'dd']
for combo in combinations(demo_items, r=2):
    print(list(combo))


['aa', 'bb']
['aa', 'cc']
['aa', 'dd']
['bb', 'cc']
['bb', 'dd']
['cc', 'dd']


In [None]:
len(genres_flat_list)

20

In [None]:
# Genre list without the '(no genres listed)' placeholder. The placeholder is removed
# by name: slicing off the first element only works while it happens to be first.
# NOTE(review): dropping the placeholder is presumed to be the intent of the original [1:] — confirm.
new_genres_flat_list = [genre for genre in genres_flat_list if genre != '(no genres listed)']

In [None]:
new_genres_flat_list[0]

'Mystery'

In [None]:
%%time
# n_CB = 0
# for i in range(len(new_genres_flat_list)):
#   for combo in combinations(new_genres_flat_list, i):
#     CB_suggest(list(combo))
#     n_CB += 1
#     print(n_CB)


# print("FINISHED", n_CB)

In [None]:
%%time
CB_suggest(['Drama', 'Romance', 'Comedy'])


CPU times: user 592 ms, sys: 3.7 ms, total: 596 ms
Wall time: 598 ms


Unnamed: 0,movieId,title,Drama,Romance,Comedy,rating,count
314,356,Forrest Gump (1994),1,1,1,4.148444,329
680,898,"Philadelphia Story, The (1940)",1,1,1,4.135825,29
935,1235,Harold and Maude (1971),1,1,1,4.103691,26
1730,2324,Life Is Beautiful (La Vita è bella) (1997),1,1,1,4.094016,88
946,1247,"Graduate, The (1967)",1,1,1,4.011787,79
...,...,...,...,...,...,...,...
475,542,Son in Law (1993),1,1,1,3.046614,14
330,372,Reality Bites (1994),1,1,1,2.966397,21
2859,3825,Coyote Ugly (2000),1,1,1,2.906095,29
1771,2369,Desperately Seeking Susan (1985),1,1,1,2.875797,24


In [None]:
NUM_OF_USERS

In [None]:
%%time
# Run CF_suggest once for every user that has a row in rating_matrix.
# Iterating over the real user ids covers the last user as well and does not
# assume that the ids are exactly 1..NUM_OF_USERS.
n = 0
for user in rating_matrix.index:
  CF_suggest(user)
  n += 1
  if n % 50 == 0:  # periodic progress instead of one output line per user
    print(n)

print("users processed:", n)


In [None]:
# Average seconds per CF_suggest call.
# NOTE(review): 14 minutes is presumably the hand-copied wall time of the loop over all users
# (that cell's timing output is not recorded here) — confirm before relying on the figure.
AVG_CF_TIME = (14 * 60) / NUM_OF_USERS
print("Average time for CF method:", AVG_CF_TIME, "s")

Average time for CF method: 1.3770491803278688 s


In [None]:
%%time 

# 100 full prediction passes over the test set, so the cell's total time can be averaged per pass
for i in range(100):
  prediction_SVD = MODEL_suggest_wo_table(model_SVD, test_set)

CPU times: user 25.8 s, sys: 686 ms, total: 26.5 s
Wall time: 30 s


In [None]:
# Average seconds per prediction pass: 30 s is the recorded wall time of the
# SVD timing cell, copied by hand; 100 is its number of passes
AVG_SVD_TIME = 30 / 100
print("Average time for SVD model:", AVG_SVD_TIME, "s")

Average time for SVD model: 0.3 s


In [None]:
%%time 
# 100 full prediction passes over the test set, so the cell's total time can be averaged per pass
for i in range(100):
  prediction_NP = MODEL_suggest_wo_table(model_NP, test_set)

CPU times: user 23.4 s, sys: 197 ms, total: 23.6 s
Wall time: 25.1 s


In [None]:
# Average seconds per prediction pass: 25 s is the recorded wall time of the
# Normal Predictor timing cell, copied by hand; 100 is its number of passes
AVG_NP_TIME = 25 / 100
# The label names the model that was actually timed
print("Average time for Normal Predictor model:", AVG_NP_TIME, "s")

Average time for SVD model: 0.25 s


In [None]:
%%time 
# 100 full prediction passes over the test set, so the cell's total time can be averaged per pass
for i in range(100):
  prediction_KNN = MODEL_suggest_wo_table(model_KNN, test_set)

CPU times: user 2min 50s, sys: 374 ms, total: 2min 51s
Wall time: 2min 57s


In [None]:
# Average seconds per prediction pass: 3 minutes is the recorded wall time of the
# kNN timing cell (2min 57s) rounded up by hand; 100 is its number of passes
AVG_KNN_TIME = (3 * 60) / 100
# The label names the model that was actually timed
print("Average time for kNN model:", AVG_KNN_TIME, "s")

Average time for SVD model: 1.8 s
