In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv("movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


# One Hot Encoding

In [3]:
genres = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
          'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
          'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

genre_df = pd.DataFrame(columns=genres, index=movies.index)
genre_df = genre_df.fillna(0)

# Iterate through each movie and set the binary genre indicators
for i, row in movies.iterrows():
    movie_genres = [g.strip() for g in str(row['genres']).split('|')]
    for genre in genres:
        if genre in movie_genres:
            genre_df.at[i, genre] = 1

# Combine the original movies DataFrame with the new genre DataFrame
movies_with_genres = pd.concat([movies, genre_df], axis=1)
movies_with_genres

Unnamed: 0,movieId,title,genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,1,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,1,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
9739,193585,Flint (2017),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
data = pd.read_csv("ratings.csv")
data.drop(columns = ['timestamp'],inplace = True)

In [5]:
data

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


# User Rating Normalization

In [6]:
map_of_every_user = {}
for x,y in data.groupby('userId'):
        final_ans = np.array([0]*18)
        for index,row in y.iterrows():
            movie_id = row['movieId']
            rating = row['rating']
            movie_info = movies_with_genres.loc[movies_with_genres['movieId'] == movie_id]
            temp = []
            for index1,row1 in movie_info.iterrows():
                temp = [row1['Action'],row1['Adventure'],row1['Animation'],row1['Children'],row1['Comedy']
                        ,row1['Crime'],row1['Documentary'],row1['Drama'],row1['Fantasy'],row1['Film-Noir']
                        ,row1['Horror'],row1['Musical'],row1['Mystery'],row1['Romance'],row1['Sci-Fi'],row1['Thriller']
                        ,row1['War'],row1['Western']]
                break
            temp = np.array(temp)
            temp = rating*temp
            final_ans = final_ans + temp
        normalized = (final_ans - np.min(final_ans)) / (np.max(final_ans) - np.min(final_ans))
        normalized = np.where(normalized > 0.5, 1, 0)
        map_of_every_user[x] = normalized

# Model Training

In [7]:
X = movies_with_genres.drop(columns = ['title','genres','movieId'])
y = movies_with_genres['movieId']

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import pandas as pd

clf = MultinomialNB()
clf.fit(X, y)


## Testing for a dummy variable to verify result
X_test = np.vstack((map_of_every_user[1],map_of_every_user[2]))
y_pred = clf.predict(X_test)
y_pred



array([117646,     78], dtype=int64)

In [9]:
movies_with_genres.loc[movies_with_genres['movieId'] == y_pred[1]]

Unnamed: 0,movieId,title,genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
70,78,"Crossing Guard, The (1995)",Action|Crime|Drama|Thriller,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


# Recommending movie to user

In [10]:
def recommend_movie_to_user(userId,classifier,map_of_every_user,moviesTorecommend = 10):
    X_test = map_of_every_user[userId]
    X_test = X_test.reshape(-1,X_test.shape[0])
    y_pred_proba = classifier.predict_proba(X_test)
    class_labels = classifier.classes_
    zipped = zip(class_labels,y_pred_proba[0])
    sorted_probs = sorted(zipped, key=lambda x: x[1], reverse=True)
    list_recommendation = []
    for x,y in sorted_probs:
        temp = data.loc[data['userId'] == userId]
        if len(temp.loc[temp['movieId']==x]) == 0:
            list_recommendation.append(x)
        if len(list_recommendation) >= moviesTorecommend:
            break
    
    return list_recommendation
    

In [17]:
def fromidtoname(list_of_movie):
    movies_name = []
    for x in list_of_movie:
        movie_info = movies.loc[movies['movieId'] == x]
        movies_name.append(movie_info['title'])
    return movies_name

In [18]:
list_of_movies = recommend_movie_to_user(1,clf,map_of_every_user)



In [19]:
fromidtoname(list_of_movies)

[8597    Dragonheart 2: A New Beginning (2000)
 Name: title, dtype: object,
 19    Money Train (1995)
 Name: title, dtype: object,
 118    Bad Boys (1995)
 Name: title, dtype: object,
 1103    Metro (1997)
 Name: title, dtype: object,
 1828    Mighty Joe Young (1998)
 Name: title, dtype: object,
 3657    Another 48 Hrs. (1990)
 Name: title, dtype: object,
 3989    Wasabi (2001)
 Name: title, dtype: object,
 4005    Flashback (1990)
 Name: title, dtype: object,
 4176    City of God (Cidade de Deus) (2002)
 Name: title, dtype: object,
 4409    Charlie's Angels: Full Throttle (2003)
 Name: title, dtype: object]