In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
movies =pd.read_csv(r'movies.csv', sep=';')
movies_df = pd.DataFrame(movies,columns=['movieId','title','genres'])


ratings =pd.read_csv(r'ratings.csv', sep=';')
ratings_df = pd.DataFrame(ratings)

users =pd.read_csv(r'users.csv', sep=';')
users_df = pd.DataFrame(users)

In [3]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
users_df.head()

Unnamed: 0,userId,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [6]:
#movie_D_F preprossing
x=[]
for index, row in movies_df.iterrows():
    y= row['title'].split(' (')
    y= y[0].split(',')
    x.append(y[0].lower())


movies_df['title']=x
movies_df['genres'] = movies_df['genres'].map(lambda x: x.split('|'))


In [7]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,toy story,"[Animation, Children's, Comedy]"
1,2,jumanji,"[Adventure, Children's, Fantasy]"
2,3,grumpier old men,"[Comedy, Romance]"
3,4,waiting to exhale,"[Comedy, Drama]"
4,5,father of the bride part ii,[Comedy]


In [8]:
movies_df['Bag_of_words'] = ''
column='genres'
geners=[]
for index, row in movies_df.iterrows():
    words = ''
    words += ' '.join(row[column]) + ' '
    geners.append(words)

movies_df['Bag_of_words']=geners
df = movies_df[['title','Bag_of_words']]
df.head()

Unnamed: 0,title,Bag_of_words
0,toy story,Animation Children's Comedy
1,jumanji,Adventure Children's Fantasy
2,grumpier old men,Comedy Romance
3,waiting to exhale,Comedy Drama
4,father of the bride part ii,Comedy


In [9]:
count1 = CountVectorizer()
count_matrix1 = count1.fit_transform(df['Bag_of_words'])

similarity1 = cosine_similarity(count_matrix1, count_matrix1)
print(similarity1)
indices = pd.Series(df['title'])
indices

[[1.         0.33333333 0.40824829 ... 0.         0.         0.        ]
 [0.33333333 1.         0.         ... 0.         0.         0.        ]
 [0.40824829 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         1.         0.70710678]
 [0.         0.         0.         ... 1.         1.         0.70710678]
 [0.         0.         0.         ... 0.70710678 0.70710678 1.        ]]


0                         toy story
1                           jumanji
2                  grumpier old men
3                 waiting to exhale
4       father of the bride part ii
                   ...             
3878               meet the parents
3879            requiem for a dream
3880                      tigerland
3881               two family house
3882                      contender
Name: title, Length: 3883, dtype: object

In [10]:
def recommend_by_movies(title, N):
    
    recommended_movies = []
    idx = indices[indices == title].index[0]
    score_series = pd.Series(similarity1[idx]).sort_values(ascending = False)
    top_N_indices = score_series.iloc[0:].index
        
    for i in top_N_indices:
        title=get_max_rate_movie(df2['userId'][i])
        
        if len(recommended_movies)==N:
            break
        if i==idx or title in recommended_movies:
            continue
            
        recommended_movies.append(df['title'][i])
    return recommended_movies


In [11]:
users_df['Bag_of_words'] = ''
columns = ['gender', 'age', 'occupation']
persons=[]
for index, row in users_df.iterrows():
    words = ''
    if row['gender']=='F':
        row['gender']=1
    else:
        row['gender']=-1

    for col in columns:
        words += (str(row[col])) + ' '
    persons.append(words)
    
users_df['Bag_of_words']=persons
df2 = users_df[['userId','Bag_of_words']]
df2.head()

Unnamed: 0,userId,Bag_of_words
0,1,1 1 10
1,2,-1 56 16
2,3,-1 25 15
3,4,-1 45 7
4,5,-1 25 20


In [12]:
count = CountVectorizer()
count_matrix = count.fit_transform(df2['Bag_of_words'])
similarity = cosine_similarity(count_matrix, count_matrix)
print(similarity)
indices_user_id = pd.Series(df2['userId'])


[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.         ... 0.70710678 0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.70710678]
 ...
 [0.         0.70710678 0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.70710678 ... 0.         0.         1.        ]]


In [13]:
def get_max_rate_movie(user_id):
    recommended_movie=''
    movie_id = ratings_df.loc[(ratings_df['userId']==user_id) & (ratings_df.rating==ratings_df.rating.max()), 'movieId'].iloc[0]
    recommended_movie = movies_df.loc[movies_df['movieId'] == movie_id, 'title'].iloc[0]
    return recommended_movie


In [14]:
def get_max_rate_users(movie_id):
    users = ratings_df.loc[(ratings_df['movieId']==movie_id) & (ratings_df.rating==ratings_df.rating.max()), 'userId'].iloc[0:100]
    recommended_user=[]
    for user_id in users:
        recommended_user.append(users_df.loc[users_df['userId'] == user_id, 'userId'].iloc[0])
    return recommended_user


In [15]:
def recommend_by_users(userId,N):
    
    recommended_movies= []
    idx = indices_user_id[indices_user_id == userId].index[0]
    score_series = pd.Series(similarity[idx]).sort_values(ascending = False)
    top_N_indices = score_series.iloc[0:].index

    for i in top_N_indices:
        title=get_max_rate_movie(df2['userId'][i])
        
        if len(recommended_movies)==N:
            break
        if i==idx or title in recommended_movies:
            continue    
        recommended_movies.append(title)            
    return recommended_movies


In [16]:
def recommend_by_movie2(movie,N):
    movie_id=movies_df.loc[(movies_df['title']==movie) , 'movieId'].iloc[0]
    users=get_max_rate_users(movie_id)
    recommended_movies=[]
    for user_id in users:
        title = get_max_rate_movie(user_id)
        if len(recommended_movies)==N:
            break
        if title in recommended_movies:
            continue 
        recommended_movies.append(title)
    return recommended_movies


In [17]:
#testing:
recommend_by_movies('adventures of rocky and bullwinkle',5)

['toy story', 'saludos amigos', 'rugrats movie', "bug's life", 'toy story 2']

In [18]:
recommend_by_movie2('jumanji',5)

["midsummer night's dream",
 'who framed roger rabbit?',
 'jumanji',
 'x-men',
 'bridge on the river kwai']

In [19]:
recommend_by_users(3,5)

['terminator 2',
 'maltese falcon',
 'who framed roger rabbit?',
 'nikita',
 'being john malkovich']