In [8]:
# -*- coding: utf-8 -*-

# Cleaning belongs_to link csv

# import ast

# # taking dictionary out of string type
# belongs_to_link['collection_id'] = belongs_to_link['collection_id'].apply(lambda x: ast.literal_eval(x)['id'])
    
# belongs_to_link.to_csv('final_belongs_to_clean2.csv')

# Static Properties (do not change with input): popularity, ROI, weighted rating

# Dynamic Properties (change according to input): belongs_to, genres, original_language, production_companies,
# runtime (within 15 min radius: +1 point), release year (within 3 year radius: +1 point), cast, director

# Loading in dataframes and storing as variables

import pandas as pd
import ast
from py2neo import Graph

# Movie metadata, movie-to-collection links and user ratings, read from CSV files
# in the current working directory.
metadata = pd.read_csv('final_clean_metadata.csv')
belongs_to = pd.read_csv('final_belongs_to_clean2.csv')
ratings = pd.read_csv('ratings.csv')
# Disable pandas' chained-assignment (SettingWithCopy) warning globally.
pd.options.mode.chained_assignment = None

# different approaches in 2 functions because one is comparing movies and one is comparing users so it doesn't
# really make sense to combine 
# used small_ratings.csv so some movies are not included e.g. toystory 

# scores are between 0 and 1, with 1 meaning identical rating patterns
# returns a pandas DataFrame (one column per matching movie id), not a Series

from sklearn.metrics.pairwise import cosine_similarity

def get_movie_rating_recommendations(title, df):
    """Rating-based (collaborative) similarity between ``title`` and other movies.

    Builds a user x movie rating matrix from the module-level ``ratings`` frame,
    keeps only movies that have at least 20 ratings, and computes the cosine
    similarity between movie columns (missing ratings are treated as 0).

    Parameters
    ----------
    title : str
        Movie title to look up in ``df['title']``.
    df : pandas.DataFrame
        Movie catalogue with at least ``'title'`` and ``'id'`` columns. Only
        ratings whose ``movieId`` appears in ``df['id']`` are used.

    Returns
    -------
    pandas.DataFrame
        Indexed by movie id, with one column per id in ``df`` whose title equals
        ``title``. The frame has no columns when the title is not in ``df``.

    Raises
    ------
    KeyError
        If a matching movie id is not a column of the similarity matrix (for
        example because it has fewer than 20 ratings).
    """
    # Every id carrying this title (normally exactly one); a list keeps the
    # return value a DataFrame, which callers index by movie id.
    movie_ids = df.loc[df['title'] == title, 'id'].tolist()

    # Use the catalogue that was passed in rather than a module-level frame,
    # so the function honours its `df` argument.
    catalogue_ratings = ratings[ratings['movieId'].isin(df['id'].tolist())]
    user_movie = catalogue_ratings.pivot_table(index=['userId'], columns=['movieId'], values='rating')
    # Drop movies with fewer than 20 ratings, then treat "not rated" as 0.
    user_movie = user_movie.dropna(thresh=20, axis=1).fillna(0)

    # Transpose so that each row handed to cosine_similarity is one movie.
    cosine_sim = cosine_similarity(user_movie.T)

    similarity_df = pd.DataFrame(cosine_sim, index=user_movie.columns, columns=user_movie.columns)

    return similarity_df[movie_ids]

# Example call; the result is not displayed because this is not the last expression of the cell.
get_movie_rating_recommendations('Toy Story', metadata)


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_recommendations(title, df):
    """Content-based similarity between ``title`` and every row of ``df``.

    Vectorises ``df['text']`` into unigram/bigram counts (English stop words
    removed) and scores every row against the requested movie with cosine
    similarity.

    Parameters
    ----------
    title : str
        Movie title to look up in ``df['title']``. When several rows share the
        title, the first one is used.
    df : pandas.DataFrame
        Movie catalogue with ``'title'`` and ``'text'`` columns.

    Returns
    -------
    list of (int, float)
        ``(row_position, score)`` pairs for every row of ``df``, sorted by
        score in descending order. Positions are 0-based row positions, not
        index labels.

    Raises
    ------
    KeyError
        If ``title`` does not occur in ``df['title']``.
    """
    # min_df=1 keeps every term that occurs at all, which is what min_df=0
    # meant; recent scikit-learn releases reject an integer min_df below 1.
    count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
    count_matrix = count.fit_transform(df['text'])

    # Row positions of the requested title; the matrix rows follow row order.
    positions = [pos for pos, candidate in enumerate(df['title']) if candidate == title]
    if not positions:
        raise KeyError(title)
    idx = positions[0]

    # Score only the requested row against the catalogue instead of building
    # the full n x n matrix; the slice keeps the operand two-dimensional.
    scores = cosine_similarity(count_matrix[idx:idx + 1], count_matrix)[0]

    sim_scores = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    return sim_scores

# Example call; the result is not displayed because this is not the last expression of the cell.
get_recommendations('Toy Story', metadata)

from sklearn import preprocessing

def normalize_static_values(df_column):
    """Min-max scale one column with scikit-learn's ``MinMaxScaler`` defaults.

    Parameters
    ----------
    df_column : pandas.Series
        Numeric column to rescale.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column label ``0``) holding the scaled values.
        It carries the same index as ``df_column`` so that assigning it back
        to the source frame aligns row by row, whatever that index is.
    """
    # MinMaxScaler expects a 2-D array of shape (n_samples, n_features).
    values = df_column.values.reshape(-1, 1)
    scaled = preprocessing.MinMaxScaler().fit_transform(values)

    # Keep the caller's index: a fresh 0..n-1 index would misalign (and yield
    # NaN) when the result is assigned into a frame with any other index.
    return pd.DataFrame(scaled, index=df_column.index)

def movietitle_to_movieid(movie_title, df):
    """Return the ``'id'`` of the first row of ``df`` titled ``movie_title``.

    Raises ``IndexError`` when no row carries that title.
    """
    title_matches = df['title'].eq(movie_title)
    # Label of the first matching row; indexing an empty result raises IndexError.
    first_label = df.index[title_matches][0]
    return df.loc[first_label, 'id']



# combined movie recommendation engine

def _shared_item_points(column, reference):
    """Per row, count the items of the row's stringified list that occur in ``reference``."""
    return column.map(
        lambda raw: sum(1 for item in ast.literal_eval(raw) if item in reference)
    )


def recommend_hybrid(movie_title, df1, belongs_to_df, top_n=10, runtime_radius=15, year_radius=3):
    """Recommend movies similar to ``movie_title`` by combining four signals.

    The final score of every movie is the sum of:

    * the min-max normalised total of its static values (normalised
      popularity + ROI + weighted rating) and its dynamic points, where the
      dynamic points are +5 for belonging to the same collection, +1 per
      shared genre / production company / cast member, +3 for the same
      director, +1 for a runtime within ``runtime_radius`` minutes and +1 for
      a release year within ``year_radius`` years;
    * the content similarity from ``get_recommendations``;
    * the rating similarity from ``get_movie_rating_recommendations``
      (0 for movies that are absent from that similarity matrix).

    The requested movie gets -10 for both similarity signals and no dynamic
    points, which pushes it to the bottom of the ranking.

    Parameters
    ----------
    movie_title : str
        Title of the movie to base the recommendations on.
    df1 : pandas.DataFrame
        Movie catalogue. Columns read here: ``title``, ``id``, ``popularity``,
        ``ROI``, ``weighted_rating``, ``belongs_to_collection``, ``genres``,
        ``production_companies``, ``cast`` (the last three as stringified
        Python lists), ``director``, ``runtime`` and ``release_year``.
        It is not modified.
    belongs_to_df : pandas.DataFrame
        Collection links with ``id`` and ``collection_id`` columns.
    top_n : int, optional
        Number of titles to return (default 10).
    runtime_radius : int, optional
        Inclusive runtime window in minutes (default 15).
    year_radius : int, optional
        Inclusive release-year window in years (default 3).

    Returns
    -------
    pandas.Series or None
        Up to ``top_n`` titles, best match first, labelled with the index of
        ``df1``. ``None`` (after printing a message) when ``movie_title`` is
        not in ``df1['title']``.

    Notes
    -----
    Exceptions raised by the two similarity helpers, by ``ast.literal_eval``
    on malformed list columns and by ``int()`` on non-numeric runtimes or
    release years are propagated unchanged.
    """
    if not (df1['title'] == movie_title).any():
        print('Invalid movie title, try again!')
        return None

    # converting supplied movie_title to movie_id
    movie_id = movietitle_to_movieid(movie_title, df1)

    # Work on a copy with a 0..n-1 index so that row labels and row positions
    # coincide (get_recommendations reports positions); the caller's labels
    # are restored on the result.
    original_index = df1.index
    df = df1.reset_index(drop=True)
    query_pos = df.index[df['id'] == movie_id][0]

    # --- content-based similarity (text of the movie) ----------------------
    content_points = pd.Series(dict(get_recommendations(movie_title, df)), dtype=float)
    content_points = content_points.reindex(df.index, fill_value=0.0)
    content_points.iloc[query_pos] = -10

    # --- rating-based similarity (users who rated both movies) -------------
    rating_similarity = get_movie_rating_recommendations(movie_title, df)[movie_id]
    # Movies missing from the similarity matrix contribute 0.
    rating_points = df['id'].map(rating_similarity).fillna(0.0)
    rating_points.iloc[query_pos] = -10

    # --- static values: each normalised on its own, then summed -------------
    static_points = pd.Series(0.0, index=df.index)
    for column in ('popularity', 'ROI', 'weighted_rating'):
        static_points = static_points + normalize_static_values(df[column]).iloc[:, 0].to_numpy()

    # --- dynamic points: properties shared with the requested movie ---------
    target = df.iloc[query_pos]
    target_runtime = int(target['runtime'])
    target_year = int(target['release_year'])

    dynamic_points = pd.Series(0, index=df.index)

    # related movies of the same collection (sequels, etc.) are weighted the most
    if target['belongs_to_collection'] == 1:
        own_collections = belongs_to_df.loc[belongs_to_df['id'] == movie_id, 'collection_id']
        if len(own_collections):
            collection_id = own_collections.iloc[0]
            related_ids = belongs_to_df.loc[belongs_to_df['collection_id'] == collection_id, 'id']
            for related_id in related_ids:
                if related_id == movie_id:
                    continue
                related_rows = df.index[df['id'] == related_id]
                # Skip collection members that are not part of the catalogue.
                if len(related_rows):
                    dynamic_points.iloc[related_rows[0]] += 5

    # 1 point for every genre / production company / cast member that matches
    for column in ('genres', 'production_companies', 'cast'):
        reference = ast.literal_eval(target[column])
        dynamic_points = dynamic_points + _shared_item_points(df[column], reference)

    # 3 points if the director matches
    dynamic_points = dynamic_points + (df['director'] == target['director']).astype(int) * 3

    # 1 point if the runtime lies within the (inclusive) runtime radius
    runtimes = df['runtime'].map(int)
    dynamic_points = dynamic_points + ((runtimes - target_runtime).abs() <= runtime_radius).astype(int)

    # 1 point if the release year lies within the (inclusive) year radius
    release_years = df['release_year'].map(int)
    dynamic_points = dynamic_points + ((release_years - target_year).abs() <= year_radius).astype(int)

    # The requested movie never earns dynamic points.
    dynamic_points.iloc[query_pos] = 0

    # --- combine -----------------------------------------------------------
    total_point_value = pd.Series(
        normalize_static_values(static_points + dynamic_points).iloc[:, 0].to_numpy(),
        index=df.index,
    )
    final_point_value = total_point_value + content_points + rating_points

    top_positions = final_point_value.sort_values(ascending=False).index[:top_n].to_numpy()
    top_titles = df['title'].iloc[top_positions]
    top_titles.index = original_index.take(top_positions)
    return top_titles
#         df.to_csv('testest12321.csv')



# approach which takes in a user and gives the movie recommendations using Neo4j
# Display settings for printed DataFrames:
# let wide frames wrap across several lines instead of being cut off,
pd.set_option('expand_frame_repr', True)
# show at most 100 rows before the output is truncated,
pd.set_option("display.max_rows", 100)
# and allow up to 100 characters per column value.
pd.set_option('max_colwidth',100)

# graph = Graph("http://neo4j:123@localhost:7474/db/data")
# graph.run("load CSV with headers from 'file:///ratingsview.csv' as row merge(m:Movie {movieid:row.movieId, title:row.title}) merge(u:User {userid:row.userId}) merge(u)-[:Rated {rating:row.rating}]-> (m) Return m, u")


# def user_recommendation(userid):

#     knnquery = 'MATCH (u1:User {userid: $userid })-[r:Rated]->(m:Movie) WITH u1, avg(r.rating) AS u1_avg MATCH (u1)-[r1:Rated]->(m:Movie)<-[r2:Rated]-(u2) WITH u1, u1_avg, u2, COLLECT({r1: r1, r2: r2}) AS ratings WHERE size(ratings) > 10 MATCH (u2)-[r:Rated]->(m:Movie) WITH u1, u1_avg, u2, avg(r.rating) AS u2_avg, ratings UNWIND ratings AS r WITH sum( (r.r1.rating-u1_avg) * (r.r2.rating-u2_avg) ) AS nom, sqrt( sum( (r.r1.rating - u1_avg)^2) * sum( (r.r2.rating - u2_avg) ^2)) AS denom, u1, u2 WHERE denom <> 0 WITH u1, u2, nom/denom AS pearson ORDER BY pearson DESC LIMIT 10 MATCH (u2)-[r:Rated]->(m:Movie) WHERE NOT EXISTS( (u1)-[:Rated]->(m) ) RETURN m.title, SUM( pearson * r.rating) AS score ORDER BY score DESC LIMIT 25'
    
#     result = graph.run(knnquery, userid=userid)
    
#     recommendation = []
#     score = []
    
#     for node in result:
#         recommendation.append(node[0])
#         score.append(node[1])
    
#     recommendations = pd.DataFrame(columns = ["Movie", "Score"])
    
#     for i in range(0, len(recommendation)):
#         recommendations = recommendations.append({"Movie":recommendation[i], 
#                                                   "Score":score[i]}, ignore_index=True)
    
#     print(recommendations)
    
# user_recommendation(168)

In [9]:
# Top recommendations for 'Toy Story' using the metadata and collection frames loaded above.
recommend_hybrid('Toy Story', metadata, belongs_to)

1112           Toy Story 2
833           A Bug's Life
2490                  Cars
3059           Toy Story 3
3218                Cars 2
1624        Monsters, Inc.
1479                 Shrek
230              Space Jam
1695               Ice Age
2654    The Simpsons Movie
Name: title, dtype: object