In [3]:
import numpy as np
import pandas as pd
from flask import Flask, render_template, request
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
import bs4 as bs
import urllib.request
import pickle
import requests

In [5]:
# Load the NLP model and the TF-IDF vectorizer from disk.
# `with` guarantees both file handles are closed even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load these artifacts from a trusted source.
with open('nlp_model.pkl', 'rb') as model_file:
    clf = pickle.load(model_file)
# NOTE(review): 'tranform.pkl' looks like a typo for 'transform.pkl'; it is left
# unchanged because it must match the artifact name on disk -- confirm.
with open('tranform.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [6]:
# Build the movie table together with its pairwise similarity matrix.
def create_similarity():
    """Load the movie data and compute a cosine-similarity matrix for it.

    Reads 'main_data.csv' (path relative to the current working directory),
    vectorizes the 'comb' column into token counts with ``CountVectorizer``
    and computes the cosine similarity between every pair of rows.

    Returns:
        tuple: ``(data, similarity)`` where ``data`` is the DataFrame read
        from the CSV and ``similarity`` is the result of
        ``cosine_similarity`` on the count matrix (row/column ``k``
        corresponds to row ``k`` of ``data``).
    """
    movies = pd.read_csv('main_data.csv')
    # Token-count representation of the text in the 'comb' column.
    token_counts = CountVectorizer().fit_transform(movies['comb'])
    return movies, cosine_similarity(token_counts)

In [1]:
# Cache for the (data, similarity) pair so the CSV is read and the similarity
# matrix is computed only once per kernel/process instead of on every request.
_SIMILARITY_CACHE = None


def _get_similarity_data():
    """Return the cached ``(data, similarity)`` pair, building it on first use."""
    global _SIMILARITY_CACHE
    if _SIMILARITY_CACHE is None:
        # If create_similarity() raises, nothing is cached and the next call retries.
        _SIMILARITY_CACHE = create_similarity()
    return _SIMILARITY_CACHE


# function to create recommendation
def find_similar_movies(movie):
    """Recommend the movies most similar to ``movie``.

    The lookup is case-insensitive on the caller's side only: ``movie`` is
    lower-cased and compared for exact equality against the 'movie_title'
    column of the data returned by ``create_similarity()``.

    The previous implementation tried to reuse ``data``/``similarity`` through
    a ``try``/bare ``except`` probe, but because both names were assigned
    inside the function they were locals, the probe always raised
    ``UnboundLocalError`` and the matrix was rebuilt on every call. The data
    is now cached explicitly after the first call.

    Args:
        movie (str): Title of the movie to find recommendations for.

    Returns:
        list | str: Up to 10 movie titles ordered by decreasing similarity
        score (the requested movie itself is never included), or an
        explanatory message string when the title is not in the data.
    """
    movie = movie.lower()
    data, similarity = _get_similarity_data()
    titles = data['movie_title']

    # Positional (not label-based) row numbers of the matching titles, so the
    # lookup stays aligned with the similarity matrix whatever the index is.
    match_positions = (titles == movie).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        return ('Sorry! The movie you requested is not in our database. Please check the spelling or try with some other movies')

    movie_pos = int(match_positions[0])

    # Pair every other movie with its similarity score. The requested movie is
    # excluded by position rather than by assuming it sorts first, which is
    # not guaranteed when another row ties with it at the top score.
    scored = [
        (pos, score)
        for pos, score in enumerate(similarity[movie_pos])
        if pos != movie_pos
    ]
    # Stable sort: rows with equal scores keep their original order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the 10 best matches and map positions back to titles.
    return [titles.iloc[pos] for pos, _ in scored[:10]]

In [2]:
# Convert the string form of a list of strings into a real list
# (e.g. '["abc","def"]' -> ['abc', 'def']).
def convert_to_list(my_list):
    """Parse a string such as '["abc","def"]' into a list of strings.

    Items are separated on the exact delimiter '","' (no whitespace between
    items) and no unescaping is performed. Only the enclosing '["' prefix and
    '"]' suffix are removed; occurrences of those sequences inside the first
    or last item are preserved (the previous ``str.replace`` based version
    removed every occurrence and corrupted such items).

    Args:
        my_list (str): String form of a list of double-quoted strings.

    Returns:
        list: The individual items as strings; an empty list for '[]'.
    """
    # An empty list has no quoted items to split on.
    if my_list.strip() == '[]':
        return []

    items = my_list.split('","')
    # Drop only the leading '["' of the first item ...
    if items[0].startswith('["'):
        items[0] = items[0][2:]
    # ... and only the trailing '"]' of the last item (which is the same
    # element as the first one for a single-item list).
    if items[-1].endswith('"]'):
        items[-1] = items[-1][:-2]
    return items