[Reference](https://ai.plainenglish.io/tmdb-streamlit-build-your-own-movie-recommendation-system-f2ffbca63d11)

In [1]:
# import deps
import pandas as pd

# load datasets
df_cred = pd.read_csv("tmdb_5000_credits.csv")
df_mov = pd.read_csv("tmdb_5000_movies.csv")

# See the size of data sets
df_cred.shape, df_mov.shape

# Count the rows whose ids disagree between the two files (0 means the files
# line up row by row). `.any().sum()` could only ever report 0 or 1.
(df_cred.movie_id != df_mov.id).sum()

# rename column name so both frames share the join key 'id'
df_cred.rename(columns={'movie_id': 'id'}, inplace=True)

# merge the two dataframes & store in df
df = df_cred.merge(df_mov, on='id')

# Lets take a look at the data type of the columns
df.info()

# drop null overviews
df = df.dropna(subset=['overview'])

# filter out target columns.
# reset_index gives a fresh frame (no SettingWithCopyWarning on later edits)
# and a gap-free 0..n-1 index, so row labels equal row positions; code that
# later pairs rows with a positional similarity matrix depends on that.
df = df[['id', 'title_x', 'genres', 'overview', 'cast', 'crew']].reset_index(drop=True)

# check new df info
df.info()

# Genres
df.genres[0]

# The genres/cast/crew cells are text that the code parses into a list of
# dicts. ast.literal_eval accepts only literals, so unlike eval() it cannot
# execute code embedded in the CSV.
from ast import literal_eval

' '.join([i['name'] for i in literal_eval(df.genres[0])])

# taking top 3 cast (the merged frame is `df`; `df_feat` was never defined)
' '.join([i['name'] for i in literal_eval(df.cast[0])[:3]])

# taking crew (director & producer); dict.fromkeys de-duplicates names while
# keeping first-seen order, whereas a set's order can change between runs
' '.join(dict.fromkeys(i['name'] for i in literal_eval(df.crew[0]) if i['job'] in ('Director', 'Producer')))

# function to generate corpus
def generate_corpus(overview, genre, cast, crew):
    """Build the text corpus for one movie.

    Args:
        overview: Plot summary text.
        genre: String holding a literal list of dicts, each with a 'name' key.
        cast: String holding a literal list of dicts, each with a 'name' key;
            only the first three entries are used.
        crew: String holding a literal list of dicts with 'name' and 'job'
            keys; only directors and producers are used.

    Returns:
        ``overview``, genre names, top-3 cast names and director/producer
        names joined by single spaces, in that order.

    Raises:
        ValueError, SyntaxError: if a list argument is not a plain literal.
    """
    # literal_eval parses the same literal syntax as eval() but refuses
    # anything executable, so text read from a data file cannot run code.
    from ast import literal_eval

    genre_names = [entry['name'] for entry in literal_eval(genre)]
    cast_names = [entry['name'] for entry in literal_eval(cast)[:3]]

    # dict.fromkeys de-duplicates (one person can be both director and
    # producer) while keeping first-seen order; a set's iteration order for
    # strings can change between interpreter runs, which made the corpus
    # non-reproducible.
    crew_names = dict.fromkeys(
        entry['name']
        for entry in literal_eval(crew)
        if entry['job'] in ('Director', 'Producer')
    )

    return ' '.join([
        overview,
        ' '.join(genre_names),
        ' '.join(cast_names),
        ' '.join(crew_names),
    ])


# Build one corpus string per movie, in the same row order as df.
# Iterating the four columns in lockstep avoids materialising a row Series
# four times per movie via df.iloc[i].
corpus = [
    generate_corpus(overview, genres, cast, crew)
    for overview, genres, cast, crew in zip(
        df['overview'], df['genres'], df['cast'], df['crew']
    )
]

len(corpus)

print(corpus[0])

# rename the title column and drop the raw columns that are now folded into
# the corpus; the non-inplace calls return a fresh frame, so the column
# assignment below does not raise SettingWithCopyWarning
df = df.rename(columns={'title_x': 'title'}).drop(
    columns=['genres', 'overview', 'cast', 'crew']
)

# add corpus
df['corpus'] = corpus

# numpy is needed for the dot products and norms below
import numpy as np

# Define three vectors
A = [1, 2]
B = [2, 3]
C = [3, 1]

# Calculate dot products
ab = np.dot(A, B)
bc = np.dot(B, C)
ca = np.dot(C, A)

# calculate the length (Euclidean norm) of each vector
a = np.linalg.norm(A)
b = np.linalg.norm(B)
c = np.linalg.norm(C)

# calculate cosine similarity for each pair: dot(u, v) / (|u| * |v|)
sim_ab = ab / (a * b)
sim_bc = bc / (b * c)
sim_ca = ca / (c * a)

# lets see the similarities
sim_ab, sim_bc, sim_ca

# import class
from sklearn.metrics.pairwise import cosine_similarity

# compute the pairwise cosine similarity of the three demo vectors
data = [A, B, C]  # `data` was referenced below without ever being defined
cosine_similarity(data)

# same matrix, labelled so each cell reads as "row vector vs column vector"
pd.DataFrame(cosine_similarity(data),
             columns=['A', 'B', 'C'],
             index=['A', 'B', 'C'])

# import deps
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the Object and remove stopwords
tfidf = TfidfVectorizer(stop_words='english')
# one row per movie, one column per vocabulary term
tfidf_matrix = tfidf.fit_transform(df['corpus'])

# compare shapes
df.shape

tfidf_matrix.shape

# import deps
from sklearn.metrics.pairwise import linear_kernel

# compute the similarity matrix (movies x movies).
# NOTE(review): a plain dot product equals cosine similarity only when the
# TF-IDF rows are unit length; this relies on TfidfVectorizer's default
# L2 normalisation - confirm if the vectorizer settings change.
cos_mat = linear_kernel(tfidf_matrix, tfidf_matrix)

cos_mat.shape

# Sum of the diagonal (each movie's similarity with itself), computed with
# the array's own trace() instead of a manual index loop.
diag = cos_mat.trace()

print(diag)

def get_recommendations(movie, n=5):
    """Return the titles of the ``n`` movies most similar to ``movie``.

    Args:
        movie: Exact title to look up in ``df['title']``. If several rows
            share the title, the first one is used.
        n: Maximum number of titles to return. Defaults to 5 so the function
            can also be called with just a title.

    Returns:
        List of titles ordered from most to least similar, excluding the
        looked-up movie itself.

    Raises:
        IndexError: if ``movie`` is not present in ``df['title']``.
    """
    # cos_mat is addressed by row POSITION. The index label only equals the
    # position while df has a gap-free 0..n-1 index, which is not the case
    # after rows have been dropped, so look the position up explicitly.
    positions = (df['title'] == movie).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"movie title not found: {movie!r}")
    index = int(positions[0])

    # rank every movie by similarity to the selected one, best first
    ranked = sorted(enumerate(cos_mat[index]), key=lambda pair: pair[1], reverse=True)

    # Exclude the movie itself by position rather than assuming it sorts
    # first: a duplicate entry can tie with it at the top of the ranking.
    best = [pos for pos, _ in ranked if pos != index][:max(n, 0)]

    titles = df['title']
    return [titles.iloc[pos] for pos in best]


# lets test the function
# Ask for the 3 most similar titles for two sample movies. Each call is a bare
# expression, so its result is only shown when run as the last statement of a
# notebook cell.
# NOTE(review): assumes both titles exist in df['title'] - confirm against the dataset.
get_recommendations("The Dark Knight", 3)

get_recommendations("Mission: Impossible", 3)

def get_keywords_recommendations(keywords, n=5):
    """Return the titles of the ``n`` movies that best match free-text keywords.

    Args:
        keywords: Free text (cast, crew, genre, plot words...). Runs of
            whitespace are collapsed before vectorising.
        n: Maximum number of titles to return. Defaults to 5 so the function
            can also be called with just the keywords.

    Returns:
        List of titles ordered from best to worst match.
    """
    # normalise whitespace
    query = " ".join(keywords.split())

    # transform the string to vector representation (1 x vocabulary)
    key_tfidf = tfidf.transform([query])

    # Compare the query with every movie's TF-IDF row. The comparison must be
    # against tfidf_matrix (movies x vocabulary): cos_mat is movies x movies,
    # a different feature space whose width does not match the query vector.
    scores = cosine_similarity(key_tfidf, tfidf_matrix)[0]

    # rank all movies by score, best first
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    # The query is not itself a row of the matrix, so the top hit is a real
    # recommendation and must be kept (start the slice at 0, not 1).
    titles = df['title']
    return [titles.iloc[pos] for pos, _ in ranked[:max(n, 0)]]

# Let's test it out
get_keywords_recommendations("Christopher Nolan", 4)

import os

import joblib

# The dumps below write into models/; create it first so a fresh checkout
# does not fail with FileNotFoundError.
os.makedirs('models', exist_ok=True)

# persist the movie table, similarity matrix, fitted vectorizer and TF-IDF matrix
joblib.dump(df, 'models/movie_db.df')
joblib.dump(cos_mat, 'models/cos_mat.mt')
joblib.dump(tfidf, 'models/vectorizer.tf')
joblib.dump(tfidf_matrix, 'models/tfidf_mat.tf')


# Streamlit search UI: recommend either from a selected title or from
# free-text keywords.
# NOTE(review): `st`, `search_type` and `fetch_poster` are not defined in the
# visible part of this file - presumably they come from the app module this
# snippet was taken from; confirm.

# How many titles to request per search. Both recommenders are passed an
# explicit count; calling them with a single argument raises TypeError when
# the count parameter has no default.
N_RECOMMENDATIONS = 5

if search_type == 'Movie Title':
    st.subheader("Select Movie 🎬")
    movie_name = st.selectbox('', df.title)
    if st.button('Recommend 🚀'):
        with st.spinner('Wait for it...'):
            movies = get_recommendations(movie_name, N_RECOMMENDATIONS)
            posters = fetch_poster(movies)
else:
    st.subheader('Enter Cast / Crew / Tags / Genre  🌟')
    keyword = st.text_input('', 'Christopher Nolan')
    if st.button('Recommend 🚀'):
        with st.spinner('Wait for it...'):
            movies = get_keywords_recommendations(keyword, N_RECOMMENDATIONS)
            posters = fetch_poster(movies)