In [1]:
# Streamlit installation for pretty UI
!pip -q install streamlit 

# Localtunnel installation to test it locally
!npm install -g localtunnel > /dev/null

[K     |████████████████████████████████| 9.7 MB 5.1 MB/s 
[K     |████████████████████████████████| 4.3 MB 59.0 MB/s 
[K     |████████████████████████████████| 111 kB 56.7 MB/s 
[K     |████████████████████████████████| 76 kB 4.3 MB/s 
[K     |████████████████████████████████| 164 kB 68.3 MB/s 
[K     |████████████████████████████████| 180 kB 69.8 MB/s 
[K     |████████████████████████████████| 63 kB 1.3 MB/s 
[K     |████████████████████████████████| 128 kB 70.7 MB/s 
[K     |████████████████████████████████| 792 kB 59.0 MB/s 
[K     |████████████████████████████████| 380 kB 73.3 MB/s 
[?25h  Building wheel for blinker (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyter-console 5.2.0 requires prompt-toolkit<2.0.0,>=1.0.0, but you have prompt-toolkit 3.0.28 which is incompatible.
google-colab 1.0.0 requires i

In [2]:
!pip install fuzzywuzzy IMDbPy # Fuzzywuzzy to make our UI pretty

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Collecting IMDbPy
  Downloading IMDbPY-2021.4.18-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 5.4 MB/s 
Installing collected packages: IMDbPy, fuzzywuzzy
Successfully installed IMDbPy-2021.4.18 fuzzywuzzy-0.18.0


In [3]:
# Build the movie list: a mapping from movie title to IMDb id

import pandas as pd
movies = pd.read_csv('df_initial.csv')

# One entry per distinct (imdbId, title) pair; if a title occurs more than once,
# the last imdbId seen for it wins.
title_id_pairs = movies[['imdbId', 'title']].drop_duplicates()
movie_names = dict(zip(title_id_pairs['title'], title_id_pairs['imdbId']))
movie_names

{'Toy Story (1995)': 'tt0114709',
 'Jumanji (1995)': 'tt0113497',
 'Grumpier Old Men (1995)': 'tt0113228',
 'Waiting to Exhale (1995)': 'tt0114885',
 'Father of the Bride Part II (1995)': 'tt0113041',
 'Heat (1995)': 'tt0113277',
 'Sabrina (1995)': 'tt0114319',
 'Tom and Huck (1995)': 'tt0112302',
 'Sudden Death (1995)': 'tt0114576',
 'GoldenEye (1995)': 'tt0113189',
 'American President, The (1995)': 'tt0112346',
 'Dracula: Dead and Loving It (1995)': 'tt0112896',
 'Balto (1995)': 'tt0112453',
 'Nixon (1995)': 'tt0113987',
 'Cutthroat Island (1995)': 'tt0112760',
 'Casino (1995)': 'tt0112641',
 'Sense and Sensibility (1995)': 'tt0114388',
 'Four Rooms (1995)': 'tt0113101',
 'Ace Ventura: When Nature Calls (1995)': 'tt0112281',
 'Money Train (1995)': 'tt0113845',
 'Get Shorty (1995)': 'tt0113161',
 'Copycat (1995)': 'tt0112722',
 'Assassins (1995)': 'tt0112401',
 'Powder (1995)': 'tt0114168',
 'Leaving Las Vegas (1995)': 'tt0113627',
 'Othello (1995)': 'tt0114057',
 'Now and Then (1995

In [4]:
# Save the movie list to disk so the Streamlit app can load and display it later:
import pickle
# Persist the title -> IMDb id mapping as a pickle file next to the notebook
with open('movie_names.pkl', 'wb') as names_file:
    pickle.dump(movie_names, names_file)

In [5]:
# Build the recommendation model: cosine similarity between movies based on their cast
import pandas as pd
import numpy as np

from math import pow, sqrt
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from fuzzywuzzy import process

# Load the movie/cast table and keep only rows that have both a movieId and a castId.
# Chained, non in-place calls are used so that no filtered frame is mutated in place.
df12 = (
    pd.read_csv("df_initial.csv", sep=',')
    .dropna(subset=['movieId', 'castId'])
    .drop_duplicates()
)

# Lookup tables: id -> display name
movieId_map = df12[['movieId', 'title']].drop_duplicates()
castId_map = df12[['castId', 'primaryName']].drop_duplicates()

movieIdMap = dict(zip(movieId_map['movieId'], movieId_map['title']))
castIdMap = dict(zip(castId_map['castId'], castId_map['primaryName']))

df12 = df12.drop(columns=["title", "primaryName", "imdbId", "category"])

# One row per movie: its cast ids joined into a single space-separated "document"
df_pivot = df12.groupby('movieId')['castId'].agg(' '.join).to_frame()
movieIds = list(df_pivot.index)
castIds = list(df_pivot['castId'])

# Bag-of-cast-ids counts. The result is kept sparse: cosine_similarity accepts
# sparse input, so the movie x cast matrix never has to be materialised densely.
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(castIds)

# Calculate the cosine similarities (movie x movie, dense array)
csim = cosine_similarity(vectors)

# Finally save the cosine similarities; the app only loads them at start-up
with open('cosines.pkl', 'wb') as f:
    pickle.dump(csim, f)



In [6]:
#Streamlit Application
%%writefile app.py

import streamlit as st
import pandas as pd
import numpy as np

from math import pow, sqrt
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from fuzzywuzzy import process

import pickle
from imdb import IMDb

def _chunked(items, size):
    """Split `items` into consecutive lists of at most `size` elements (the last may be shorter)."""
    return [items[start:start + size] for start in range(0, len(items), size)]


def recommend_movies(title_input, csim, movies, number_of_rec = 20):
    """
    Show the movies whose cast is most similar to the selected movie.

    The result is rendered into the current Streamlit page (a poster grid
    followed by a table); nothing is returned.

    title_input - title of the movie; it is fuzzy-matched against the titles in df_initial.csv
    csim - square similarity matrix with one row/column per movieId of df_initial.csv,
           in the order produced by grouping that file on movieId
    movies - dictionary of movie names and their Imdb ID
    number_of_rec - number of recommendations to show
    """
    # Local import: the module only imports `IMDb` at the top.
    from imdb import IMDbError

    df12 = (
        pd.read_csv("df_initial.csv", sep=',')
        .dropna(subset=['movieId', 'castId'])
        .drop_duplicates()
    )

    movieId_map = df12[['movieId', 'title']].drop_duplicates()
    castId_map = df12[['castId', 'primaryName']].drop_duplicates()

    movieIdMap = dict(zip(movieId_map['movieId'], movieId_map['title']))
    castIdMap = dict(zip(castId_map['castId'], castId_map['primaryName']))

    # One row per movie: its cast ids joined into a space-separated string
    df_pivot = df12.groupby('movieId')['castId'].agg(' '.join).to_frame()
    movieIds = list(df_pivot.index)

    df_final = pd.DataFrame(csim, index=movieIds, columns=movieIds)

    # Resolve the (possibly inexact) input to a known title, then to its movieId
    best_match = process.extractOne(title_input, list(movieId_map['title']))
    title_to_id = {title: movie_id for movie_id, title in movieIdMap.items()}
    movie_id_alpha = title_to_id.get(best_match[0]) if best_match else None

    if movie_id_alpha is None or movie_id_alpha not in df_final.columns:
        st.write('Your film is not available. Try another one please')       #If the movie is not present on the platform
        return

    primary_cast_bonus = .15

    # Take a few more candidates than requested so the bonus below can still reorder them
    recommendation = (
        df_final[movie_id_alpha]
        .sort_values(ascending=False)
        .head(number_of_rec + 11)
        .to_frame()
        .reset_index(drop=False)
        .rename(columns={'index': 'movieId'})
        .merge(df_pivot, how='left', on=['movieId'])
    )

    # castId of the person appearing first in the selected movie's cast list.
    # Read from df_pivot directly: the movie itself is not guaranteed to be in the
    # top slice above when many movies tie at the maximum similarity.
    primary_cast_id = df_pivot.loc[movie_id_alpha, 'castId'].split()[0]

    # drop the exact movie match
    recommendation = recommendation[recommendation['movieId'] != movie_id_alpha].copy()

    # Add the bonus to every movie featuring the primary cast member. Compare whole
    # ids (token membership); a substring test would also match longer ids that
    # merely contain this one.
    has_primary = [primary_cast_id in cast.split() for cast in recommendation['castId']]
    recommendation.loc[has_primary, movie_id_alpha] += primary_cast_bonus

    # re-sort after the bonus and keep exactly the requested number of movies
    recommendation = recommendation.sort_values(by=movie_id_alpha, ascending=False).head(number_of_rec)

    # Replace ids by display names
    recommendation['movieId'] = recommendation['movieId'].map(movieIdMap).fillna('N/A')
    recommendation['castId'] = recommendation['castId'].str.split().apply(
        lambda cast: ' . '.join(str(castIdMap[cast_id]) for cast_id in cast)
    )

    # show recommended movies
    st.subheader(f'Movies similar to - {title_input} based on cast and director:')

    # IMDb client used to fetch cover posters
    imdb = IMDb()
    recommended_movies = recommendation['movieId'].unique().tolist()

    # Poster grid: as many rows of 4 as needed, including a shorter last row
    posters_per_row = 4
    for row_titles in _chunked(recommended_movies, posters_per_row):
        for movie_title, column in zip(row_titles, st.columns(posters_per_row)):
            if movie_title == 'N/A':
                continue

            movie_id = movies.get(movie_title)
            if not isinstance(movie_id, str):
                # unknown title or missing IMDb id: no poster to fetch
                continue

            # get_movie is called with the id without its 'tt' prefix
            imdb_number = movie_id[2:] if movie_id.startswith('tt') else movie_id
            try:
                movie = imdb.get_movie(imdb_number)
            except IMDbError:
                # a failed lookup only costs this one poster, not the whole page
                continue

            if 'cover url' in movie:
                with column:
                    st.image(movie['cover url'], use_column_width=True)

    st.dataframe(recommendation)

def main():
    """Streamlit entry point: load the saved artifacts, ask for a movie, show recommendations."""
    # title -> IMDb id mapping saved by the notebook
    with open('movie_names.pkl', 'rb') as names_file:
        movies = pickle.load(names_file)

    # cosine similarity scores saved by the notebook
    with open('cosines.pkl', 'rb') as cosines_file:
        csim = pickle.load(cosines_file)

    # the leading empty entry is the default "nothing selected" choice
    movie_names = [''] + list(movies.keys())

    # show pretty UI
    st.title('Movie Recommendation System')
    title_input = st.selectbox('Movie Name:', movie_names)

    if not title_input:
        st.subheader('Enter a movie title to get started...')
        return

    with st.spinner(f'Getting Recommendations for {title_input}'):
        recommend_movies(title_input, csim, movies)
        st.balloons()


if __name__ == '__main__':
    main()

Writing app.py


In [7]:
# Run the Streamlit app in the background (trailing `&`), discarding its output
!streamlit run app.py --server.enableCORS=false &>/dev/null&
# Open a localtunnel to port 8501, requesting the subdomain 'meow'.
# NOTE(review): no port is set on the streamlit command above, so 8501 is presumably
# the port Streamlit serves on by default — confirm.
# The recorded output (^C) suggests this command runs until the cell is interrupted.
!lt --Bypass-Tunnel-Reminder --subdomain 'meow' --port 8501 

^C


In [8]:
# kill app and clean up memory
# Capture the output of pgrep for the Streamlit process and kill the first id listed.
# NOTE(review): if pgrep prints nothing, st_id is presumably empty and {st_id[0]}
# cannot be resolved — confirm the behaviour when the app is not running.
st_id = !pgrep streamlit
!kill {st_id[0]}

# Same for the localtunnel client.
# NOTE(review): the pattern 'lt' is very short and may match unrelated processes —
# verify that it targets only the tunnel.
lt_id = !pgrep lt
!kill {lt_id[0]}