In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
imdb_data = pd.read_csv('imdb_data.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
imdb_data.head()

Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion


In [6]:
# merging dataframes to extend movie attributes
movies_df = imdb_data.merge(movies, left_on='movieId', right_on='movieId')

In [7]:
movies_df.isna().sum()

movieId              0
title_cast        9665
director          9519
runtime          11345
budget           17583
plot_keywords    10482
title                0
genres               0
dtype: int64

## Pre-processing

In [8]:
# dropping unnccessary columns
movies_df.drop(['runtime','budget','movieId'],axis=1,inplace=True)

In [9]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24866 entries, 0 to 24865
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title_cast     15201 non-null  object
 1   director       15347 non-null  object
 2   plot_keywords  14384 non-null  object
 3   title          24866 non-null  object
 4   genres         24866 non-null  object
dtypes: object(5)
memory usage: 1.1+ MB


In [10]:
def to_string(df):
    for col in df.columns:
        if df[col].dtype in ['int64','float','object']:
            df[col] = df[col].astype(str)
    return df

In [11]:
df_1= to_string(movies_df)

In [12]:
df_1.head(2)

Unnamed: 0,title_cast,director,plot_keywords,title,genres
0,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,toy|rivalry|cowboy|cgi animation,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,board game|adventurer|fight|game,Jumanji (1995),Adventure|Children|Fantasy


In [13]:
df_1['director'] = df_1['director'].apply(lambda x: "".join(x.lower() for x in x.split()))

In [14]:
df_1['title_cast'] = df_1['title_cast'].apply(lambda x: "".join(x.lower() for x in x.split()))
df_1['title_cast'] = df_1['title_cast'].map(lambda x: x.split('|')[:3])

In [15]:
df_1['plot_keywords'] = df_1['plot_keywords'].map(lambda x: x.split('|')[:5])
df_1['plot_keywords'] = df_1['plot_keywords'].apply(lambda x: " ".join(x))

In [16]:
#df_1['genres'] = df_1.genres.str.split('|')

# Discarding the pipes between the genres 
df_1['genres'] = df_1['genres'].map(lambda x: x.lower().split('|'))
df_1['genres'] = df_1['genres'].apply(lambda x: " ".join(x))

In [17]:
df_1.head()

Unnamed: 0,title_cast,director,plot_keywords,title,genres
0,"[tomhanks, timallen, donrickles]",johnlasseter,toy rivalry cowboy cgi animation,Toy Story (1995),adventure animation children comedy fantasy
1,"[robinwilliams, jonathanhyde, kirstendunst]",jonathanhensleigh,board game adventurer fight game,Jumanji (1995),adventure children fantasy
2,"[waltermatthau, jacklemmon, sophialoren]",markstevenjohnson,boat lake neighbor rivalry,Grumpier Old Men (1995),comedy romance
3,"[whitneyhouston, angelabassett, lorettadevine]",terrymcmillan,black american husband wife relationship betra...,Waiting to Exhale (1995),comedy drama romance
4,"[stevemartin, dianekeaton, martinshort]",alberthackett,fatherhood doberman dog mansion,Father of the Bride Part II (1995),comedy


In [18]:
df_1.set_index('title', inplace = True)
df_1.head()

Unnamed: 0_level_0,title_cast,director,plot_keywords,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Toy Story (1995),"[tomhanks, timallen, donrickles]",johnlasseter,toy rivalry cowboy cgi animation,adventure animation children comedy fantasy
Jumanji (1995),"[robinwilliams, jonathanhyde, kirstendunst]",jonathanhensleigh,board game adventurer fight game,adventure children fantasy
Grumpier Old Men (1995),"[waltermatthau, jacklemmon, sophialoren]",markstevenjohnson,boat lake neighbor rivalry,comedy romance
Waiting to Exhale (1995),"[whitneyhouston, angelabassett, lorettadevine]",terrymcmillan,black american husband wife relationship betra...,comedy drama romance
Father of the Bride Part II (1995),"[stevemartin, dianekeaton, martinshort]",alberthackett,fatherhood doberman dog mansion,comedy


In [19]:
df_1['KeyWords'] = ''
columns = df_1.columns
for index, row in df_1.iterrows():
    words = ''
    for col in columns:
        if col not in ['director','plot_keywords','genres']:
            words = words + ' '.join(row[col])+ ' '
        else:
            words = words + row[col]+ ' '
    row['KeyWords'] = words


In [21]:
df_1['KeyWords'][1]

'robinwilliams jonathanhyde kirstendunst jonathanhensleigh board game adventurer fight game adventure children fantasy  '

In [25]:
df_1.reset_index(inplace=True)

In [26]:
df_1.head()

Unnamed: 0,title,title_cast,director,plot_keywords,genres,KeyWords
0,Toy Story (1995),"[tomhanks, timallen, donrickles]",johnlasseter,toy rivalry cowboy cgi animation,adventure animation children comedy fantasy,tomhanks timallen donrickles johnlasseter toy ...
1,Jumanji (1995),"[robinwilliams, jonathanhyde, kirstendunst]",jonathanhensleigh,board game adventurer fight game,adventure children fantasy,robinwilliams jonathanhyde kirstendunst jonath...
2,Grumpier Old Men (1995),"[waltermatthau, jacklemmon, sophialoren]",markstevenjohnson,boat lake neighbor rivalry,comedy romance,waltermatthau jacklemmon sophialoren marksteve...
3,Waiting to Exhale (1995),"[whitneyhouston, angelabassett, lorettadevine]",terrymcmillan,black american husband wife relationship betra...,comedy drama romance,whitneyhouston angelabassett lorettadevine ter...
4,Father of the Bride Part II (1995),"[stevemartin, dianekeaton, martinshort]",alberthackett,fatherhood doberman dog mansion,comedy,stevemartin dianekeaton martinshort alberthack...


In [28]:
moviez = df_1[:3000]

In [23]:
def collab_model(movie_list,top_n=10):
    
    recommended_movies = []
    movies = moviez

    # generating the count matrix and cosine similarity matrix
    count = CountVectorizer()
    count_matrix = count.fit_transform(movies['KeyWords'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    cosine_sim = pd.DataFrame(cosine_sim, index = movies.index, columns = movies.index)
    
    # series for numerical ordered list of movie titles
    indices = pd.Series(movies['title'])
    
    # getting the index of the movie that matches the titles on movie_list
    idx_1 = indices[indices == movie_list[0]].index[0]
    idx_2 = indices[indices == movie_list[1]].index[0]
    idx_3 = indices[indices == movie_list[2]].index[0]
    
    # computing similarity scores
    rank_1 = cosine_sim[idx_1]
    rank_2 = cosine_sim[idx_2]
    rank_3 = cosine_sim[idx_3]
    
    # creating a Series with the similarity scores in descending order
    score_series_1 = pd.Series(rank_1).sort_values(ascending = False)
    score_series_2 = pd.Series(rank_2).sort_values(ascending = False)
    score_series_3 = pd.Series(rank_3).sort_values(ascending = False)
    
    # merging similarity scores seriese
    score_series = score_series_1.append(score_series_2).append(score_series_3).sort_values(ascending = False)
    
    # list of of movie indices
    top_indexes = list(score_series.index)
    
    # Removing indices of movies in movie_list
    top_indexes = np.setdiff1d(top_indexes,[idx_1,idx_2,idx_3])
    
    #returning movies of top indices
    for i in top_indexes[:top_n]:
        recommended_movies.append(list(movies['title'])[i])
    return recommended_movies

In [29]:
movie_list = ['How to Be a Player (1997)','Tommy Boy (1995)','Father of the Bride Part II (1995)']
collab_model(movie_list,top_n=10)

['Toy Story (1995)',
 'Jumanji (1995)',
 'Grumpier Old Men (1995)',
 'Waiting to Exhale (1995)',
 'Heat (1995)',
 'Sabrina (1995)',
 'Tom and Huck (1995)',
 'Sudden Death (1995)',
 'GoldenEye (1995)',
 'American President, The (1995)']

In [None]:
def data_preprocessing(subset_size):
    """Prepare data for use within Content filtering algorithm.

    Parameters
    ----------
    subset_size : int
        Number of movies to use within the algorithm.

    Returns
    -------
    Pandas Dataframe
        Subset of movies selected for content-based filtering.

    """

    def to_string(df):
         for col in df.columns:
            if df[col].dtype in ['int64','float','object']:
                    df[col] = df[col].astype(str)
        return df

    df_1 = to_string(movies_df)

    df_1['director'] = df_1['director'].apply(lambda x: "".join(x.lower() for x in x.split()))

    df_1['title_cast'] = df_1['title_cast'].apply(lambda x: "".join(x.lower() for x in x.split()))
    df_1['title_cast'] = df_1['title_cast'].map(lambda x: x.split('|')[:3])

    df_1['plot_keywords'] = df_1['plot_keywords'].map(lambda x: x.split('|')[:5])
    df_1['plot_keywords'] = df_1['plot_keywords'].apply(lambda x: " ".join(x))

    # Discarding the pipes between the genres 
    df_1['genres'] = df_1['genres'].map(lambda x: x.lower().split('|'))
    df_1['genres'] = df_1['genres'].apply(lambda x: " ".join(x))

    df_1.set_index('title', inplace = True)

    df_1['KeyWords'] = ''
    columns = df_1.columns
    for index, row in df_1.iterrows():
         words = ''
         for col in columns:
             if col not in ['director','plot_keywords','genres']:
                 words = words + ' '.join(row[col])+ ' '
             else:
                 words = words + row[col]+ ' '
         row['KeyWords'] = words

    df_1.reset_index(inplace=True)

    # Subset of the data
    movies_subset = movies[:subset_size]
    return movies_subset