<a href="https://colab.research.google.com/github/monicafar147/unsupervised-predict-streamlit-template/blob/modelling/content_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Content based prediction
In Content-based Filtering, we seek to make recommendations based on how similar the properties or features of an item are to other items.

In [1]:
import pandas as pd
import numpy as np
# Import models
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import SVD
from surprise.model_selection import GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the three CSV inputs used throughout the notebook
movies = pd.read_csv('Documents/movies/movies.csv')
train = pd.read_csv('Documents/movies/train.csv')
imdb = pd.read_csv('Documents/movies/imdb_data.csv')

In [3]:
# Build df with two joins on movieId:
#   1. movies with train (outer join: rows from either table are kept)
#   2. that result with imdb (left join: every row from step 1 is kept)
df = (
  movies
  .merge(train, how='outer', on='movieId')
  .merge(imdb, how='left', on='movieId')
)

In [4]:
# Columns to discard from the merged dataframe
column_list = ['timestamp', 'runtime', 'budget']
# Rebind df instead of dropping in place, and ignore labels that are already
# gone, so that re-running this cell does not raise a KeyError.
df = df.drop(columns=column_list, errors='ignore')

In [None]:
# Report how many rows and columns df has
row = len(df.index)
column = len(df.columns)
print('The dataframe has {} rows and {} columns.'.format(row, column))

The dataframe has 10014248 rows and 7 columns.


In [None]:
# Preview the first five rows of df (five is the default for head)
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,title_cast,director
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,158849.0,5.0,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,97203.0,5.0,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,161871.0,3.0,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,45117.0,4.0,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,27431.0,5.0,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter


In [5]:
# Turn a pipe-delimited value into a comma-delimited string
def remove_pipe(text):
  '''Return `text` converted to a string with every "|" replaced by ",".

  Non-string input is passed through str() first, so e.g. None becomes 'None'.
  '''
  pieces = str(text).split("|")
  return ",".join(pieces)

In [6]:
# Convert the pipe-delimited text columns to comma-delimited strings
for text_column in ['genres', 'title_cast', 'plot_keywords']:
  # Fill missing values with blanks *before* applying remove_pipe: it calls
  # str() on its input, which would otherwise turn NaN into the literal 'nan'.
  df[text_column] = df[text_column].fillna('').apply(remove_pipe)
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,title_cast,director,plot_keywords
0,1,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy",158849.0,5.0,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",John Lasseter,"toy,rivalry,cowboy,cgi animation"
1,1,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy",97203.0,5.0,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",John Lasseter,"toy,rivalry,cowboy,cgi animation"
2,1,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy",161871.0,3.0,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",John Lasseter,"toy,rivalry,cowboy,cgi animation"
3,1,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy",45117.0,4.0,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",John Lasseter,"toy,rivalry,cowboy,cgi animation"
4,1,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy",27431.0,5.0,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",John Lasseter,"toy,rivalry,cowboy,cgi animation"


Starting from the `df` dataframe, a filtered dataframe with unique movie titles can be generated, removing the duplicate titles that arise because each movie is rated by multiple users.

In [25]:
# One slice of df per content feature, each keeping title and rating beside it
cast, genres, keywords = (
  df[['title', 'rating', feature_column]]
  for feature_column in ('title_cast', 'genres', 'plot_keywords')
)
# dropna() discards rows with a missing value in any of the three selected columns
directors = df[['title', 'rating', 'director']].dropna()

In [27]:
# Search term handed to recommend()
word = "Christopher Nolan"

In [28]:
# Titles whose director matches `word`. recommend() is defined in a later cell,
# so that cell has to be executed before this one.
print(recommend(word, df=directors, column_name='director'))

title
Memento (2000)      0
Inception (2010)    1
Following (1998)    2
dtype: int64


In [9]:
def recommend(word, df, column_name, top_n=10):
  '''Rank the titles whose `column_name` text matches `word` by popularity.

  Rows of `df` are kept when `df[column_name]` contains `word`; the surviving
  titles are ordered from most to fewest ratings and numbered from 0.

  No text-similarity score is computed: the ranking is driven by the number
  of ratings only.

  Parameters
  ----------
  word : str
      Pattern to look for. It is passed to `Series.str.contains`, which
      treats it as a regular expression by default.
  df : pandas.DataFrame
      Must contain the columns 'title', 'rating' and `column_name`.
  column_name : str
      Name of the text column to search.
  top_n : int or None, default 10
      Maximum number of titles to return; None returns every match.

  Returns
  -------
  pandas.Series
      Indexed by title, holding each title's 0-based rank. Empty when no row
      matches.
  '''
  # na=False makes rows with a missing value count as non-matches; without it
  # the mask would contain NaN and could not be used for boolean indexing.
  subset = df[df[column_name].str.contains(word, na=False)]

  # count() skips missing ratings, so a matching title that nobody has rated
  # is still kept, with 0 ratings.
  movie_df = (
    subset.groupby('title')['rating']
    .count()
    .rename('num of ratings')
    .reset_index()
  )

  # groupby returns the titles in sorted order and mergesort is stable, so
  # titles with the same number of ratings stay in alphabetical order.
  movie_df = movie_df.sort_values('num of ratings', ascending=False, kind='mergesort')

  # Renumber the rows so that the index is the rank of each title
  movie_df = movie_df.reset_index(drop=True)

  # Index mapping: title -> rank
  index_map = pd.Series(movie_df.index, index=movie_df['title'])

  return index_map.iloc[:top_n]