## What are Recommender Systems?


## Types of Recommender Systems

## Use Cases and Applications

## Why Recommender Systems?

## Project Flow

In [None]:
# DATA -> PREPROCESSING -> MODEL -> WEBSITE -> DEPLOY ON SERVER

In [None]:
#importing libraries
import pandas as pd
import numpy as np

In [None]:
# TMDB 5000 dataset: load the credits table and preview its first rows.
credits = pd.read_csv("../Movie Recommender System/data/tmdb_5000_credits.csv")
credits.head()

In [None]:
# Print (rows, columns) of the credits table, then display its column labels.
print(credits.shape)
credits.columns

In [None]:
# Load the movies table and preview its first rows.
movies = pd.read_csv("../Movie Recommender System/data/tmdb_5000_movies.csv")
movies.head()

In [None]:
# Print (rows, columns) of the movies table, then display its column labels.
print(movies.shape)
movies.columns

In [None]:
# Join the two tables on the movie identifier rather than on the title:
# titles are not unique, so a title join pairs each movie with the credits of
# every other movie sharing its name and inflates the row count.
# NOTE(review): assumes `movies` carries the key as "id" and `credits` as
# "movie_id" (the layout of the Kaggle TMDB 5000 files) -- confirm against the
# `.columns` output of the two tables.
# `title` is dropped from `credits` first so the result keeps a single "title"
# column instead of title_x / title_y.
data = movies.merge(credits.drop(columns="title"), left_on="id", right_on="movie_id")
data.shape

In [None]:
# `info()` prints its summary as a side effect, whereas `head()` is only
# rendered when it is the last expression of the cell -- so it goes last.
data.info()
data.head()

In [None]:
# Count how many movies there are per original language.
data["original_language"].value_counts()
# most of the movies are in "en"

In [None]:
# Recommendations are based on tags built from these columns:
#   genres, keywords, overview (summary), cast, crew, revenue, release date
# The identifying columns kept alongside them are:
#   id, title (in English)

In [None]:
# Keep only the columns used to build the tags.
# `.copy()` makes this an independent frame: later cells add and overwrite
# columns on `data`, which on a bare column-subset triggers pandas'
# SettingWithCopyWarning and may not write where expected.
data = data[["movie_id","genres","title","overview","keywords","release_date","cast","crew","revenue"]].copy()
data.head(2)

In [None]:
# Goal: a new dataframe with columns (movie_id, title, tags), where the tags
# merge the remaining columns (top actors in cast, director in crew, important
# words in genres and keywords, all appended to the overview).
# Step one: parse the release date and derive the release year from it.
release_dates = pd.to_datetime(data["release_date"])
data["release_date"] = release_dates
data["year"] = release_dates.dt.year
data.info()

In [None]:
# Turn revenue and year into strings so they can be used as tag tokens.
data["revenue"] = data["revenue"].astype(str)
# The year column can be float (pandas returns floats when any release date is
# missing), so a plain astype(str) would yield tokens like "2009.0" and "nan".
# Format years as whole numbers and map a missing year to an empty string,
# which produces no token when the string is split.
data["year"] = data["year"].map(lambda y: "" if pd.isna(y) else str(int(y)))
data.info()

In [None]:
# The year has been extracted, so the full release date is no longer needed.
data = data.drop(columns="release_date")

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Drop rows with missing values, then renumber the remaining rows 0..n-1.
# Without the reset the index keeps gaps where rows were removed, so row
# labels stop matching row positions; the recommend step looks a movie up by
# label and then uses that number as a position into the similarity matrix.
data = data.dropna().reset_index(drop=True)
data.isna().sum()

In [None]:
# Count fully duplicated rows.
data.duplicated().sum()

In [None]:
data.loc[0,"genres"]
# The genres cell is a string form of a list of dicts; the "name" value of
# each dict is the actual genre.
#
# Example of the extraction done in the next cell:
# mylist = '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
# genres = [item["name"] for item in ast.literal_eval(mylist)]
# print(genres)
# => ['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [None]:
import ast
def get_genre(obj):
    """Return the "name" value of every dict in a stringified list of dicts.

    `obj` is parsed with `ast.literal_eval`; names keep their original order.
    """
    return [entry["name"] for entry in ast.literal_eval(obj)]

In [None]:
# Replace each stringified genre list with the plain list of genre names.
data["genres"] = data["genres"].map(get_genre)

In [None]:
data.loc[0,"keywords"]
# The keywords cell is also a string form of a list of dicts; only the "name"
# value of each dict is needed.

In [None]:
import ast
def get_keyword(obj):
    """Return the "name" value of every dict in a stringified list of dicts.

    `obj` is parsed with `ast.literal_eval`; names keep their original order.
    """
    parsed = ast.literal_eval(obj)
    return [entry["name"] for entry in parsed]

In [None]:
# Replace each stringified keyword list with the plain list of keyword names.
data["keywords"] = data["keywords"].map(get_keyword)

In [None]:
# Preview the first two rows after parsing genres and keywords.
data.head(2)

### getting top 3 members in cast

In [None]:
# Inspect the raw cast value of the row labelled 0.
data.loc[0,"cast"]

In [None]:
def get_cast3(obj):
    """Return the "name" values of the first three dicts in a stringified list.

    `obj` is parsed with `ast.literal_eval`. Fewer than three names are
    returned when the parsed list is shorter than three entries.
    """
    members = ast.literal_eval(obj)
    # zip() stops as soon as range(3) is exhausted, so later entries are never read.
    return [member["name"] for _, member in zip(range(3), members)]

In [None]:
# Keep only the first three cast names of each movie.
data["cast"] = data["cast"].map(get_cast3)

### getting director from crew

In [None]:
# Inspect the raw crew value of the row labelled 0.
data.loc[0,"crew"]

In [None]:
import ast
def get_dir(obj):
    """Return a one-element list with the name of the first "Director" entry.

    `obj` is a stringified list of dicts parsed with `ast.literal_eval`.
    Returns an empty list when no entry has job == "Director".
    """
    for member in ast.literal_eval(obj):
        if member["job"] == "Director":
            # Only the first director is kept.
            return [member["name"]]
    return []

In [None]:
# Reduce each crew list to its (first) director.
data["crew"] = data["crew"].map(get_dir)

### converting overview to list

In [None]:
# Split each overview on whitespace so it is a list of words like the other tag columns.
data["overview"] = data["overview"].map(lambda overview: overview.split())

In [None]:
# Preview the first rows now that every tag column holds a list.
data.head()

### Transformation of columns: (removing blank spaces between words of same phrase)

In [None]:
def _strip_inner_spaces(tokens):
    """Return `tokens` with every space removed from inside each item."""
    return [token.replace(" ", "") for token in tokens]


# Multi-word phrases become single tokens this way (spaces inside an item are removed).
columns = ["genres","overview","keywords","cast","crew"]
for col in columns:
    data[col] = data[col].map(_strip_inner_spaces)

In [None]:
# Preview the first two rows after removing the inner spaces.
data.head(2)

In [None]:
# Wrap the revenue and year strings in lists (split on whitespace) so they can
# be concatenated with the other list-valued tag columns.
data["revenue"] = data["revenue"].map(str.split)
data["year"] = data["year"].map(str.split)

### Concatenate all cols into a tag column

In [None]:
# Element-wise list concatenation of all tag columns, in this order.
data["tag"] = (
    data["genres"]
    + data["overview"]
    + data["keywords"]
    + data["cast"]
    + data["crew"]
    + data["revenue"]
    + data["year"]
)
data.loc[0, "tag"]

### Using new dataframe with 3 columns: movie_id, title, tag

In [None]:
# Keep only the identifying columns and the combined tag.
# `.copy()` makes `movies` independent of `data`: the next cells overwrite
# movies["tag"], which on a bare column-subset triggers pandas'
# SettingWithCopyWarning and may not write where expected.
movies = data[['movie_id', 'title', 'tag']].copy()
movies.loc[0,"tag"]

In [None]:
# Collapse each list of tokens into one space-separated string.
movies["tag"] = movies["tag"].apply(" ".join)

In [None]:
# Preview the first rows of the three-column frame.
movies.head()

In [None]:
# Show the joined tag string of the row labelled 0.
movies.tag[0]

In [None]:
# Lower-case every tag string.
movies["tag"] = movies["tag"].str.lower()

In [None]:
# Show the lower-cased tag string of the row labelled 0.
movies.tag[0]

In [None]:
# Show the lower-cased tag string of the row labelled 1.
movies.tag[1]

### Core Idea

In [None]:
# Compute a similarity score between two tags: the higher the score, the more similar the movies.
# This uses text vectorization: each text in the tag column is converted into a vector, and the
# vectors lying closest to each other (minimum separation) are treated as the most similar.

# Common techniques: Bag of Words, TF-IDF, Word2Vec

In [None]:
# Each movie is a vector in a high-dimensional space -> the closest vectors will be recommended.
# Stop words (a, in, is, on, to, for, and, are, from, etc.) are not considered.
# Treat all the tags in the tag column as one large text and find its most frequently occurring
# words (the first 5000); these words are the axes of our 5000-dimensional space.
# Count the occurrences of those words in each film in the db -> this gives a new table (~ 4800 x 5000).

In [None]:
# Preview the frame that is about to be vectorized.
movies.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: at most 5000 features, English stop words removed.
cv = CountVectorizer(stop_words="english", max_features=5000)

In [None]:
# Fit the vectorizer on the tags and convert the result to an array.
vectors = (
    cv.fit_transform(movies["tag"])
    .toarray()
)

In [None]:
# Display the count matrix.
vectors

In [None]:
# List the attributes and methods available on the fitted vectorizer.
dir(cv)

In [None]:
# NOTE(review): per the scikit-learn docs, `vocabulary_` maps each term to its
# feature (column) index, not to a frequency -- the name is misleading.
dict_freq = cv.vocabulary_

In [None]:
from collections import Counter
# `vocabulary_` (held in dict_freq) maps each term to its column index in the
# count matrix, not to a frequency, so ranking its values only ranks column
# positions. Sum each term's column of `vectors` to get real corpus counts.
term_totals = vectors.sum(axis=0)
most_freq = Counter(
    {term: int(term_totals[col_idx]) for term, col_idx in dict_freq.items()}
).most_common(5000)
for key, value in most_freq:
    print(f"{key} : {value}")

In [None]:
# The terms of `most_freq`, listed in the opposite order.
mylist = [term for term, _ in reversed(most_freq)]
mylist

In [None]:
# Apply stemming so that inflected variants of a word collapse onto one common stem
# and stop counting as separate words, e.g. ps.stem("acting") -> "act"

In [None]:
# Stemming setup: used to collapse similar words that add no extra meaning to the corpus.
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
def stem(txt):
    """Stem each whitespace-separated word of `txt` with `ps`.

    The stemmed words are returned joined by single spaces.
    """
    return " ".join(ps.stem(word) for word in txt.split())

In [None]:
# Preview the stemming on one tag. `ps.stem` expects a single word, so handing
# it the whole tag string treats the entire text as one token; `stem` splits
# the text and stems word by word, which is what is applied to every row next.
stem(movies.tag[0])

In [None]:
# Stem every row of the tag column.
movies["tag"] = movies["tag"].map(stem)

In [None]:
cv2 = CountVectorizer(max_features = 5000, stop_words = "english")
vectors2 = cv.fit_transform(movies["tag"]).toarray()

In [None]:
dict_freq = cv.vocabulary_

from collections import Counter
most_freq = Counter(dict_freq).most_common(5000)

mylist = list([key for key, value in most_freq])
mylist.reverse()

mylist

### similarity using cosine distance

In [None]:
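# Each movie -> one vector; ~4806 movies, each vector has 5000 entries (one per vocabulary word).
# Find the similarity between two movies: the smaller the distance (angle) between their vectors,
# the greater the similarity.
# Measured with cosine similarity (a higher score means more similar).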

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Use `vectors2`, the matrix built from the stemmed tags. `vectors` was built
# before stemming, so using it here would discard the stemming step entirely.
sim_mat = cosine_similarity(vectors2)

In [None]:
# Shape of the pairwise similarity matrix.
sim_mat.shape

In [None]:
sim_mat[0]
# Pair every score in row 0 with its position so the movie can be identified
# after sorting, order by score (highest first) and skip the first entry,
# which is normally the movie itself; the slice keeps the next nine entries.
sorted(enumerate(sim_mat[0]), key=lambda pair: pair[1], reverse=True)[1:10]

In [None]:
def recommend(movie):
    """Print the titles of the five movies most similar to `movie`.

    Looks `movie` up in the module-level `movies["title"]` column and ranks
    every other row by its score in the matching row of `sim_mat`.

    Raises:
        IndexError: if no row of `movies` has the title `movie`.
    """
    # Row *positions* (not index labels) of the matching titles: `sim_mat` is
    # addressed by position, and the labels of `movies` differ from positions
    # whenever rows were dropped without resetting the index.
    matches = (movies["title"] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"no movie titled {movie!r} in movies['title']")
    index_mov = int(matches[0])

    dist_mov = sim_mat[index_mov]
    # Exclude the query row explicitly instead of assuming it sorts first:
    # another row with an equal score could otherwise take its place.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(dist_mov) if pos != index_mov),
        key=lambda pair: pair[1],
        reverse=True,
    )

    for pos, _ in ranked[:5]:
        print(movies.iloc[pos]["title"])

In [None]:
recommend("Batman Begins")
# NOTE(review): the listing below has nine titles, while recommend() prints
# five, so it appears to be a pasted result from a different slice -- re-run
# to refresh.
#The Dark Knight
#The Dark Knight Rises
#Batman
#Batman & Robin
#Batman
#Batman v Superman: Dawn of Justice
#Defendor
#Amidst the Devil's Wings
#Batman Returns

In [None]:
movies.title.values  # array of all the movie titles in the frame

In [149]:
import pickle
# Persist the movie table as a plain dict. The `with` block closes the file
# (flushing the pickle to disk) even if `dump` raises; a bare open() inside
# the call leaves the handle open until it is garbage-collected.
with open("../Movie Recommender System/movie_dict.pkl", "wb") as movie_file:
    pickle.dump(movies.to_dict(), movie_file)

In [None]:
# Show the dict form of the frame (the structure that is pickled).
movies.to_dict()

In [150]:
# Persist the similarity matrix. The `with` block closes the file (flushing
# the pickle to disk) even if `dump` raises.
with open("../Movie Recommender System/sim_mat.pkl", "wb") as sim_file:
    pickle.dump(sim_mat, sim_file)