In [None]:
!pip install nltk

In [None]:
import re
import ast
import pickle

import numpy as np
import pandas as pd
import seaborn as sns

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the two CSV files; both paths are relative to the notebook's
# working directory.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [None]:
# Preview the first three rows of the movies table.
movies.head(3)

In [None]:
# Preview the first rows of the credits table.
credits.head()

In [None]:
# (rows, columns) of the movies table as loaded.
movies.shape

In [None]:
# (rows, columns) of the credits table as loaded.
credits.shape

In [None]:
# Column labels of both tables, shown together as a tuple.
movies.columns, credits.columns

In [None]:
# Merge the two DataFrames on their shared 'title' column (inner join by default).
# NOTE(review): titles may not be unique; any duplicated title would yield
# extra rows here -- confirm by comparing the row count before and after.
# Re-running this cell merges `credits` into the already-merged frame again.
movies = movies.merge(credits, on='title')

In [None]:
# (rows, columns) of the merged table.
movies.shape

## Choose the relevant features for movie recommendation
- movie_id
- title
- overview
- genres
- keywords
- cast
- crew 

In [None]:
# First two rows of the merged table.
movies[:2]

In [None]:
# Keep only the columns used to build the recommendation tags.
# .copy() makes `df` an independent frame: its columns are reassigned in the
# following cells, and writing into a selection of `movies` is what triggers
# pandas' SettingWithCopyWarning.
df = movies[['movie_id','title','overview','genres','keywords','cast','crew']].copy()
df.head()

In [None]:
# (rows, columns) of the reduced feature table.
df.shape

## Final Goal - 'movie_id' + 'title' + 'tags'

In [None]:
# Inspect the raw 'genres' column (closing quote fixed: the original used a
# typographic quote, which is a syntax error).
df['genres']

In [None]:
# Look at the 'genres' value stored under index label 0.
df['genres'][0]

In [None]:
import ast

def fetch_genres(text):
    """Parse a stringified list of dicts and return each entry's 'name'.

    ``text`` is evaluated with ``ast.literal_eval``; the names are returned
    as a list in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

# Replace each raw 'genres' string with the list returned by fetch_genres.
# Not idempotent: on a second run the values are already lists, which
# ast.literal_eval rejects.
df['genres'] = df['genres'].apply(fetch_genres)

In [None]:
def fetch_keywords(text):
    """Parse a stringified list of dicts and return each entry's 'name'.

    ``text`` is evaluated with ``ast.literal_eval``; the names are returned
    as a list in their original order.
    """
    parsed = ast.literal_eval(text)
    names = [item['name'] for item in parsed]
    return names

# Replace each raw 'keywords' string with the list returned by fetch_keywords.
# Not idempotent: on a second run the values are already lists, which
# ast.literal_eval rejects.
df['keywords'] = df['keywords'].apply(fetch_keywords)

In [None]:
def fetch_cast(text):
    """Return the 'name' of at most the first three entries in ``text``.

    ``text`` is a stringified list of dicts, parsed with
    ``ast.literal_eval``. Entries beyond the third are ignored.
    """
    members = list(ast.literal_eval(text))
    return [member['name'] for member in members[:3]]

# Replace each raw 'cast' string with the list returned by fetch_cast.
# Not idempotent: on a second run the values are already lists, which
# ast.literal_eval rejects.
df['cast'] = df['cast'].apply(fetch_cast)

In [None]:
def fetch_director(text):
    """Return the 'name' of every entry whose 'job' equals 'Director'.

    ``text`` is a stringified list of dicts, parsed with
    ``ast.literal_eval``. The result is a (possibly empty) list.
    """
    crew_members = ast.literal_eval(text)
    return [member['name'] for member in crew_members if member['job'] == 'Director']

# Replace each raw 'crew' string with the list returned by fetch_director.
# Not idempotent: on a second run the values are already lists, which
# ast.literal_eval rejects.
df['crew'] = df['crew'].apply(fetch_director)

In [None]:
# Drop rows with any missing value and renumber the index 0..n-1.
# Without the reset, index labels stop matching row positions once a row is
# dropped, and recommend() uses the label of a title to index the positional
# `similarity` matrix. Assigning a new frame (instead of inplace=True) also
# keeps this cell safe to re-run.
df = df.dropna().reset_index(drop=True)

## Overview

In [None]:
# Turn each overview string into a list of whitespace-separated tokens.
df['overview'] = df['overview'].map(str.split)

In [None]:
# Build 'tags' by adding the five columns row by row; the values are expected
# to be lists at this point, so `+` concatenates them into one list per movie.
df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['crew']

## Final DataFrame

In [None]:
# Final table: identifier, title and the combined tags.
# .copy() makes `data` independent of `df`; its 'tags' column is reassigned
# in the cells below, which would otherwise write into a selection of `df`.
data = df[['movie_id','title','tags']].copy()

In [None]:
# Show the tags of the first row. .iloc[0] is positional; a plain [0] is a
# label lookup and raises KeyError if the row labelled 0 was dropped.
print(data['tags'].iloc[0])

In [None]:
# Remove the spaces inside every token so multi-word entries become one token.
data['tags'] = data['tags'].map(lambda tokens: [token.replace(' ', '') for token in tokens])

In [None]:
# Collapse each token list into a single space-separated string.
data['tags'] = data['tags'].map(' '.join)

In [None]:
# Show the joined tags string of the first row. .iloc[0] is positional; a
# plain [0] is a label lookup and raises KeyError if label 0 is absent.
print(data['tags'].iloc[0])

## NLP (Processing Textual data)
- Lower Case
- Stemming
- Tokenization
- Stopwords Removal

In [None]:
# First five rows of the final table.
data[:5]

In [None]:
# The 'tags' column before text preprocessing.
data['tags']

#### Text Preprocessing

In [None]:
# Single Porter stemmer instance shared by every stem() call.
ps = PorterStemmer()

def stem(text):
    """Lower-case and stem each whitespace-separated token of ``text``.

    The processed tokens are returned joined by single spaces.
    """
    return ' '.join(ps.stem(token.lower()) for token in text.split())

# Replace every tags string with its lower-cased, stemmed version.
data['tags'] = data['tags'].apply(stem)

In [None]:
# The 'tags' column after stemming.
data['tags']

In [None]:
# Display the final table.
data

#### BOW (Bag of Words)

In [None]:
# Bag-of-words vectorizer limited to 5000 features, with scikit-learn's
# built-in English stop-word list.
# NOTE(review): the tags were stemmed before this step, so stemmed forms of
# stop words may no longer match the list -- confirm, and consider removing
# stop words before stemming.
cv = CountVectorizer(max_features=5000, stop_words='english')

In [None]:
# Fit the vocabulary on the tags and turn every movie into a count vector;
# .toarray() converts the result to a dense NumPy array.
vectors = cv.fit_transform(data['tags']).toarray()

In [None]:
# Preview the learned vocabulary instead of printing every feature name
# (up to 5000 lines); widen or remove the slice to inspect more of it.
cv.get_feature_names_out()[:50]

#### Cosine Similarity

In [None]:
# Display the count-vector array.
vectors

In [None]:
# Pairwise cosine similarity between all rows of `vectors`;
# similarity[i][j] compares row i with row j.
similarity = cosine_similarity(vectors)
similarity

In [None]:
# (row position, similarity to row 0) pairs. The original line was missing
# its closing parenthesis; only the first ten pairs are shown to keep the
# output short.
list(enumerate(similarity[0]))[:10]

In [None]:
# Rank every row by its similarity to row 0 (highest first) and keep
# ranks 1 through 9, i.e. skip the top-ranked entry.
sorted(enumerate(similarity[0]), key=lambda pair: pair[1], reverse=True)[1:10]

## Final Function

In [None]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Title to look up in ``data['title']``. If several rows share the
        title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If ``movie`` does not occur in ``data['title']``.

    Notes
    -----
    Reads the module-level ``data`` frame and ``similarity`` matrix. Rows of
    ``similarity`` are positional, so the title is resolved to a row
    *position* rather than an index label; the two differ whenever the
    index of ``data`` is not 0..n-1 (for example after rows were dropped).
    """
    # Boolean mask -> positions of the matching rows.
    matches = (data['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie {movie!r} not found in data['title']")
    movie_pos = int(matches[0])

    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly instead of assuming it is ranked
    # first; a row with identical tags can tie with it.
    neighbours = [pos for pos, _ in ranked if pos != movie_pos][:top_n]
    for pos in neighbours:
        print(data.iloc[pos]['title'])

In [None]:
# Example: print the recommendations for 'Avatar'.
recommend('Avatar')

In [None]:
# Example: print the recommendations for 'Iron Man'.
recommend('Iron Man')