In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the two TMDB 5000 CSV files. The paths are relative, so the notebook
# must be run from the directory that contains the `Datasets/` folder.
movies = pd.read_csv("Datasets/tmdb_5000_movies.csv")
credits = pd.read_csv("Datasets/tmdb_5000_credits.csv")

In [None]:
movies.head(1)

In [None]:
credits.head(1)

## Merge two dataframes

In [None]:
# Join on the TMDB id instead of the title: titles are not guaranteed to be
# unique (remakes can share a name), and a title join pairs every same-named
# movie with every same-named credits row, fabricating extra rows.
# `title` is dropped from `credits` first so the merged frame keeps a single,
# unsuffixed `title` column alongside `movie_id`, `cast` and `crew`.
# NOTE(review): assumes the movies CSV exposes the TMDB id as `id` - confirm.
movies = movies.merge(
    credits.drop(columns=['title']),
    left_on='id',
    right_on='movie_id',
)

In [None]:
movies.head(1)

## We will keep only these columns
1. **genres**
2. **id** (stored as `movie_id`)
3. **keywords**
4. **title**
5. **overview**
6. **cast**
7. **crew**

In [None]:
# Keep the identifier, the title, and the text-bearing columns only.
kept_columns = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
movies = movies[kept_columns]

In [None]:
movies.head(1)

**In our final dataframe we will have only three columns - movie_id, title, tags**

According to this we will now prepare the data

In [None]:
movies.isnull().sum() # check if there is any missing data

In [None]:
# Remove rows with any missing value, then renumber the index 0..n-1.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run, and the
# reset keeps index labels equal to row positions, which the later lookup into
# the positionally indexed `similarity` matrix depends on.
movies = movies.dropna().reset_index(drop=True)

In [None]:
movies.duplicated().sum() # check for duplicate rows

In [None]:
movies.iloc[0].genres 

In [None]:
# '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
# ['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [None]:
# def convert(obj):
#     l = []
#     for i in obj:
#         l.append(i['name'])
#         return l

# convert(movies.iloc[0].genres)

```
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[23], line 1
----> 1 convert(movies.iloc[0].genres)

Cell In[22], line 4
      2 l = []
      3 for i in obj:
----> 4     l.append(i['name'])
      5     return l

TypeError: string indices must be integers
```

- **This error is expected: the value is a *string* that looks like a list of dicts, not an actual list.**
- **To fix it we need to parse the string into a real Python list.**
- **We will use Python's `ast` module for this (`ast.literal_eval`).**

In [None]:
import ast
def convert(obj):
    """Extract every "name" value from a stringified list of dicts.

    Args:
        obj: String containing a Python-literal list of dicts, each of which
            has a "name" key.

    Returns:
        A list with the "name" value of each entry, in the original order.
    """
    parsed_entries = ast.literal_eval(obj)
    return [entry["name"] for entry in parsed_entries]

In [None]:
# Parse the stringified genre / keyword lists into plain lists of names.
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

In [None]:
movies.head()

In [None]:

def convert2(obj, limit=3):
    """Return the names of the first ``limit`` entries of a stringified list of dicts.

    Args:
        obj: String containing a Python-literal list of dicts, each of which
            has a "name" key.
        limit: Maximum number of names to keep, counted from the start of the
            list. Defaults to 3. ``None`` keeps every entry. Expected to be
            non-negative.

    Returns:
        A list of at most ``limit`` "name" values, in the original order.

    Raises:
        ValueError, SyntaxError: If ``obj`` is not a valid Python literal.
        KeyError: If one of the kept entries has no "name" key.
    """
    entries = ast.literal_eval(obj)
    # Slicing replaces the manual counter/break and is safe on short lists.
    return [entry["name"] for entry in entries[:limit]]

In [None]:
# Keep only the first few cast names per movie.
movies['cast'] = movies['cast'].map(convert2)

In [None]:
movies.head()

In [None]:
def fetch_director(obj):
    """Return the first crew member whose job is 'Director', wrapped in a list.

    Args:
        obj: String containing a Python-literal list of dicts, each with
            "job" and "name" keys.

    Returns:
        A one-element list holding the first director's name, or an empty
        list when no entry has the job 'Director'.
    """
    for member in ast.literal_eval(obj):
        if i_is_director := (member['job'] == 'Director'):
            # Only the first director is kept; later ones are ignored.
            return [member["name"]]
    return []

In [None]:
# Reduce the crew column to a list holding just the director's name.
movies['crew'] = movies['crew'].map(fetch_director)

In [None]:
movies.head()

In [None]:
movies['overview'][0]

In [None]:
# Split each overview on whitespace so it becomes a list of words.
movies['overview'] = movies['overview'].apply(str.split)

In [None]:
movies.head()

## Now we need to remove spaces between the names to convert them into a single word

- **Science Fiction -> ScienceFiction**
- **Johnny Depp -> JohnnyDepp**

In [None]:
# Remove the spaces inside every entry ("Science Fiction" -> "ScienceFiction")
# so that each multi-word name becomes a single word.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

In [None]:
movies.head()

## tags = overview + genres + keywords + cast + crew 

Concatenate the five list columns into a single `tags` list per movie.

In [None]:
# Every operand column holds one Python list per row at this point, so `+`
# concatenates the lists row by row into a single list of words.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [None]:
movies.head()

In [None]:
# Build an independent frame rather than a slice of `movies`: assigning to
# `final_df['tags']` afterwards would otherwise raise SettingWithCopyWarning.
# reset_index returns a new object and renumbers rows 0..n-1, so index labels
# match the row positions used to index the `similarity` matrix later on.
final_df = movies[['movie_id', 'title', 'tags']].reset_index(drop=True)

In [None]:
final_df

In [None]:
# Collapse each list of words into one space-separated string.
final_df['tags'] = final_df['tags'].apply(" ".join)

In [None]:
final_df.head()

In [None]:
# Lower-case every tag string so word matching ignores capitalisation.
final_df['tags'] = final_df['tags'].str.lower()

In [None]:
final_df.head()

## apply stemming 

**['love', 'loving', 'loved']** <br/>
**['love', 'love', 'love']**

In [None]:
import nltk

In [None]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

In [None]:
def stem(text):
    """Stem every whitespace-separated word of ``text``.

    Uses the notebook-level PorterStemmer instance ``ps``.

    Args:
        text: String of words separated by whitespace.

    Returns:
        The stemmed words joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [None]:
# Replace every word in the tags with its Porter stem.
final_df['tags'] = final_df['tags'].map(stem)

# Using Scikit Learn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: the vocabulary is capped at 5000 terms
# (max_features) and scikit-learn's built-in English stop-word list is removed.
cv = CountVectorizer(max_features=5000, stop_words='english')

In [None]:
vectors = cv.fit_transform(final_df['tags']).toarray() # convert tags into a matrix of token counts

In [None]:
cv.get_feature_names_out() # get the feature names (words) from the CountVectorizer

In [None]:
"""
['love', 'loving', 'loved']
-- apply stemming (cell no. -> 37)
['love', 'love', 'love']
"""

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarity = cosine_similarity(vectors) # calculate the cosine similarity between the vectors

In [None]:
similarity.shape # check the shape of the similarity matrix

In [None]:
similarity[0] # check the similarity scores for the particular movie

In [None]:
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x: x[1])[1:6] # get the top 5 similar movies

In [None]:
def recomended(movie):
    """Print the titles of the five movies most similar to ``movie``.

    Reads the notebook-level ``final_df`` (needs a ``title`` column) and
    ``similarity`` (square matrix whose rows and columns follow the row order
    of ``final_df``).

    Args:
        movie: Exact title to look up; the comparison is case-sensitive.

    Returns:
        None. Titles are printed best match first. If the title is not in
        ``final_df`` a notice is printed instead of raising.
    """
    # Look the movie up by row *position*: `similarity` is indexed by
    # position, and index labels stop matching positions once rows are dropped.
    matches = (final_df['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        print(f"Movie '{movie}' not found in the dataset.")
        return

    movie_pos = int(matches[0])  # first match wins when titles repeat
    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), reverse=True, key=lambda pair: pair[1])

    # Exclude the query movie explicitly instead of assuming it sorts first;
    # another row with an equal score could otherwise take its place.
    top_positions = [pos for pos, _ in ranked if pos != movie_pos][:5]

    for pos in top_positions:
        print(final_df.iloc[pos]['title'])

In [None]:
recomended('Batman') # test the recommender system with a movie title

In [None]:
import pickle

In [None]:
# Save the final dataframe to a pickle file. The context manager guarantees
# the file handle is flushed and closed even if pickling raises.
with open('movies.pkl', 'wb') as pkl_file:
    pickle.dump(final_df, pkl_file)