In [1]:
import numpy as np
import pandas as pd

## reading the datasets

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv') 

## merge both datasets into one

In [3]:
movies = movies.merge(credits,on="title")

## cleaning the dataset according the the requirement

In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

## final dataset to be worked on

In [5]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

## creating a new dataset using the cleaned dataset

### checking null value

In [6]:
movies.dropna(inplace=True)

### checking duplicated data

In [7]:
movies.duplicated().sum()

0

## converting the columns according to requirment

In [8]:
import ast
ast.literal_eval

<function ast.literal_eval(node_or_string)>

In [9]:
def convert(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [10]:
movies['genres']=movies['genres'].apply(convert)

In [11]:
movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])

In [12]:
movies['tags']=movies['genres']

In [13]:
new_df=movies[['movie_id','title','tags']]

In [14]:
new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))


In [15]:
new_df['tags']=new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:x.lower())


## converting all tags to vector using bag of words technique

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [17]:
vector = cv.fit_transform(new_df['tags']).toarray()

In [18]:
vector

array([[1, 1, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
similarity = cosine_similarity(vector)

In [21]:
def recommend(movie):
    index = new_df[new_df['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:7]:
        print(new_df.iloc[i[0]].title)

## Recommend

In [22]:
recommend('Batman Begins')

The Gunman
The Punisher
Antitrust
Brick Mansions
The Newton Boys
Cradle 2 the Grave


## using the dataset for frontend

In [23]:
import pickle

In [24]:
pickle.dump(new_df,open('movie_list.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))

In [25]:
new_df['title'].values

array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Signed, Sealed, Delivered', 'Shanghai Calling',
       'My Date with Drew'], dtype=object)

In [28]:
pickle.dump(new_df.to_dict( ),open('movie_dict_genres.pkl','wb'))

In [29]:
pickle.dump(similarity,open('similarity_genres.pkl','wb'))