In [None]:
import numpy as np
import pandas as pd 
import spacy
from spacy import displacy
import ast
import re
import nltk
from nltk.stem import WordNetLemmatizer
import sklearn
import calendar

In [None]:
# Read the movie metadata and the credits CSV files (in that order)
# from the current working directory.
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv')
)

In [None]:
# Join the movie metadata with the credits on the shared 'title' column
# (pandas performs an inner join by default).
# NOTE(review): if titles are not unique in either file, this join produces one
# row per matching pair — confirm whether an id-based key would be safer.
movies = pd.merge(movies, credits, on='title')

In [None]:
# Narrow the frame down to the columns of interest.
movies = movies.loc[:, [
    'id', 'title', 'overview', 'genres', 'keywords',
    'cast', 'crew', 'spoken_languages', 'release_date',
]]

In [None]:
# Discard every row that has a missing value in any of the kept columns.
movies = movies.dropna()

In [None]:
# Parse the raw release_date values into pandas datetimes.
movies = movies.assign(release_date=pd.to_datetime(movies['release_date']))

In [None]:
# Derive the release year from the parsed release date.
movies = movies.assign(year=movies['release_date'].dt.year)

In [None]:
# The full date is no longer needed once the year has been extracted.
movies = movies.drop(columns='release_date')

In [None]:
# Bucket the release years into 20-year ranges by floor-dividing by 20.
# Re-running this cell divides the values again, so run it once per fresh kernel.
movies['year'] = movies['year'].floordiv(20)

In [None]:

# Store the year bucket using pandas' string dtype, then display the dtype
# to confirm the conversion.
movies['year'] = movies['year'].astype('string')
movies['year'].dtype


In [None]:
# Show the number of missing values per column.
movies.isna().sum()

In [None]:
# Remove any rows that still contain missing values.
movies = movies.dropna()

In [None]:
def Extract_tags(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    Parameters
    ----------
    obj : str
        Text of a Python-literal list of dicts, each having a 'name' key.

    Returns
    -------
    list
        The 'name' values in their original order.

    Raises
    ------
    ValueError, SyntaxError
        If ``obj`` is not a plain Python literal.
    """
    # literal_eval only accepts literals, so text read from a data file cannot
    # execute arbitrary code the way eval() would.
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
# Replace each stringified genre list with the list of genre names.
movies['genres'] = movies['genres'].map(Extract_tags)

In [None]:
# Replace each stringified keyword list with the list of keyword names.
movies['keywords'] = movies['keywords'].map(Extract_tags)

In [None]:
def Extract_cast(obj, limit=4):
    """Return the names of the first ``limit`` entries of a stringified cast list.

    Parameters
    ----------
    obj : str
        Text of a Python-literal list of dicts, each having a 'name' key.
    limit : int, optional
        Maximum number of names to return. Defaults to 4.

    Returns
    -------
    list
        Up to ``limit`` 'name' values, in their original order.

    Raises
    ------
    ValueError, SyntaxError
        If ``obj`` is not a plain Python literal.
    """
    # literal_eval only accepts literals, so text read from a data file cannot
    # execute arbitrary code the way eval() would.
    cast_entries = ast.literal_eval(obj)
    return [member['name'] for member in cast_entries[:limit]]

In [None]:
# Replace each stringified cast list with the names of its leading cast members.
movies['cast'] = movies['cast'].map(Extract_cast)

In [None]:
import pickle

# Persist the id/overview columns as they are at this point in the notebook.
Id_overview = movies[['id','overview']]
# The context manager guarantees the file is flushed and closed after the dump.
with open("ID_overview.pkl", "wb") as overview_file:
    pickle.dump(Id_overview, overview_file)

In [None]:
def Extract_crew_director(obj):
    """Return a one-element list with the first director's name, or an empty list.

    Parameters
    ----------
    obj : str
        Text of a Python-literal list of dicts, each having 'job' and 'name' keys.

    Returns
    -------
    list
        ``[name]`` of the first entry whose 'job' equals "Director";
        ``[]`` when no entry matches.

    Raises
    ------
    ValueError, SyntaxError
        If ``obj`` is not a plain Python literal.
    """
    # literal_eval only accepts literals, so text read from a data file cannot
    # execute arbitrary code the way eval() would.
    for member in ast.literal_eval(obj):
        if member['job'] == "Director":
            # Only the first director is kept.
            return [member['name']]
    return []

In [None]:
# Replace each stringified crew list with a list holding the director's name (if any).
movies['crew'] = movies['crew'].map(Extract_crew_director)

In [None]:
import pickle

# Persist the id/crew columns (crew now holds the extracted director names).
id_crew = movies[['id','crew']]
# Dump id_crew itself — the file is named for the crew data, so writing the
# overview frame here would silently store the wrong table.
with open("ID_CREW.pkl", "wb") as crew_file:
    pickle.dump(id_crew, crew_file)

In [None]:
# Load the large and the small English spaCy pipelines (large first);
# both model packages must be installed.
nlp, nlp_sm = spacy.load("en_core_web_lg"), spacy.load("en_core_web_sm")

In [None]:
from collections import Counter

def NRE_overview(obj):
    """Return the most frequent named-entity tokens found in a text.

    The text is run through the module-level ``nlp`` pipeline. Tokens whose
    entity type is PERSON, ORG, NORP, EVENT or WORK_OF_ART are counted, and
    the texts of the 15 most common ones are returned.

    Parameters
    ----------
    obj : str
        Text to analyse.

    Returns
    -------
    list
        Up to 15 distinct token texts, most frequent first; ties keep the
        order of first appearance.
    """
    wanted_labels = {"PERSON", "ORG", "NORP", "EVENT", "WORK_OF_ART"}
    # Count every matching token exactly once. Iterating over the characters
    # of the label string would weight each token by the length of its label
    # name and distort the frequency ranking.
    entity_tokens = [token.text for token in nlp(obj) if token.ent_type_ in wanted_labels]
    return [text for text, _ in Counter(entity_tokens).most_common(15)]

In [None]:
# Replace each overview text with the list of its most frequent named-entity tokens.
movies['overview'] = movies['overview'].map(NRE_overview)

In [None]:
# Remove the spaces inside every name so that a multi-word name becomes a single token.
movies = movies.assign(**{
    column: movies[column].apply(lambda names: [name.replace(" ", "") for name in names])
    for column in ('genres', 'keywords', 'crew', 'cast')
})

In [None]:
# Concatenate the per-movie token lists into one combined 'tags' list.
movies = movies.assign(
    tags=(
        movies['overview']
        + movies['cast']
        + movies['crew']
        + movies['keywords']
        + movies['genres']
    )
)

In [None]:
# Take an explicit copy: later cells assign to columns of this frame, and writing
# into a slice of `movies` raises SettingWithCopyWarning and leaves it ambiguous
# whether the write affects a view or a copy.
Updated_dataframe = movies[['id','title','tags','year']].copy()

In [None]:
# Collapse each tag list into one space-separated string.
Updated_dataframe['tags'] = Updated_dataframe['tags'].map(" ".join)

In [None]:
# Append the year bucket to the tag string as one more space-separated token.
Updated_dataframe['tags'] = Updated_dataframe['tags'].add(" ").add(Updated_dataframe['year'])

In [None]:
# Lower-case every tag string.
Updated_dataframe['tags'] = Updated_dataframe['tags'].map(str.lower)

In [None]:
# Spot-check the tag string of the row at position 7.
Updated_dataframe['tags'].iloc[7]

In [None]:
# Both objects rely on NLTK data packages. On a fresh machine fetch them first:
#   nltk.download('stopwords')
#   nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
def remove_stop_words(obj):
    """Lower-case a text, drop stop words and lemmatize the remaining words.

    Uses the module-level ``stopwords`` collection and ``lemmatizer`` object.

    Parameters
    ----------
    obj : str
        Whitespace-separated text.

    Returns
    -------
    str
        The lemmatized, non-stop words joined by single spaces.
    """
    # Build the lookup set once per call instead of once per word.
    stop_set = set(stopwords)
    kept_words = [
        lemmatizer.lemmatize(word)
        for word in obj.lower().split()
        if word not in stop_set
    ]
    return ' '.join(kept_words)

In [None]:
# Strip stop words from, and lemmatize, every tag string.
Updated_dataframe['tags'] = Updated_dataframe['tags'].map(remove_stop_words)

In [None]:
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance; the stem() helper calls ps.stem on each word.
ps = PorterStemmer()

In [None]:
def stem(text):
    """Stem every whitespace-separated word of ``text`` with the module-level ``ps``.

    Parameters
    ----------
    text : str
        Whitespace-separated text.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [None]:
# Reduce every word of every tag string to its stem.
Updated_dataframe['tags'] = Updated_dataframe['tags'].map(stem)

In [None]:
# Plain Python list of the tag strings, in frame order, for the vectorizer.
list_tags = Updated_dataframe['tags'].tolist()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vocabulary on the tag strings (default settings) and encode
# each tag string as one row of the resulting matrix.
vectorizer = TfidfVectorizer() 
vectors = vectorizer.fit_transform(list_tags)

In [None]:
# Report the shape of the TF-IDF matrix: rows are tag strings, columns are vocabulary terms.
print("n_samples: %d, n_features: %d" % vectors.shape)

In [None]:
# Build a dense (feature x movie) TF-IDF table and keep only the strongest features.
tf_idf = pd.DataFrame(vectors.todense())
tf_idf.columns = vectorizer.get_feature_names_out()
tfidf_matrix = tf_idf.T
# Rows are the extracted features/tags; columns are the movies, labelled by title.
# Taking the titles straight from the frame keeps the labels in step with the
# number of vectorized rows instead of relying on a hard-coded row count.
tfidf_matrix.columns = Updated_dataframe['title'].tolist()
# Total TF-IDF weight of each feature across all movies, used for ranking.
tfidf_matrix['count'] = tfidf_matrix.sum(axis=1)
# Keep the 1000 highest-weighted features.
# TODO: experiment with other cut-offs (e.g. the top 100).
tfidf_matrix = tfidf_matrix.sort_values(by ='count', ascending=False)[:1000]
# Preview the matrix without the helper 'count' column.
print(tfidf_matrix.drop(columns=['count']))

In [None]:
# The helper 'count' column was only needed for ranking the features; remove it.
tfidf_matrix = tfidf_matrix.drop(columns='count')

In [None]:
# Flip the table back so that each row is a movie and each column a feature.
Vectorirsed_data = tfidf_matrix.T

In [None]:
# Inspect the first row's feature weights in ascending order (largest weights appear last).
Vectorirsed_data.iloc[0].sort_values()

In [None]:
# Plain NumPy array of the movie-by-feature weights.
numpy_vector_form_data = Vectorirsed_data.to_numpy()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between all rows of the array: entry [i, j] compares row i with row j.
interaction_vector_movies = cosine_similarity(numpy_vector_form_data)

In [None]:
def recommend(movie):
    """Print the id and title of up to ten movies most similar to ``movie``.

    Reads the module-level ``Updated_dataframe`` (columns 'id' and 'title')
    and ``interaction_vector_movies`` (square similarity matrix whose rows
    follow the frame's row order).

    Parameters
    ----------
    movie : str
        Exact title to look up; the first matching row is used.

    Raises
    ------
    IndexError
        If no row has the given title.
    """
    # The similarity matrix is addressed by row position, while the frame's
    # index labels can have gaps after rows were dropped, so resolve the
    # title to a position rather than an index label.
    positions = Updated_dataframe['title'].eq(movie).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"movie title not found: {movie!r}")
    index_movie = int(positions[0])

    ranked = sorted(
        enumerate(interaction_vector_movies[index_movie]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Exclude the query movie explicitly instead of assuming it sorts first,
    # and slice so that small data sets do not overrun the list.
    neighbours = [position for position, _ in ranked if position != index_movie][:10]
    for position in neighbours:
        row = Updated_dataframe.iloc[position]
        print(row.id)
        print(row.title)


In [None]:
# Example query: print the recommendations for one title.
recommend(movie='Batman Begins')

In [None]:
import pickle

# Persist the processed movie table; the context manager closes the file after the dump.
with open('Movies_re.pkl', 'wb') as movies_file:
    pickle.dump(Updated_dataframe, movies_file)

In [None]:
# Persist the similarity matrix; the context manager closes the file after the dump.
with open("interaction_vector_movies.pkl", 'wb') as similarity_file:
    pickle.dump(interaction_vector_movies, similarity_file)