### Movie Recommander System project

In [1]:
import pandas as pd 
import numpy as np

In [2]:
movies = pd.read_csv("dataset/movies.csv")
credits = pd.read_csv("dataset/credits.csv")

In [3]:
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [4]:
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [5]:
# Merging between movies dataset and credits one
movies = movies.merge(credits,left_on="id",right_on="movie_id")

In [6]:
# Select the key features for the project
movies = movies[["id","genres","keywords","title_x","overview","cast","crew"]].rename(columns={"title_x":"title"})

In [7]:
# Kicking out records with null value
movies.isnull().any()
movies = movies.dropna()

In [8]:
movies = movies.drop_duplicates()

In [9]:
# Converting Json strings in genres and keywords attributes to ordinary list

import json

def convert(json_ch):
    res = []
    tab = json.loads(json_ch)
    for elem in tab:
        res.append(elem["name"])
    return res

movies["genres"] = movies["genres"].apply(convert)
movies["keywords"] = movies["keywords"].apply(convert)

In [10]:
# Targeting cast attribute

def choose3(json_ch):
    res = []
    L = json.loads(json_ch)
    count = 0 
    for x in L:
        if count < 3 :
            res.append(x["name"])
            count+=1
        else:
            break
    return res

movies["cast"] = movies["cast"].apply(choose3)

In [11]:
# Targeting crew attribute

def choose_director(json_ch):
    res = []
    L  = json.loads(json_ch)
    for x in L : 
        if x["job"] == "Director":
            res.append(x["name"])
    return res

movies["crew"] = movies["crew"].apply(choose_director)

In [12]:
#Targeting overview attribute

def aslist(ch):
    return ch.split(" ")

movies["overview"] = movies["overview"].apply(aslist)

In [13]:
movies.head(5)

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [14]:
# Eliminating spaces in genres, keywords, cast and crew

def nospace(L):
    return [elem.replace(" ","") for elem in L]

for elem in ("genres" , "keywords" , "cast" , "crew"):
    movies[elem] = movies[elem].apply(nospace)

In [15]:
# Creating tag attribute & adding new dataframe
movies["tag"] = movies["overview"] + movies["genres"] + movies["keywords"] + movies["cast"] + movies["crew"]
new_df = movies[["id" , "title" , "tag"]]
new_df.head(5)

new_df["tag"] = new_df["tag"].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tag"] = new_df["tag"].apply(lambda x:" ".join(x))


In [16]:
new_df["tag"] = new_df["tag"].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tag"] = new_df["tag"].apply(lambda x: x.lower())


In [17]:
new_df.head()

Unnamed: 0,id,title,tag
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
import nltk

class StemmedCountVectorizer(CountVectorizer):

    def build_analyzer(self):
        analyzer = super().build_analyzer()
        ps = PorterStemmer()  # Corrected the class name here
        return lambda doc: [ps.stem(word) for word in analyzer(doc)]

In [19]:
new_df.shape

(4800, 3)

In [21]:
# We make a choice of 5000 relevant words so it doesn't crash in future computations
SVC = StemmedCountVectorizer(stop_words='english',max_features=5000)

BOW_matrix = SVC.fit_transform(new_df["tag"]).toarray()
vocabulary = SVC.get_feature_names_out()

print("vocabulary length is",len(vocabulary))
print(BOW_matrix)

vocabulary length is 5000
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [26]:
import string

def remove_punctuation_and_extra_spaces(text):
    # Remove punctuation
    text_without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    # Remove extra spaces and return the cleaned text
    return ' '.join(text_without_punctuation.split())

new_df["tag"] = new_df["tag"].apply(remove_punctuation_and_extra_spaces)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tag"] = new_df["tag"].apply(remove_punctuation_and_extra_spaces)


In [28]:
# We will stem the tags

ps =  PorterStemmer()

def stem(text):
    res=[]
    words=text.split(" ")
    for word in words:
        res.append(ps.stem(word))
    return " ".join(res) 

new_df["tag"] = new_df["tag"].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tag"] = new_df["tag"].apply(stem)


In [32]:
# We will delete the stopwords from the text
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

stop_words = set(stopwords.words('english'))

def remove_SW(text):
    textList = text.split(" ")
    text_tokens = word_tokenize(text)
    filtered_text = " ".join([word for word in text_tokens if word not in stop_words])
    return filtered_text

new_df["tag"] = new_df["tag"].apply(remove_SW)

original text is : in the 22nd centuri a parapleg marin is dispatch to the moon pandora on a uniqu mission but becom torn between follow order and protect an alien civil action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron
filtered text is : 22nd centuri parapleg marin dispatch moon pandora uniqu mission becom torn follow order protect alien civil action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron


In [39]:
# Let's compute similarities 
from sklearn.metrics.pairwise import cosine_similarity

n = len(BOW_matrix)

def recommend(movie):
    """
    This function provides five recommended movies based on the cosine similarity
    metric using a bag-of-words representation.
    """
    # Get the index of the movie in the DataFrame
    try:
        movie_index = new_df.index[new_df['title'] == movie][0]
    except IndexError:
        return "Movie not found in the dataset."

    # Calculate cosine similarities for all movies with the specified movie
    similarities = [
        (index, cosine_similarity(BOW_matrix[movie_index].reshape(1, -1), BOW_matrix[index].reshape(1, -1))[0][0])
        for index in range(n) if index != movie_index
    ]
    
    # Sort by similarity in descending order
    similarities = sorted(similarities, reverse=True, key=lambda elem: elem[1])

    # Get the top 5 most similar movies
    recommended_indices = [index for index, similarity in similarities[:5]]
    recommended_titles = new_df.iloc[recommended_indices]["title"].tolist()

    return recommended_titles

# Testing the function for 
print(recommend("Avatar"))


['Aliens', 'Falcon Rising', 'Independence Day', 'Titan A.E.', 'Aliens vs Predator: Requiem']


In [None]:
# Recommandation system during the initialization






