### Movie Recommander System project

In [28]:
import pandas as pd 
import numpy as np

In [29]:
movies = pd.read_csv("dataset/movies.csv")
credits = pd.read_csv("dataset/credits.csv")

In [30]:
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [31]:
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [32]:
# Merging between movies dataset and credits one
movies = movies.merge(credits,left_on="id",right_on="movie_id")

In [33]:
# Select the key features for the project
movies = movies[["id","genres","keywords","title_x","overview","cast","crew"]].rename(columns={"title_x":"title"})

In [34]:
# Kicking out records with null value
movies.isnull().any()
movies = movies.dropna()

In [35]:
movies = movies.drop_duplicates()

In [36]:
# Converting Json strings in genres and keywords attributes to ordinary list

import json

def convert(json_ch):
    res = []
    tab = json.loads(json_ch)
    for elem in tab:
        res.append(elem["name"])
    return res

movies["genres"] = movies["genres"].apply(convert)
movies["keywords"] = movies["keywords"].apply(convert)

In [37]:
# Targeting cast attribute

def choose3(json_ch):
    res = []
    L = json.loads(json_ch)
    count = 0 
    for x in L:
        if count < 3 :
            res.append(x["name"])
            count+=1
        else:
            break
    return res

movies["cast"] = movies["cast"].apply(choose3)

In [38]:
# Targeting crew attribute

def choose_director(json_ch):
    res = []
    L  = json.loads(json_ch)
    for x in L : 
        if x["job"] == "Director":
            res.append(x["name"])
    return res

movies["crew"] = movies["crew"].apply(choose_director)

In [39]:
#Targeting overview attribute

def aslist(ch):
    return ch.split(" ")

movies["overview"] = movies["overview"].apply(aslist)

In [40]:
movies.head(5)

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [41]:
# Eliminating spaces in genres, keywords, cast and crew

def nospace(L):
    return [elem.replace(" ","") for elem in L]

for elem in ("genres" , "keywords" , "cast" , "crew"):
    movies[elem] = movies[elem].apply(nospace)

In [42]:
# Creating tag attribute & adding new dataframe
movies["tag"] = movies["overview"] + movies["genres"] + movies["keywords"] + movies["cast"] + movies["crew"]
new_df = movies[["id" , "title" , "tag"]]
new_df.head(5)

new_df["tag"] = new_df["tag"].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tag"] = new_df["tag"].apply(lambda x:" ".join(x))


In [43]:
new_df["tag"] = new_df["tag"].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tag"] = new_df["tag"].apply(lambda x: x.lower())


In [44]:
new_df.head()

Unnamed: 0,id,title,tag
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [45]:
new_df.iloc[0]["tag"]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [46]:
import sys
import spacy

# Load the language model
nlp = spacy.load("en_core_web_sm")

def remove_stopwords(text):
    # Process the text
    doc = nlp(text)
    # Filter out stop words
    filtered_tokens = [token.text for token in doc if not token.is_stop]
    # Join the filtered tokens back into a string
    filtered_text = " ".join(filtered_tokens)
    return filtered_text

# Removing stopwords from tags 
new_df["tag"] = new_df["tag"].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tag"] = new_df["tag"].apply(remove_stopwords)


In [47]:
new_df.iloc[0]["tag"]

'22nd century , paraplegic marine dispatched moon pandora unique mission , torn following orders protecting alien civilization . action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [48]:
import string

def remove_punctuation_and_extra_spaces(text):
    # Remove punctuation
    text_without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    # Remove extra spaces and return the cleaned text
    return ' '.join(text_without_punctuation.split())

new_df["tag"] = new_df["tag"].apply(remove_punctuation_and_extra_spaces)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tag"] = new_df["tag"].apply(remove_punctuation_and_extra_spaces)


In [52]:
# Tokenization

def tokenize(text):
    doc = nlp(text)    
    tokens = [token.text for token in doc]
    return tokens

new_df["tag"] = new_df["tag"].apply(tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tag"] = new_df["tag"].apply(tokenize)


In [54]:
new_df["tag"].head(5)

0    [22nd, century, paraplegic, marine, dispatched...
1    [captain, barbossa, long, believed, dead, come...
2    [cryptic, message, bond, past, sends, trail, u...
3    [following, death, district, attorney, harvey,...
4    [john, carter, war, weary, military, captain, ...
Name: tag, dtype: object

In [55]:
# Normalization process with lemmatisation

def lemmatize(token):
    doc = nlp(" ".join(token))    
    lemmatized_tokens = [token.lemma_ for token in doc if token.lemma_ != ""]
    return lemmatized_tokens

new_df["tag"] = new_df["tag"].apply(lemmatize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tag"] = new_df["tag"].apply(lemmatize)


In [56]:
new_df.iloc[0]["tag"]

['22nd',
 'century',
 'paraplegic',
 'marine',
 'dispatch',
 'moon',
 'pandora',
 'unique',
 'mission',
 'tear',
 'follow',
 'order',
 'protect',
 'alien',
 'civilization',
 'action',
 'adventure',
 'fantasy',
 'sciencefiction',
 'cultureclash',
 'future',
 'spacewar',
 'spacecolony',
 'society',
 'spacetravel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alienplanet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'loveaffair',
 'antiwar',
 'powerrelation',
 'mindandsoul',
 '3d',
 'samworthington',
 'zoesaldana',
 'sigourneyweaver',
 'jamescameron']