# Sistema de Recomendação baseado em Conteúdo

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/metadata_clean1.csv', low_memory=False)
df.head()

Unnamed: 0,id,title,genres,runtime,vote_average,vote_count,overview,year
0,862,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995
1,8844,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...,1995
2,15602,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,A family wedding reignites the ancient feud be...,1995
3,31357,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,"Cheated on, mistreated and stepped on, the wom...",1995
4,11862,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,Just when George Banks has recovered from his ...,1995


In [3]:
from ast import  literal_eval
print(type(literal_eval(df.iloc[0]['genres'])))

<class 'list'>


In [4]:
df['genres'] = df['genres'].apply(literal_eval)
s = df.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)
s.name='genre'
gen_df = df.drop(columns=['genres']).join(s)
gen_df.head()



Unnamed: 0,id,title,runtime,vote_average,vote_count,overview,year,genre
0,862,Toy Story,81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,animation
0,862,Toy Story,81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,comedy
0,862,Toy Story,81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,family
1,8844,Jumanji,104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...,1995,adventure
1,8844,Jumanji,104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...,1995,fantasy


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')

df['overview'] = df['overview'].fillna('')

tfidf_matrix = tfidf.fit_transform(df['overview'])

tfidf_matrix.shape

(2000, 13918)

In [6]:
from sklearn.metrics.pairwise import linear_kernel

# Computa a similaridade do coseno
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
# Contruir um mapeamento dos indices e titulos de filmes
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [8]:
def content_recommender(title, cosine_sm=cosine_sim, df=df, indices=indices):
    
    # Obtem o indice do filme do titulo encontrado
    idx = indices[title]

    # Obtem a similaridade de todos os filmes com a entrada
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Ordena baseado no score da similaridade do cosseno
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Obtem scores dos 10 filmes mais similares
    sim_scores = sim_scores[1:11]

    # Obtem os indices
    movie_indices = [i[0] for i in sim_scores]

    # retorna os dez mais similares
    return df['title'].iloc[movie_indices]

In [9]:
content_recommender('The Lion King')

892               The Wizard of Oz
1741                Prince Valiant
42                     Restoration
1772             Quest for Camelot
515      Robin Hood: Men in Tights
1534            Kull the Conqueror
110            Rumble in the Bronx
1783              A Perfect Murder
697                        Flipper
89      The Journey of August King
Name: title, dtype: object

## Recomendação baseado em metadados

In [10]:
cred_df = pd.read_csv('data/credits.zip')
key_df = pd.read_csv('data/keywords.csv')

In [11]:
print(cred_df.shape)
cred_df.head(2)

(45476, 3)


Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


In [12]:
print(key_df.shape)
key_df.head(2)

(46419, 2)


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [13]:
def clean_ids(x):
    try:
        return int(x)
    except:
        return np.nan

df['id'] = df['id'].apply(clean_ids)
df = df[df['id'].notnull()]


In [14]:
# Converter os ids para inteiro
df['id'] = df['id'].astype('int')
key_df['id'] = key_df['id'].astype('int')
cred_df['id'] = cred_df['id'].astype('int')

# Merge de dados
df = df.merge(cred_df, on='id')
df = df.merge(key_df, on='id')

df.head(3)

Unnamed: 0,id,title,genres,runtime,vote_average,vote_count,overview,year,cast,crew,keywords
0,862,Toy Story,"[animation, comedy, family]",81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,"[adventure, fantasy, family]",104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,Grumpier Old Men,"[romance, comedy]",101.0,6.5,92.0,A family wedding reignites the ancient feud be...,1995,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


In [15]:
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']

for feature in features:
    df[feature] = df[feature].apply(str)
    df[feature] = df[feature].apply(literal_eval)

In [16]:
df.iloc[0]['crew'][0]

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [17]:
def get_director(x):
    for crew_member in x:
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return np.nan

In [18]:
df['director'] = df['crew'].apply(get_director)

In [19]:
df['director'].head(3)

0    John Lasseter
1     Joe Johnston
2    Howard Deutch
Name: director, dtype: object

In [20]:
def generate_list(x):
    if isinstance(x, list):
        names = [i['name']for i in x]
        if len(names) > 3:
            names = names[:3]
        return names
    return []

In [21]:
df['cast'] = df['cast'].apply(generate_list)
df['keywords'] = df['keywords'].apply(generate_list)

In [22]:
df['genres'] = df['genres'].apply(lambda x: x[:3])

In [23]:
df[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[animation, comedy, family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[adventure, fantasy, family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[romance, comedy]"


In [24]:
# remove espaços e converte em lower case
def sanitize(x):
    if isinstance(x, list):
        return [str.lower(i.replace(' ', '')) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(' ', ''))
        else:
            return ''

for feature in ['cast', 'director', 'genres', 'keywords']:
    df[feature] = df[feature].apply(sanitize)

In [25]:
df.head(1)

Unnamed: 0,id,title,genres,runtime,vote_average,vote_count,overview,year,cast,crew,keywords,director
0,862,Toy Story,"[animation, comedy, family]",81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy]",johnlasseter


In [26]:
# junta todas string como um texto
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + ' '.join(x['director']) + ' ' + ' '.join(x['genres'])

df['soup'] = df.apply(create_soup, axis=1)
df.iloc[0]['soup']

'jealousy toy boy tomhanks timallen donrickles j o h n l a s s e t e r animation comedy family'

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

# Computa a similaridade cosseno (equivalente ao produto  to dot product for tf-idf vectors)
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [29]:
df = df.reset_index()
indices2 = pd.Series(df.index, index=df['title'])

In [30]:
content_recommender('The Lion King', cosine_sim2, df, indices2)

892                  The Apartment
1741        The Last Days of Disco
42                     Restoration
1772                     Black Dog
515      Robin Hood: Men in Tights
1534         In the Company of Men
110            Rumble in the Bronx
1783                     Lawn Dogs
697                      The Quest
89      The Journey of August King
Name: title, dtype: object