# Sistema de Recomendaçã baseado em Conhecimento

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('data/movies_metadata.csv', low_memory=False).head(2000)
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [3]:
df = df[['id', 'title', 'genres', 'release_date', 'runtime', 'vote_average', 'vote_count', 'overview']]
df.head()

Unnamed: 0,id,title,genres,release_date,runtime,vote_average,vote_count,overview
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0,Just when George Banks has recovered from his ...


In [4]:
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [5]:
def convert_int(x):
    try:
        return int(x)
    except:
        return 0

df['year'] = df['year'].apply(convert_int)
df.drop(columns=['release_date'], inplace=True)
df.head(3)

Unnamed: 0,id,title,genres,runtime,vote_average,vote_count,overview,year
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...,1995
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,A family wedding reignites the ancient feud be...,1995


In [6]:
df.iloc[0]['genres']

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [7]:
from ast import literal_eval

a= '[1,2,3]'
print(type(a))

b = literal_eval(a)
print(type(b))

<class 'str'>
<class 'list'>


In [8]:
df['genres'] = df['genres'].fillna('[]')
df['genres'] = df['genres'].apply(literal_eval)
df['genres'] = df['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])
df.head(1)

Unnamed: 0,id,title,genres,runtime,vote_average,vote_count,overview,year
0,862,Toy Story,"[animation, comedy, family]",81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995


In [9]:
print(type(df.iloc[0]['genres']))

<class 'list'>


In [10]:
s = df.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)
s.name='genre'
gen_df = df.drop(columns=['genres']).join(s)
gen_df.head()



Unnamed: 0,id,title,runtime,vote_average,vote_count,overview,year,genre
0,862,Toy Story,81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,animation
0,862,Toy Story,81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,comedy
0,862,Toy Story,81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,family
1,8844,Jumanji,104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...,1995,adventure
1,8844,Jumanji,104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...,1995,fantasy


In [11]:
def build_chart(gen_df, percentil=0.8):
    print('Entre com gênereo preferido: ')
    genre = input()

    print('Entre com a menor duração: ')
    low_time = int(input())

    print('Entre com a maior duração: ')
    height_time = int(input())

    print('Entre com o menor ano: ')
    low_year = int(input())

    print('Entre com o maior duração: ')
    heigth_year = int(input())

    movies = gen_df.copy()

    movies = movies[(movies['genre'] == genre) &
                    (movies['runtime'] >= low_time) & 
                    (movies['runtime'] <= height_time) &
                    (movies['year'] >= low_year) & 
                    (movies['year'] <= heigth_year)]

    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentil)

    q_movies = movies.copy().loc[movies['vote_count'] >= m]
    q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count'])+C), axis=1)

    q_movies = q_movies.sort_values('score', ascending=False)
    return q_movies

In [12]:
build_chart(gen_df).head()

Entre com gênereo preferido: 
Entre com a menor duração: 
Entre com a maior duração: 
Entre com o menor ano: 
Entre com o maior duração: 


Unnamed: 0,id,title,runtime,vote_average,vote_count,overview,year,genre,score
359,8587,The Lion King,89.0,8.0,5520.0,A young lion cub named Simba can't wait to be ...,1994,animation,13.129869
0,862,Toy Story,81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,animation,12.875947
581,812,Aladdin,90.0,7.4,3495.0,Princess Jasmine grows tired of being forced t...,1992,animation,12.088437
588,10020,Beauty and the Beast,84.0,7.5,3029.0,"Follow the adventures of Belle, a bright young...",1991,animation,11.948857
546,9479,The Nightmare Before Christmas,76.0,7.6,2135.0,Tired of scaring humans every October 31 with ...,1993,animation,11.464019


In [13]:
df.to_csv('data/metadata_clean1.csv', index=False)