In [1]:
import numpy as np
import pandas as pd

In [2]:
# Keep displayed frames short: at most 10 rows per repr.
pd.options.display.max_rows = 10

# The .dat files are read without a header row and use '::' as the field
# delimiter. A multi-character separator is only supported by pandas' Python
# parsing engine, so it is requested explicitly; otherwise pandas falls back
# to it on its own and emits a ParserWarning on every read.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('datasets/movielens/users.dat', sep='::',
                      header=None, names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::',
                        header=None, names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')

  """
  if __name__ == '__main__':
  del sys.path[0]


In [3]:
# Flatten the three tables into one frame: each rating row gains the rater's
# attributes and then the movie's attributes. No join keys are given, so
# pandas joins on the column names the two frames have in common.
data = ratings.merge(users).merge(movies)
data

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western


In [4]:
# Average rating per movie title.
mean_ratings = data.pivot_table(values='rating', index='title',
                                aggfunc='mean')
mean_ratings

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
"$1,000,000 Duck (1971)",3.027027
'Night Mother (1986),3.371429
'Til There Was You (1997),2.692308
"'burbs, The (1989)",2.910891
...And Justice for All (1979),3.713568
...,...
"Zed & Two Noughts, A (1985)",3.413793
Zero Effect (1998),3.750831
Zero Kelvin (Kj�rlighetens kj�tere) (1995),3.500000
Zeus and Roxanne (1997),2.521739


In [5]:
# Number of rating rows recorded for each title.
ratings_by_title = data.groupby(by='title').size()
ratings_by_title

title
$1,000,000 Duck (1971)                         37
'Night Mother (1986)                           70
'Til There Was You (1997)                      52
'burbs, The (1989)                            303
...And Justice for All (1979)                 199
                                             ... 
Zed & Two Noughts, A (1985)                    29
Zero Effect (1998)                            301
Zero Kelvin (Kj�rlighetens kj�tere) (1995)      2
Zeus and Roxanne (1997)                        23
eXistenZ (1999)                               410
Length: 3706, dtype: int64

In [6]:
# Minimum number of ratings a title needs to take part in the rankings below.
MIN_RATINGS = 300

# Titles that received at least MIN_RATINGS ratings.
active_titles = ratings_by_title.index[ratings_by_title >= MIN_RATINGS]
active_titles

Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '20,000 Leagues Under the Sea (1954)',
       '2001: A Space Odyssey (1968)', '2010 (1984)', '28 Days (2000)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1058)

In [7]:
# Restrict the per-title averages to the titles selected in active_titles.
mean_ratings = mean_ratings.loc[active_titles, :]
mean_ratings

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
"'burbs, The (1989)",2.910891
10 Things I Hate About You (1999),3.422857
101 Dalmatians (1961),3.596460
101 Dalmatians (1996),3.046703
12 Angry Men (1957),4.295455
...,...
Young Guns (1988),3.418149
Young Guns II (1990),2.907859
Young Sherlock Holmes (1985),3.390501
Zero Effect (1998),3.750831


### 1. Os 10 filmes melhor classificados (pelo menos 300 avaliações)

In [8]:
# Highest average rating first; show the ten best.
top_ratings = mean_ratings.sort_values('rating', ascending=False)
top_ratings.head(10)

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),4.56051
"Shawshank Redemption, The (1994)",4.554558
"Godfather, The (1972)",4.524966
"Close Shave, A (1995)",4.520548
"Usual Suspects, The (1995)",4.517106
Schindler's List (1993),4.510417
"Wrong Trousers, The (1993)",4.507937
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.491489
Raiders of the Lost Ark (1981),4.477725
Rear Window (1954),4.47619


### 2. Os 10 filmes pior classificados (pelo menos 300 avaliações)

In [9]:
# Lowest average rating first; show the ten worst.
last_ratings = mean_ratings.sort_values('rating', ascending=True)
last_ratings.head(10)

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
Battlefield Earth (2000),1.611111
Speed 2: Cruise Control (1997),1.871935
Super Mario Bros. (1993),1.874286
Superman IV: The Quest for Peace (1987),1.888554
Grease 2 (1982),1.963696
Barb Wire (1996),2.03
Howard the Duck (1986),2.099078
Wild Wild West (1999),2.158537
Inspector Gadget (1999),2.16622
Anaconda (1997),2.200501


In [10]:
# Count of non-null movie_id values for each raw (possibly multi-genre,
# '|'-joined) genres string.
counter = data.pivot_table(values='movie_id', index='genres',
                           aggfunc='count')
counter

Unnamed: 0_level_0,movie_id
genres,Unnamed: 1_level_1
Action,12311
Action|Adventure,10446
Action|Adventure|Animation,345
Action|Adventure|Animation|Children's|Fantasy,135
Action|Adventure|Animation|Horror|Sci-Fi,618
...,...
Sci-Fi|Thriller|War,280
Sci-Fi|War,1367
Thriller,17851
War,991


In [11]:
def get_genres(x):
    """Return the first genre from a '|'-separated genre string.

    Parameters
    ----------
    x : str
        Genre string such as 'Comedy|Drama|Western'.

    Returns
    -------
    str
        The text before the first '|', or the whole string when it
        contains no '|'.
    """
    # str.split gives the same result as the previous np.char.split round
    # trip without building a NumPy array on every call; maxsplit=1 stops at
    # the first separator because only the leading genre is needed.
    return x.split('|', 1)[0]
# Work on an independent copy of the genres column.
data_genres = data['genres'].copy()
data_genres

0                         Drama
1                         Drama
2                         Drama
3                         Drama
4                         Drama
                   ...         
1000204             Documentary
1000205                   Drama
1000206                   Drama
1000207    Comedy|Drama|Western
1000208             Documentary
Name: genres, Length: 1000209, dtype: object

In [12]:
# Reduce every genres string to the value get_genres returns for it.
splited_genres = data_genres.map(get_genres)
splited_genres

0                Drama
1                Drama
2                Drama
3                Drama
4                Drama
              ...     
1000204    Documentary
1000205          Drama
1000206          Drama
1000207         Comedy
1000208    Documentary
Name: genres, Length: 1000209, dtype: object

In [13]:
# Attach the reduced genre as an extra column on the merged frame (in place).
data.loc[:, 'new_genres'] = splited_genres
data

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,new_genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama,Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama,Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama,Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama,Drama
...,...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary,Documentary
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama,Drama
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama,Drama
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western,Comedy


### 3. Quantidade de filmes por gênero

In [14]:
# Lift the row limit so the whole table below is displayed. The option is
# addressed by its full name, matching how it was set at the top of the
# notebook: the abbreviated "max_rows" pattern is ambiguous on pandas versions
# that also define styler.render.max_rows and raises OptionError there.
# Note that the setting is global and stays in effect for later cells.
pd.options.display.max_rows = None

# NOTE(review): `data` has one row per rating, so this counts rating rows per
# genre rather than distinct movies - confirm that is the intended figure.
counter_by_new_genres = data.pivot_table('genres', index='new_genres', aggfunc='count')
counter_by_new_genres

Unnamed: 0_level_0,genres
new_genres,Unnamed: 1_level_1
Action,257457
Adventure,43630
Animation,36936
Children's,21491
Comedy,276923
Crime,37849
Documentary,6817
Drama,208627
Fantasy,790
Film-Noir,9343


In [None]:
# Keep only the rows whose reduced genre is exactly 'Comedy'.
all_comedies = data.loc[data['new_genres'].eq('Comedy')]
all_comedies

In [None]:
# Average rating per title within the comedy subset.
mean_comedies_ratings = all_comedies.pivot_table(values='rating',
                                                 index='title',
                                                 aggfunc='mean')
mean_comedies_ratings

### 4. Os 10 filmes melhor classificados no gênero comédia

In [None]:
# Highest average rating first; show the ten best comedies.
top_comedy_ratings = mean_comedies_ratings.sort_values('rating',
                                                       ascending=False)
top_comedy_ratings.head(10)