In [39]:
# imports

import os
import requests
import zipfile
import StringIO

In [40]:
# Constants
MOVIE_LENS_URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
MOVIE_LENS_DIR = 'movielens'

In [None]:
# Functions
def download_and_extract_zipfile(url, destdir):
    ''' Downloads zip file from url and extracts the contents to destdir '''
    r = requests.get(url)
    filehandle =  StringIO.StringIO(r.content)     # Convert string to file object
    z = zipfile.ZipFile(filehandle)                # Open file object as zip file   
    z.extractall(destdir)                          # Extract all zip data to destdir
    
# Download Movie Lens 1-M dataset
download_and_extract_zipfile(MOVIE_LENS_URL, MOVIE_LENS_DIR)    

"How many movie genres are in the Movie Lens dataset?"
--------------------------------------------------------------

In [41]:
# Movie record constants
MOVIE_ID     = 0
MOVIE_TITLE  = 1
MOVIE_GENRES = 2

In [4]:
# Count Genres Functions
def count_genres(path):
    ''' Returns a dictionary containing the number of movies per genre in file specified by path '''
    genres = {}

    for line in open(path):
        movie = line.strip().split('::')
        for genre in movie[MOVIE_GENRES].split('|'):
            genres[genre] = genres.get(genre, 0) + 1

    return genres

# Perform count on MovieLens 1M dataset
count_genres(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'movies.dat'))

{'Action': 503,
 'Adventure': 283,
 'Animation': 105,
 "Children's": 251,
 'Comedy': 1200,
 'Crime': 211,
 'Documentary': 127,
 'Drama': 1603,
 'Fantasy': 68,
 'Film-Noir': 44,
 'Horror': 343,
 'Musical': 114,
 'Mystery': 106,
 'Romance': 471,
 'Sci-Fi': 276,
 'Thriller': 492,
 'War': 143,
 'Western': 68}

Which movie has the least/most genres?
---------------------------------------------

In [5]:
def movie_genre(path):
    '''Returns a dictionary containing number of genres per movie in file specified by path'''
    movies = {}
    
    for line in open(path):
        movie = line.strip().split('::')
        movies[movie[MOVIE_TITLE]] = len(movie[MOVIE_GENRES].split('|'))
    
    return movies
    
# Get movies title with no. of genres on MovieLens 1M dataset
genres_per_movie = movie_genre(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'movies.dat'))   
# Movie with most genres
print sorted(genres_per_movie.items(), key=lambda x:x[1],reverse = True)[0]
# Movies with least genres
print sorted(genres_per_movie.items(), key=lambda x:x[1])[0]


('Transformers: The Movie, The (1986)', 6)
('To Cross the Rubicon (1991)', 1)


### Actually there are many movies with least/most genres.

In [6]:
# print movies with most genres
for title, genre in genres_per_movie.iteritems():
    if genre == max(genres_per_movie.items(), key=lambda x: x[1])[1]:
        print title

Transformers: The Movie, The (1986)


In [7]:
# print movies with least genres
for title, genre in genres_per_movie.iteritems():
    if genre == min(genres_per_movie.items(), key=lambda x: x[1])[1]:
        print title

To Cross the Rubicon (1991)
Birdcage, The (1996)
Crew, The (2000)
Black and White (1999)
Mummy's Ghost, The (1944)
Exorcist, The (1973)
Sabotage (1936)
Endless Summer 2, The (1994)
Careful (1992)
SubUrbia (1997)
Monument Ave. (1998)
Lodger, The (1926)
You Can't Take It With You (1938)
Mole People, The (1956)
Autumn Tale, An (Conte d'automne) (1998)
Jar, The (Khomreh) (1992)
Bye Bye, Love (1995)
Long Goodbye, The (1973)
2 Days in the Valley (1996)
Jerky Boys, The (1994)
Talk of Angels (1998)
Relax... It's Just Sex (1998)
Frances (1982)
Belizaire the Cajun (1986)
Mummy, The (1959)
Amadeus (1984)
Critical Care (1997)
Clockwork Orange, A (1971)
Mis�rables, Les (1998)
Snows of Kilimanjaro, The (1952)
Wonderland (1999)
Century of Cinema, A (1994)
Chairman of the Board (1998)
Taking of Pelham One Two Three, The (1974)
Pleasantville (1998)
From Russia with Love (1963)
Scent of a Woman (1992)
Harvey (1950)
Masque of the Red Death, The (1964)
Ogre, The (Der Unhold) (1996)
Risky Business (1983)
P

Which movie has the lowest/highest rating?
-------------------------------------------------

In [8]:
MOVIE_ID = 1
RATING = 2

def movie_rating(path):
    ratings = {}
    
    for line in open(path):
        rating = line.strip().split('::')        
        if rating[MOVIE_ID] not in ratings:
            ratings[rating[MOVIE_ID]] = int(rating[RATING])
        else:
            ratings[rating[MOVIE_ID]] += int(rating[RATING])
        
    return ratings

ratings = movie_rating(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'ratings.dat')) 
print ratings


{'1718': 18, '2031': 112, '1869': 118, '1868': 4, '854': 29, '344': 2412, '345': 1785, '346': 423, '347': 146, '340': 275, '341': 119, '342': 1905, '343': 102, '348': 1683, '349': 3946, '2318': 1249, '2316': 704, '2317': 21, '2314': 326, '2315': 348, '2312': 1267, '2313': 1834, '2310': 509, '2311': 1606, '298': 90, '299': 524, '296': 9288, '297': 87, '294': 130, '295': 80, '292': 2396, '293': 3790, '290': 658, '291': 106, '3773': 482, '3772': 5, '3771': 513, '3770': 1173, '3777': 24, '3776': 80, '3775': 65, '3774': 126, '2147': 589, '3779': 3, '3778': 17, '270': 240, '271': 198, '272': 1427, '273': 707, '274': 57, '275': 203, '276': 614, '277': 541, '278': 110, '279': 171, '2814': 21, '3211': 232, '1780': 33, '2268': 4171, '2269': 594, '1132': 937, '2262': 1029, '2263': 543, '2260': 137, '2261': 536, '2266': 560, '2267': 218, '2264': 215, '2265': 166, '2442': 612, '2443': 453, '2440': 158, '1130': 753, '2446': 160, '2447': 814, '2444': 20, '1004': 269, '2448': 444, '2449': 49, '108': 2

In [42]:
# Get movie name by ID
def movie_name(path,movie_ID):
    for line in open(path):
        movie = line.strip().split('::')
        if movie[0]==movie_ID:
            return movie[1]

In [10]:
# Get movie ID with maximum ratings
for movie_ID, rating in ratings.iteritems():
    if rating == max(ratings.items(), key=lambda x: x[1])[1]:
        name = movie_name(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'movies.dat'),movie_ID)
        print name

American Beauty (1999)


In [11]:
# Get movie ID with minimum ratings
for movie_ID, rating in ratings.iteritems():
    if rating == min(ratings.items(), key=lambda x: x[1])[1]:
        name = movie_name(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'movies.dat'),movie_ID)
        print name

Loves of Carmen, The (1948)
Terror in a Texas Town (1958)
Even Dwarfs Started Small (Auch Zwerge haben klein angefangen) (1971)
White Boys (1999)
Waltzes from Vienna (1933)
Elstree Calling (1930)
Silence of the Palace, The (Saimt el Qusur) (1994)
McCullochs, The (1975)
Shadows (Cienie) (1988)
Underworld (1997)
Diebinnen (1995)
Uninvited Guest, An (2000)
Windows (1980)
Sleepover (1995)
Little Indian, Big City (Un indien dans la ville) (1994)
Cheetah (1989)
Kestrel's Eye (Falkens �ga) (1998)
Mutters Courage (1995)
Lotto Land (1995)
Hillbillys in a Haunted House (1967)
Nueba Yol (1995)
Fantastic Night, The (La Nuit Fantastique) (1949)
Low Life, The (1994)
Bloody Child, The (1996)


Which animated movie has the lowest/highest rating?
------------------------------------------------------------
First need to get animated movies using movies.dat file. Then get it's rating from ratings dictionary & find the movie with lowest/highest rating.

In [12]:
# Movie record constants
MOVIE_ID     = 0
MOVIE_TITLE  = 1
MOVIE_GENRES = 2

def movies_of_genre(path,movie_genre):
    ''' Returns a dictionary containing the movies with specified genre in file specified by path '''
    genre_movie = {}

    for line in open(path):
        movie = line.strip().split('::')
        for genre in movie[MOVIE_GENRES].split('|'):
            if genre == movie_genre:
                genre_movie[movie[MOVIE_ID]] = movie[MOVIE_TITLE]

    return genre_movie

Animation_movies = movies_of_genre(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'movies.dat'),'Animation')
print Animation_movies

{'3611': 'Saludos Amigos (1943)', '1148': 'Wrong Trousers, The (1993)', '661': 'James and the Giant Peach (1996)', '2096': 'Sleeping Beauty (1959)', '2090': 'Rescuers, The (1977)', '2092': 'Return of Jafar, The (1993)', '2099': 'Song of the South (1946)', '3799': 'Pok\xe9mon the Movie 2000 (2000)', '2018': 'Bambi (1942)', '3483': 'Road to El Dorado, The (2000)', '1151': 'Faust (1994)', '2700': 'South Park: Bigger, Longer and Uncut (1999)', '2857': 'Yellow Submarine (1968)', '673': 'Space Jam (1996)', '3400': "We're Back! A Dinosaur's Story (1993)", '594': 'Snow White and the Seven Dwarfs (1937)', '3034': 'Robin Hood (1973)', '2559': 'King and I, The (1999)', '2495': 'Fantastic Planet, The (La Plan\xe8te sauvage) (1973)', '1064': 'Aladdin and the King of Thieves (1996)', '2394': 'Prince of Egypt, The (1998)', '2142': 'American Tail: Fievel Goes West, An (1991)', '2141': 'American Tail, An (1986)', '595': 'Beauty and the Beast (1991)', '709': 'Oliver & Company (1988)', '3775': 'Make Mine

In [13]:
def ratings_of_movies_of_genre(path,genre,genre_movies):    
    keys_ratings = set(ratings.keys())
    keys_genre = set(genre_movies.keys())
    intersection = keys_ratings & keys_genre

    genre_ratings = {}
    for movie_ID in intersection:
        genre_ratings[movie_ID] = ratings[movie_ID] 
    
    return genre_ratings

In [14]:
def hightest_rated_movie_of_genre(path,genre):
    genre_movies = movies_of_genre(path,genre)
    genre_ratings = ratings_of_movies_of_genre(path,genre,genre_movies)
    for movie_ID, rating in genre_ratings.iteritems():
        if rating == max(genre_ratings.items(), key=lambda x: x[1])[1]:
            name = genre_movies[movie_ID]             
    return name

In [15]:
def lowest_rated_movie_of_genre(path,genre):
    genre_movies = movies_of_genre(path,genre)
    genre_ratings = ratings_of_movies_of_genre(path,genre,genre_movies)
    for movie_ID, rating in genre_ratings.iteritems():
        if rating == min(genre_ratings.items(), key=lambda x: x[1])[1]:
            name = genre_movies[movie_ID]  
    return name

In [16]:
name = hightest_rated_movie_of_genre(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'movies.dat'),'Animation')
print "Highest rated movie in Animation genre is " + name
name = lowest_rated_movie_of_genre(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'movies.dat'),'Animation')        
print "Lowest rated movie in Animation genre is " + name

Highest rated movie in Animation genre is Toy Story (1995)
Lowest rated movie in Animation genre is Gumby: The Movie (1995)


Which action movie has the lowest/highest rating?
------------------------------------------------------------

In [17]:
name = hightest_rated_movie_of_genre(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'movies.dat'),'Action')
print "Highest rated movie in Action genre is " + name
name = lowest_rated_movie_of_genre(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'movies.dat'),'Action')        
print "Lowest rated movie in Action genre is " + name

Highest rated movie in Action genre is Star Wars: Episode IV - A New Hope (1977)
Lowest rated movie in Action genre is Detroit 9000 (1973)


Which sci-fi movie from the 1990's is rated the highest?
---------------------------------------------------------

In [65]:
def movies_of_decade(movies,decade):
    decade_start = int(decade)
    decade_end = decade_start + 9

    movies_of_decade = {}
    for movie_ID,title in movies.items():
        movie_year = int(title.split(' ')[-1].strip('(, )'))        
        if (movie_year >= decade_start) & (movie_year <= decade_end):
            movies_of_decade[movie_ID]=title

    return movies_of_decade
    

In [66]:
def ratings_of_movies_of_decade(decade_movies):    
    keys_ratings = set(ratings.keys())
    keys_decade = set(decade_movies.keys())
    intersection = keys_ratings & keys_decade

    decade_ratings = {}
    for movie_ID in intersection:
        decade_ratings[movie_ID] = ratings[movie_ID] 
    
    return decade_ratings

In [67]:
def hightest_rated_movie_of_decade(decade_movies):
    decade_ratings = ratings_of_movies_of_decade(decade_movies)
    for movie_ID, rating in decade_ratings.iteritems():
        if rating == max(decade_ratings.items(), key=lambda x: x[1])[1]:
            name = decade_movies[movie_ID]             
    return name

In [70]:
def lowest_rated_movie_of_decade(decade_movies):
    decade_ratings = ratings_of_movies_of_decade(decade_movies)
    for movie_ID, rating in decade_ratings.iteritems():
        if rating == min(decade_ratings.items(), key=lambda x: x[1])[1]:
            name = decade_movies[movie_ID]             
    return name

In [68]:
# Sci-Fi movies 
Scifi_movies = movies_of_genre(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'movies.dat'),'Sci-Fi')
print Scifi_movies

{'3593': 'Battlefield Earth (2000)', '1306': 'Until the End of the World (Bis ans Ende der Welt) (1991)', '2658': 'Flying Saucer, The (1950)', '3926': 'Voyage to the Bottom of the Sea (1961)', '1301': 'Forbidden Planet (1956)', '3758': 'Communion (1989)', '3699': 'Starman (1984)', '3648': 'Abominable Snowman, The (1957)', '2657': 'Rocky Horror Picture Show, The (1975)', '2656': 'Tarantula (1955)', '3878': 'X: The Unknown (1956)', '2094': 'Rocketeer, The (1991)', '2091': 'Return from Witch Mountain (1978)', '3573': 'Carnosaur 2 (1995)', '2093': 'Return to Oz (1985)', '3572': 'Carnosaur (1993)', '3927': 'Fantastic Voyage (1966)', '24': 'Powder (1995)', '2322': 'Soldier (1998)', '1544': 'Lost World: Jurassic Park, The (1997)', '1831': 'Lost in Space (1998)', '2525': 'Alligator (1980)', '1077': 'Sleeper (1973)', '1779': 'Sphere (1998)', '29': 'City of Lost Children, The (1995)', '3793': 'X-Men (2000)', '2009': 'Soylent Green (1973)', '1371': 'Star Trek: The Motion Picture (1979)', '2407': 

In [71]:
#Sci-fi movies of decade 1990
Sci_fi_1990 = movies_of_decade(Scifi_movies,'1990')
# Highest rated Sci-fi movie of decade 1990
name = hightest_rated_movie_of_decade(Sci_fi_1990)
print "Highest rated movie in of 90's is " + name
# Lowest rated Sci-fi movie of decade 1990
name = lowest_rated_movie_of_decade(Sci_fi_1990)
print "Lowest rated movie in of 90's is " + name

Highest rated movie in of 90's is Matrix, The (1999)
Lowest rated movie in of 90's is Nemesis 2: Nebula (1995)


In [72]:

def movies_of_year(movies,year):
    movies_of_year = {}
    for movie_ID,title in movies.items():
        movie_year = int(title.split(' ')[-1].strip('(, )'))        
        if (movie_year == int(year)):
            movies_of_year[movie_ID]=title

    return movies_of_year

How many fantasy movies from the year 1998 were rated?
-------------------------------------------------------

In [74]:
# Sci-Fi movies 
Fantasy_movies = movies_of_genre(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'movies.dat'),'Fantasy')
print Fantasy_movies

{'3489': 'Hook (1991)', '3920': 'Faraway, So Close (In Weiter Ferne, So Nah!) (1993)', '558': 'Pagemaster, The (1994)', '885': 'Bogus (1996)', '2086': 'One Magic Christmas (1985)', '317': 'Santa Clause, The (1994)', '2968': 'Time Bandits (1981)', '1126': 'Drop Dead Fred (1991)', '3440': 'Teenage Mutant Ninja Turtles III (1993)', '60': 'Indian in the Cupboard, The (1995)', '258': "Kid in King Arthur's Court, A (1995)", '2193': 'Willow (1988)', '2093': 'Return to Oz (1985)', '2100': 'Splash (1984)', '3438': 'Teenage Mutant Ninja Turtles (1990)', '3439': 'Teenage Mutant Ninja Turtles II: The Secret of the Ooze (1991)', '3877': 'Supergirl (1984)', '792': 'Hungarian Fairy Tale, A (1987)', '2173': 'Navigator: A Mediaeval Odyssey, The (1988)', '1583': 'Simple Wish, A (1997)', '1967': 'Labyrinth (1986)', '1073': 'Willy Wonka and the Chocolate Factory (1971)', '2138': 'Watership Down (1978)', '1525': 'Warriors of Virtue (1997)', '2': 'Jumanji (1995)', '1019': '20,000 Leagues Under the Sea (1954

In [75]:
Fantasy_movie_1998 = movies_of_year(Fantasy_movies,'1998')

In [76]:
# No. of rated movies of 1998
Fantasy_movies_rated_1998 = len(Fantasy_movie_1998)
print Fantasy_movies_rated_1998

2


What movies do programmers enjoy the most?
-------------------------------------------

In [25]:
def users_by_occupation(path,occupation):
    '''Returns a dictionary containing number of genres per movie in file specified by path'''
    user_ID = []
    
    for line in open(path):
        users = line.strip().split('::')
        if users[3] == occupation:
            user_ID.append(users[0])
    
    return user_ID

programmers = users_by_occupation(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'users.dat'),'12')
print len(programmers)

388


In [55]:
# Get movie_IDs for programmer
def movie_ID_per_occupation(path,users_by_occupation):
    user_movie_ID = []
    for line in open(path):
        ratings = line.strip().split('::')
        if ratings[0] in users_by_occupation:
            user_movie_ID.append(ratings[1])
    return user_movie_ID
                  
programmer_movie_ID = movie_ID_per_occupation(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'ratings.dat'),programmers)

57214


In [57]:
def movie_ID_count_per_occupation(user_movie_ID):
    movie_ID_count = dict()
    for i in user_movie_ID:
        if i not in movie_ID_count:
            movie_ID_count[i] = 1
        else:
            movie_ID_count[i] = movie_ID_count[i] + 1
    return movie_ID_count

movie_ID_count = movie_ID_count_per_occupation(programmer_movie_ID)

In [59]:
def top_movie_per_occupation(movie_ID_count):
    top_movies = sorted(movie_ID_count.items(), key=lambda x:x[1],reverse = True)[:5]
    for x,y in top_movies:  
        name = movie_name(os.path.join(MOVIE_LENS_DIR, 'ml-1m', 'movies.dat'),x)
        print name
        
top_movie_per_occupation(movie_ID_count)

Star Wars: Episode V - The Empire Strikes Back (1980)
Star Wars: Episode VI - Return of the Jedi (1983)
Terminator 2: Judgment Day (1991)
Star Wars: Episode IV - A New Hope (1977)
American Beauty (1999)
