In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error

Load the ratings

In [2]:
# Read the raw ratings file: one "UserID::MovieID::Rating::Timestamp" record
# per line. The two-character separator requires the python parsing engine.
# The timestamp is dropped right away because nothing below uses it.
path = os.path.join(os.getcwd(),"..","datasets", "ml1m", "ratings.dat")
ratings = pd.read_table(path, sep="::",  names=["UserID", 'MovieID', 'Rating', 'Timestamp'], header = None, engine='python').drop('Timestamp', axis=1)

# Reshape into a Series indexed by (UserID, MovieID) so that
# `ratings.loc[user_id]` yields that user's ratings keyed by MovieID.
# Zero entries are treated as "not rated" and removed together with the
# gaps the pivot introduces.
# pivot() is called with keyword arguments: positional use was deprecated
# and is rejected by pandas 2.0+.
ratings = (
    ratings
    .pivot(index='UserID', columns='MovieID', values='Rating')
    .replace(0, np.nan)
    .transpose()
    .unstack()
    .dropna()
)

Load the movie data

In [3]:
# Movie catalogue: one "MovieID::Title::Genre" record per line, where Genre
# is a '|'-separated list. The python engine is needed for the "::" separator.
path = os.path.join(os.getcwd(), "..","datasets", "ml1m", "movies.dat")
# The file contains accented titles that are not valid UTF-8 (they show up as
# replacement characters in the printed topics), so decode it as Latin-1.
movies = pd.read_table(path, sep="::",  names=["MovieID", 'Title', 'Genre'], header = None, engine='python', encoding='latin-1')

Load the user data

In [4]:
# Demographic table: one "UserID::Gender::Age::Occupation::Zip-code" record
# per user. Rows keep the default 0-based index assigned by the reader.
path = os.path.join(os.getcwd(), "..", "datasets", "ml1m", "users.dat")
users = pd.read_table(
    path,
    sep="::",
    names=["UserID", "Gender", "Age", "Occupation", 'Zip-code'],
    header=None,
    engine='python',
)

Functions for adding metadata

In [5]:
def add_gender_metadata(movies):
    """Return ``movies`` extended with two pseudo-items, "Male" and "Female".

    The new rows get consecutive MovieIDs starting right after the largest
    existing MovieID and an empty Genre. The input frame is not modified.

    Parameters
    ----------
    movies : pandas.DataFrame
        Frame with the columns 'MovieID', 'Title' and 'Genre'.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; the metadata rows come last.
    """
    last_id = movies['MovieID'].max()
    titles = ['Male', 'Female']
    extra = pd.DataFrame(
        [[last_id + offset, title, ''] for offset, title in enumerate(titles, start=1)],
        columns=['MovieID', "Title", "Genre"],
    )
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    return pd.concat([movies, extra], ignore_index=True)

def add_age_metadata(movies):
    """Return ``movies`` extended with one pseudo-item per age bracket.

    Seven rows ("Under 18" ... "56+") are appended with consecutive MovieIDs
    starting right after the largest existing MovieID and an empty Genre.
    The input frame is not modified.

    Parameters
    ----------
    movies : pandas.DataFrame
        Frame with the columns 'MovieID', 'Title' and 'Genre'.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; the metadata rows come last.
    """
    last_id = movies['MovieID'].max()
    titles = ["Under 18", "18-24", "25-34", "35-44", "45-49", "50-55", "56+"]
    extra = pd.DataFrame(
        [[last_id + offset, title, ""] for offset, title in enumerate(titles, start=1)],
        columns=['MovieID', "Title", "Genre"],
    )
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    return pd.concat([movies, extra], ignore_index=True)

def add_gender_age_metadata(movies):
    """Return ``movies`` extended with combined gender/age pseudo-items.

    For each gender the bare label ("Male" / "Female") is appended, followed
    by one "<Gender> - <age bracket>" row per age bracket: sixteen rows in
    total. They receive consecutive MovieIDs starting right after the largest
    existing MovieID and an empty Genre. The input frame is not modified.

    Parameters
    ----------
    movies : pandas.DataFrame
        Frame with the columns 'MovieID', 'Title' and 'Genre'.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; the metadata rows come last.
    """
    age_brackets = ["Under 18", "18-24", "25-34", "35-44", "45-49", "50-55", "56+"]
    titles = []
    for gender in ("Male", "Female"):
        # The bare gender label serves users without a recognised age code.
        titles.append(gender)
        titles.extend(gender + " - " + bracket for bracket in age_brackets)

    last_id = movies['MovieID'].max()
    extra = pd.DataFrame(
        [[last_id + offset, title, ""] for offset, title in enumerate(titles, start=1)],
        columns=['MovieID', "Title", "Genre"],
    )
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    return pd.concat([movies, extra], ignore_index=True)

def get_gender_item(user_row):
    """Map a user's gender code to the title of the matching metadata item.

    ``user_row["Gender"]`` equal to "M" gives "Male" and "F" gives "Female";
    any other value gives ``None``.
    """
    code = user_row["Gender"]
    if code == "M":
        return "Male"
    if code == "F":
        return "Female"
    return None

def get_age_item(user_row):
    """Map a user's age code to the title of the matching age-bracket item.

    The code is read from ``user_row["Age"]``. Unrecognised codes give
    ``None``.
    """
    bracket_by_code = {
        1: "Under 18",
        18: "18-24",
        25: "25-34",
        35: "35-44",
        45: "45-49",
        50: "50-55",
        56: "56+",
    }
    return bracket_by_code.get(user_row["Age"])

def get_gender_age_item(user_row):
    """Build the title of the combined gender/age metadata item for a user.

    Returns "<gender label> - <age bracket>" when the age code is recognised
    by ``get_age_item``; otherwise returns the gender label alone.
    """
    label = get_gender_item(user_row)
    bracket = get_age_item(user_row)
    if not bracket:
        return label
    return label + ' - ' + bracket

Get a set of all the genres

In [7]:
# Every distinct genre label: each movie's Genre field is a '|'-separated
# list, so split each one and pool the pieces into a single set.
genres = {
    label
    for genre_field in movies['Genre']
    for label in genre_field.split('|')
}

Create a corpus

In [62]:
# --- Configuration ----------------------------------------------------------
# Which synthetic metadata items are injected into every user document:
#   'gender'      one item for the user's gender
#   'age'         one item for the user's age bracket
#   'gender+age'  two separate items (gender and age bracket)
#   'gender-age'  one combined item such as "Female - 25-34"
# Any other value writes the plain ratings corpus without metadata.
include_metadata = 'gender-age' #gender/age/gender+age/gender-age
# Pseudo-count written for each metadata item in a user's document.
metadata_weight = 5.0

# --- Vocabulary -------------------------------------------------------------
# Re-read the catalogue so that re-running this cell never stacks a second
# batch of metadata rows onto an already extended `movies` frame.
# The file holds accented titles that are not valid UTF-8, hence latin-1.
path = os.path.join(os.getcwd(), "..","datasets", "ml1m", "movies.dat")
movies = pd.read_table(path, sep="::",  names=["MovieID", 'Title', 'Genre'], header = None, engine='python', encoding='latin-1')

if include_metadata == 'gender':
    movies = add_gender_metadata(movies)
elif include_metadata == 'age':
    movies = add_age_metadata(movies)
elif include_metadata == 'gender+age':
    movies = add_gender_metadata(movies)
    movies = add_age_metadata(movies)
elif include_metadata == 'gender-age':
    movies = add_gender_age_metadata(movies)


def _metadata_titles(user_row):
    """Titles of the metadata items for `user_row` under `include_metadata`."""
    if include_metadata == 'gender':
        return [get_gender_item(user_row)]
    if include_metadata == 'age':
        return [get_age_item(user_row)]
    if include_metadata == 'gender+age':
        return [get_gender_item(user_row), get_age_item(user_row)]
    if include_metadata == 'gender-age':
        return [get_gender_age_item(user_row)]
    return []


# Word ids are the row labels of `movies`. Build the lookups once instead of
# scanning the whole frame with a boolean mask for every single rating.
# setdefault keeps the first row when a key repeats, matching `.index[0]`.
# A missing MovieID or title now surfaces as KeyError.
movie_index_by_id = {}
for row_label, movie_id in zip(movies.index, movies["MovieID"]):
    movie_index_by_id.setdefault(movie_id, row_label)

movie_index_by_title = {}
for row_label, title in zip(movies.index, movies["Title"]):
    movie_index_by_title.setdefault(title, row_label)

# --- Corpus file ------------------------------------------------------------
# Every line has the form "<number of entries> <word id>:<value> ... ".
path = os.path.join(os.getcwd(),"..","datasets", "ml1m.dat")
# `with` guarantees the file is flushed and closed even if a lookup fails.
with open(path, "w") as fo:
    # First line: every vocabulary entry once, each with value 1.0.
    vocab_tokens = [str(row_label) + ':' + str(1.0) for row_label in movies.index]
    fo.write(' '.join([str(movies.shape[0])] + vocab_tokens) + ' \n')

    # Then one line (document) per user, in the order of the `users` frame.
    for _, user_row in users.iterrows():
        user_id = user_row['UserID']

        # Metadata items come first, each with the configured weight.
        tokens = [
            str(movie_index_by_title[title]) + ":" + str(metadata_weight)
            for title in _metadata_titles(user_row)
        ]

        # Standardise the user's ratings (z-score), shift them so that the
        # lowest one becomes 1, and round to whole numbers.
        user_ratings = ratings.loc[user_id]
        centered = user_ratings.subtract(user_ratings.mean())
        std = user_ratings.std()
        # `std > 0` skips the division both for identical ratings (std == 0)
        # and for a single rating (std is NaN), which would otherwise turn
        # every value into NaN.
        if std > 0:
            centered = centered.divide(std)
        shift = np.abs(centered.min()) + 1.0
        user_ratings = centered.add(shift).round()

        for movie_id, value in user_ratings.items():
            tokens.append(str(movie_index_by_id[movie_id]) + ':' + str(value))

        fo.write(' '.join([str(len(tokens))] + tokens) + ' \n')

Load beta

In [132]:
# Topic/word matrix: each row label is used as a topic id and each column
# label as a word id by the cells below. The stored values are passed through
# exp() row by row - presumably the file holds logarithms; TODO confirm.
path = os.path.join(os.getcwd(), "..", "param", "ml1m", "beta.dat")
beta_stored = pd.read_table(path, sep=" ", header=None)
beta = beta_stored.apply(np.exp, axis=1)

In [133]:
# Number of top-ranked items listed per topic.
num_words_to_print = 20
# Set to True to append each item's (rounded) beta value in parentheses.
weights = False

for topic in range(beta.shape[0]):
    # Word ids of this topic, highest beta first. Slicing (rather than
    # indexing position by position) tolerates a vocabulary smaller than
    # num_words_to_print.
    top_words = beta.loc[topic].sort_values(ascending=False).index[:num_words_to_print]

    entries = []
    for word in top_words:
        movie_name = movies.loc[word]["Title"]
        # Drop the trailing " (YYYY)" from real movie titles; metadata items
        # such as "Male - 35-44" do not end in ')' and are kept as they are.
        if movie_name.endswith(')'):
            movie_name = movie_name[:-7]
        if weights:
            movie_name += '(' + str(beta.loc[topic][word].round(3)) + ')'
        entries.append(movie_name + ' | ')

    print('%2d' % topic + ': ' + ''.join(entries))
    print()

 0: Popeye | Creature Comforts | Holiday Inn | South Pacific | Shadow of a Doubt | Soapdish | My Own Private Idaho | Coming Home | Jules and Jim (Jules et Jim) | Pet Sematary | American Gigolo | Oliver! | Carnal Knowledge | Last Man Standing | House on Haunted Hill, The | Nosferatu (Nosferatu, eine Symphonie des Grauens) | Funny Face | Hamlet | Boys on the Side | Battleship Potemkin, The (Bronenosets Potyomkin) | 

 1: Apocalypse Now | Life Is Beautiful (La Vita è bella) | Killing Fields, The | Crimson Tide | Commitments, The | Firm, The | Boogie Nights | Hamlet | Malcolm X | Cyrano de Bergerac | Eat Drink Man Woman | What's Eating Gilbert Grape | Three Days of the Condor | Rob Roy | Copycat | Eraser | Breakdown | River Wild, The | Misérables, Les | Six Degrees of Separation | 

 2: Godfather, The | Godfather: Part II, The | Male - 35-44 | Exorcist, The | Rosemary's Baby | Invasion of the Body Snatchers | American Werewolf in London, An | Clockwork Orange, A | No Way Out | Thing, The |

In [134]:
# Per-user topic matrix: each row label is used as a user row label by the
# cells below. The stored values are passed through exp() row by row -
# presumably the file holds logarithms; TODO confirm.
path = os.path.join(os.getcwd(), "..", "param", "ml1m", "theta.dat")
theta_stored = pd.read_table(path, sep=" ", header=None)
theta = theta_stored.apply(np.exp, axis=1)

In [147]:
# Row label in `users` (and in `theta`) of the user inspected below.
# This is the 0-based label assigned when the file was read, not the value
# of the UserID column (row 765 holds UserID 766).
chosen_one = 765
# Show that user's demographic record.
users.loc[chosen_one]

UserID          766
Gender            F
Age              25
Occupation        7
Zip-code      95128
Name: 765, dtype: object

In [148]:
# Score every word for the chosen user: scale each topic row of beta by the
# user's value for that topic, then add up over topics.
user_topic_values = theta.loc[chosen_one]
word_scores = beta.multiply(user_topic_values, axis=0).sum()
most_popular = word_scores.sort_values(ascending=False).index

# List the 100 best-scoring items, without the trailing " (YYYY)" of titles.
shown = []
for rank in range(100):
    title = movies.loc[most_popular[rank]]["Title"]
    if title[-1] == ')':
        title = title[:-7]
    shown.append(title + ' | ')
pop_str = ''.join(shown)
print(pop_str)

Princess Bride, The | Monty Python and the Holy Grail | Back to the Future | Fish Called Wanda, A | Raising Arizona | Animal House | Butch Cassidy and the Sundance Kid | Caddyshack | Romancing the Stone | E.T. the Extra-Terrestrial | Big | Breakfast Club, The | Ferris Bueller's Day Off | Stand by Me | Blazing Saddles | Young Frankenstein | Trading Places | Dangerous Liaisons | Wizard of Oz, The | Four Weddings and a Funeral | Bull Durham | Full Monty, The | Christmas Story, A | Groundhog Day | High Fidelity | In the Line of Fire | Splash | Star Wars: Episode IV - A New Hope | Spaceballs | Pretty Woman | Unforgiven | Broadcast News | Crying Game, The | Say Anything... | Shining, The | Heathers | Good Morning, Vietnam | Much Ado About Nothing | Sense and Sensibility | Big Chill, The | When Harry Met Sally... | Amadeus | Clueless | Natural, The | Rain Man | Mary Poppins | Midnight in the Garden of Good and Evil | Taxi Driver | To Catch a Thief | Little Mermaid, The | Dead Poets Society | 

In [137]:
# Rank words by their beta value summed over all topics.
word_totals = beta.sum()
most_popular = word_totals.sort_values(ascending=False).index

# List the 100 highest-ranked items, without the trailing " (YYYY)" of titles.
shown = []
for rank in range(100):
    title = movies.loc[most_popular[rank]]["Title"]
    if title[-1] == ')':
        title = title[:-7]
    shown.append(title + ' | ')
pop_str = ''.join(shown)
print(pop_str)

Male - 25-34 | Casablanca | Godfather, The | This Is Spinal Tap | Star Wars: Episode IV - A New Hope | American Beauty | Godfather: Part II, The | Vertigo | Taxi Driver | Princess Bride, The | Back to the Future | L.A. Confidential | Pulp Fiction | Chinatown | Ghostbusters | Being John Malkovich | Star Wars: Episode V - The Empire Strikes Back | Toy Story | North by Northwest | Full Metal Jacket | Alien | E.T. the Extra-Terrestrial | Who Framed Roger Rabbit? | Terminator, The | Indiana Jones and the Last Crusade | Raiders of the Lost Ark | Austin Powers: The Spy Who Shagged Me | Babe | Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb | Professional, The (a.k.a. Leon: The Professional) | Manchurian Candidate, The | Witness | Sixth Sense, The | Men in Black | Groundhog Day | Bug's Life, A | Dirty Dozen, The | Gladiator | Star Wars: Episode VI - Return of the Jedi | Usual Suspects, The | Apocalypse Now | Godfather: Part III, The | Jaws | Full Monty, The | Wizard of Oz,

In [None]:
# Load the same theta.dat file again, this time without exponentiating it.
path = os.path.join(os.getcwd(), "..", "param", "ml1m", "theta.dat")
gamma = pd.read_table(path, sep=" ", header=None)

# Display which column holds the largest value in row 8.
row_8 = gamma.loc[8]
row_8.argmax()

In [None]:
# Print the 50 top-ranked items of the topic associated with each entry of
# `gender_docs`.
# NOTE(review): `gender_docs` is not defined in any visible cell; it is used
# here as a mapping from a label to a topic id - confirm where it comes from.
# Ported from Python 2 print statements / xrange to Python 3, which the
# executed cells of this notebook already use.
for gen_doc in gender_docs:
    topic = gender_docs[gen_doc]

    # Slicing tolerates a vocabulary with fewer than 50 entries.
    top_words = beta.loc[topic].sort_values(ascending=False).index[:50]

    # Single-digit topic ids get an extra space so the colons line up.
    print(topic, " :" if topic < 10 else ":", gen_doc, " : ")

    shown = []
    for word in top_words:
        movie_name = movies.loc[word]["Title"]
        # Drop the trailing " (YYYY)" from real movie titles only.
        if movie_name.endswith(')'):
            movie_name = movie_name[:-7]
        shown.append(movie_name + '  |')

    print(' '.join(shown))
    print()

In [None]:
# Print the 200 top-ranked items of one topic together with their beta values.
# Ported from Python 2 print statements / xrange to Python 3, which the
# executed cells of this notebook already use.
topic = 6
# Slicing tolerates a vocabulary with fewer than 200 entries.
top_words = beta.loc[topic].sort_values(ascending=False).index[:200]

entries = []
for word in top_words:
    movie_name = movies.loc[word]["Title"]
    if movie_name.endswith(')'):
        # Real movie title: drop the trailing " (YYYY)" and show the value.
        entries.append('%s ( %s ) |' % (movie_name[:-7], beta.loc[topic][word].round(3)))
    else:
        # Metadata item: shown by name only, as in the original cell.
        entries.append(movie_name + '  |')

# Single-digit topic ids get an extra space so the colons line up.
print(topic, " :" if topic < 10 else ":", *entries)
print()

In [None]:
# Bar chart of how often each genre occurs among the top items of one topic.
topic = 29
num_items = 50

# Fix the genre order once so bars and tick labels are guaranteed to agree.
genre_labels = sorted(genres)
genre_count = pd.Series(0, index=genre_labels)

# Rows of `movies` for the topic's highest-ranked word ids. Word ids are row
# labels of `movies`, as in the other cells that look up titles. The previous
# version went through an undefined `movie_names` frame.
top_words = beta.loc[topic].sort_values(ascending=False).index[:num_items]
top_rows = movies.loc[top_words]

for genre_field in top_rows["Genre"]:
    for gn in genre_field.split('|'):
        # Metadata items carry an empty Genre; skip labels that are not a
        # known genre instead of failing on the lookup.
        if gn in genre_count.index:
            genre_count[gn] += 1

fig, ax = plt.subplots(figsize=(20,10))

ind = np.arange(len(genre_labels))
width = 1.0

# Bars are centred on their x position, so the ticks go exactly there.
ax.bar(ind, genre_count.values, width=width, color='r', align='center')

ax.set_ylabel('Count')
ax.set_xlabel('Genre')
ax.set_title('Genre Counts per Topic')
ax.set_xticks(ind)
ax.set_xticklabels(genre_labels)
plt.show()

# List the items that were counted, without the trailing " (YYYY)" of titles.
shown = []
for title in top_rows["Title"]:
    if title.endswith(')'):
        title = title[:-7]
    shown.append(title + '  |')
print(' '.join(shown))