In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
path = os.path.join(os.getcwd(),"..","datasets", "ml1m", "ratings.dat")
ratings = pd.read_table(path, sep="::",  names=["UserID", 'MovieID', 'Rating', 'Timestamp'], header = None, engine='python').drop('Timestamp', axis=1)
ratings = ratings.pivot('UserID', 'MovieID', 'Rating').replace(0, np.nan).transpose().unstack().dropna()

path = os.path.join(os.getcwd(), "..","datasets", "ml1m", "movies.dat")
movies = pd.read_table(path, sep="::",  names=["MovieID", 'Title', 'Genre'], header = None, engine='python')

path = os.path.join(os.getcwd(), "..","datasets", "ml1m", "users.dat")
users = pd.read_table(path, sep="::", names=["UserID", "Gender", "Age", "Occupation", 'Zip-code'], header = None, engine='python')

In [3]:
path = os.path.join(os.getcwd(),"..","datasets", "ml1m", "ratings.dat")
ratings = pd.read_table(path, sep="::", index_col=[0,1], names=["UserID","MovieID", "Rating","Timestamp" ], header = None, engine='python').drop('Timestamp', axis=1)

In [8]:
male_ages = [[0,"Male",""], [0,"Male - Under 18", ""], [0,"Male - 18-24", ""], [0,"Male - 25-34", ""], 
             [0,"Male - 35-44", ""], [0,"Male - 45-49", ""], [0,"Male - 50-55", ""], [0,"Male - 56+", ""]]

female_ages = [[0,"Female",""], [0,"Female - Under 18", ""], [0,"Female - 18-24", ""], [0,"Female - 25-34", ""], 
               [0,"Female - 35-44", ""], [0,"Female - 45-49", ""], [0,"Female - 50-55", ""], [0,"Female - 56+", ""]]

mov_id = movies['MovieID'].max()

for ma in male_ages:
    mov_id = mov_id + 1
    ma[0] = mov_id
    
for fa in female_ages:
    mov_id = mov_id + 1
    fa[0] = mov_id

movies = movies.append(pd.DataFrame(male_ages, columns = ['MovieID', "Title","Genre"]), ignore_index=True)
movies = movies.append(pd.DataFrame(female_ages, columns = [ 'MovieID', "Title","Genre"]), ignore_index=True)

In [9]:
def get_gender_age_item(user_row):
    item = ""
    if(user_row["Gender"] == "M"):
        item = item + "Male"
    elif(user_row["Gender"] == "F"):
        item = item + "Female"
                
    if(user_row["Age"] == 1):
        item = item + " - Under 18"
    elif(user_row["Age"] == 18):
        item = item + " - 18-24"
    elif(user_row["Age"] == 25):
        item = item + " - 25-34"
    elif(user_row["Age"] == 35):
        item = item + " - 35-44"
    elif(user_row["Age"] == 45):
        item = item + " - 45-49"
    elif(user_row["Age"] == 50):
        item = item + " - 50-55"
    elif(user_row["Age"] == 56):
        item = item + " - 56+"
        
    return item
get_gender_age_item(users.loc[1])

'Male - 56+'

In [10]:
genres = set()
for row in movies.iterrows():
    for genre in row[1]['Genre'].split('|'):
        genres.add(genre)

In [75]:
path = os.path.join(os.getcwd(),"..","datasets", "ml1m.dat")
fo = open(path, "w")

line = str(movies.shape[0]) + " "
for movie in movies.iterrows():
    line += str(movie[0]) + ":" + str(1.0) + ' '
line += "\n"
fo.write(line)

gender_docs = dict()
doc_num = 0
for ma in male_ages:
    line = str(1) + " " + str(movies[(movies["Title"] == ma[1])].index[0]) + ":" + str(5.0) + "\n"
    gender_docs[ma[1]] = doc_num
    doc_num = doc_num + 1
    fo.write(line)
    
for fa in female_ages:
    line = str(1) + " " + str(movies[(movies["Title"] == fa[1])].index[0]) + ":" + str(5.0) + "\n"
    gender_docs[fa[1]] = doc_num
    doc_num = doc_num + 1
    fo.write(line)

for user in users.iterrows():
    user_index = user[0]
    user_id = user[1]['UserID']
    
    count = 1
    gender_age = movies[(movies["Title"] == get_gender_age_item(users.loc[user_index]))].index[0] 
    doc = str(gender_age) + ":" + str(5.0) + ' '
    
    user_ratings = ratings.loc[user_id]
    for movie_id in user_ratings.index:
        movie_index = movies[movies["MovieID"] == movie_id].index[0]
        doc = doc + str(movie_index) + ':' + str(user_ratings.loc[movie_id]["Rating"]) + " "
        count = count + 1
    doc = str(count) + " " + doc + "\n"

    fo.write(doc)
fo.close()

In [78]:
path = os.path.join(os.getcwd(), "..", "param", "ml1m", "beta.dat")
beta = pd.read_table(path, sep=" ", header = None)

for topic in xrange(50):
    top_words = beta.loc[topic].sort_values(ascending=False).index
    genre_count = pd.DataFrame(np.zeros(len(genres)), index = genres)
    
    if topic < 10:
        print topic, " :" ,
    else:
        print topic , ":" , 
        
    for i in xrange(50):
        movie_name =  movies.loc[top_words[i]]["Title"]
        if movie_name[-1] == ')':
            print movie_name[:-7], '(', beta.loc[topic][top_words[i]].round(3),  ') |', 
        else:
            print movie_name, ' |', '(', beta.loc[topic][top_words[i]].round(3),  ') |',
    
    print
    print

0  : One Flew Over the Cuckoo's Nest ( 0.011 ) | Graduate, The ( 0.01 ) | Godfather, The ( 0.01 ) | Taxi Driver ( 0.01 ) | Midnight Cowboy ( 0.01 ) | Godfather: Part II, The ( 0.009 ) | To Kill a Mockingbird ( 0.009 ) | Citizen Kane ( 0.009 ) | Casablanca ( 0.008 ) | Apocalypse Now ( 0.008 ) | Psycho ( 0.008 ) | Amadeus ( 0.008 ) | Bonnie and Clyde ( 0.008 ) | Chinatown ( 0.008 ) | Fargo ( 0.007 ) | GoodFellas ( 0.007 ) | Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb ( 0.007 ) | Network ( 0.007 ) | Cool Hand Luke ( 0.007 ) | North by Northwest ( 0.007 ) | Wizard of Oz, The ( 0.007 ) | Bridge on the River Kwai, The ( 0.007 ) | 2001: A Space Odyssey ( 0.007 ) | Schindler's List ( 0.007 ) | Raging Bull ( 0.007 ) | French Connection, The ( 0.006 ) | Deer Hunter, The ( 0.006 ) | Hustler, The ( 0.006 ) | Shawshank Redemption, The ( 0.006 ) | Silence of the Lambs, The ( 0.006 ) | Rear Window ( 0.006 ) | Dog Day Afternoon ( 0.006 ) | Serpico ( 0.006 ) | Annie Hall ( 0.00

In [69]:
gender_docs

{'Female': 8,
 'Female - 18-24': 10,
 'Female - 25-34': 11,
 'Female - 35-44': 12,
 'Female - 45-49': 13,
 'Female - 50-55': 14,
 'Female - 56+': 15,
 'Female - Under 18': 9,
 'Male': 0,
 'Male - 18-24': 2,
 'Male - 25-34': 3,
 'Male - 35-44': 4,
 'Male - 45-49': 5,
 'Male - 50-55': 6,
 'Male - 56+': 7,
 'Male - Under 18': 1}

In [81]:
path = os.path.join(os.getcwd(), "..", "param", "ml1m", "theta.dat")
gamma = pd.read_table(path, sep=" ", header = None)
gamma.loc[8].argmax()

8

In [80]:
for gen_doc in gender_docs:
    topic = gender_docs[gen_doc]
    
    top_words = beta.loc[topic].sort_values(ascending=False).index
    genre_count = pd.DataFrame(np.zeros(len(genres)), index = genres)

    if topic < 10:
        print topic, " :" ,
    else:
        print topic , ":" , 
        
    print gen_doc, " : "

    for i in xrange(50):
        movie_name =  movies.loc[top_words[i]]["Title"]
        if movie_name[-1] == ')':
            print movie_name[:-7], ' |', 
        else:
            print movie_name, ' |', 

    print
    print

7  : Male - 56+  : 
Almost Famous  | Requiem for a Dream  | Wonder Boys  | Dancer in the Dark  | All About My Mother (Todo Sobre Mi Madre)  | Boys Don't Cry  | Virgin Suicides, The  | High Fidelity  | Best in Show  | Nurse Betty  | Magnolia  | Straight Story, The  | Sweet Hereafter, The  | Breaking the Waves  | Erin Brockovich  | Central Station (Central do Brasil)  | Hilary and Jackie  | Contender, The  | Secrets & Lies  | Tao of Steve, The  | Talented Mr. Ripley, The  | Gods and Monsters  | Run Lola Run (Lola rennt)  | Red Violin, The (Le Violon rouge)  | Girlfight  | Jesus' Son  | Ice Storm, The  | Topsy-Turvy  | Meet the Parents  | Croupier  | Chicken Run  | Gladiator  | Sweet and Lowdown  | Dreamlife of Angels, The (La Vie r�v�e des anges)  | Opposite of Sex, The  | Cider House Rules, The  | Elizabeth  | Big Kahuna, The  | Keeping the Faith  | Winslow Boy, The  | Chuck & Buck  | East is East  | Me Myself I  | Wings of the Dove, The  | Sunshine  | Celebration, The (Festen)  | Boile

In [74]:
topic = 6
top_words = beta.loc[topic].sort_values(ascending=False).index
genre_count = pd.DataFrame(np.zeros(len(genres)), index = genres)

if topic < 10:
    print topic, " :" ,
else:
    print topic , ":" , 

for i in xrange(200):
    movie_name =  movies.loc[top_words[i]]["Title"]
    if movie_name[-1] == ')':
        print movie_name[:-7], '(', beta.loc[topic][top_words[i]].round(3),  ') |', 
    else:
        print movie_name, ' |', 

print
print

6  : Toy Story 2 ( 0.011 ) | Bug's Life, A ( 0.011 ) | Toy Story ( 0.011 ) | Titanic ( 0.01 ) | Lion King, The ( 0.01 ) | Apollo 13 ( 0.009 ) | Aladdin ( 0.009 ) | Beauty and the Beast ( 0.009 ) | Men in Black ( 0.008 ) | Forrest Gump ( 0.008 ) | Truman Show, The ( 0.008 ) | Babe ( 0.008 ) | You've Got Mail ( 0.007 ) | Mulan ( 0.007 ) | Sleepless in Seattle ( 0.007 ) | Jurassic Park ( 0.007 ) | Ghost ( 0.007 ) | Galaxy Quest ( 0.007 ) | Antz ( 0.006 ) | Mrs. Doubtfire ( 0.006 ) | Groundhog Day ( 0.006 ) | Pleasantville ( 0.006 ) | While You Were Sleeping ( 0.006 ) | October Sky ( 0.006 ) | Parent Trap, The ( 0.006 ) | As Good As It Gets ( 0.006 ) | Mr. Holland's Opus ( 0.006 ) | Braveheart ( 0.005 ) | Hunchback of Notre Dame, The ( 0.005 ) | Good Will Hunting ( 0.005 ) | Sixth Sense, The ( 0.005 ) | Tarzan ( 0.005 ) | Shakespeare in Love ( 0.005 ) | League of Their Own, A ( 0.005 ) | Star Wars: Episode I - The Phantom Menace ( 0.005 ) | Edward Scissorhands ( 0.005 ) | Ever After: A Cin

In [None]:
fig, ax = plt.subplots(figsize=(20,10))

ind = np.arange(len(genres)) 
width = 1.0

topic = 29

num_items = 50

top_words = beta.loc[topic].sort_values(ascending=False).index
genre_count = pd.DataFrame(np.zeros(len(genres)), index = genres)

for i in xrange(num_items):
    for gn in movie_names.loc[movies[top_words[i]]][1].split('|'):
        genre_count.loc[gn] = genre_count.loc[gn] + 1

ax.bar(ind, genre_count.values, width=width, color='r')
ind = ind + width

ax.set_ylabel('Count')
ax.set_xlabel('Genre')
ax.set_title('Genre Counts per Topic')
ax.set_xticks(ind-0.5)
xlabels = []
for item in genres:
    xlabels.append(item)
ax.set_xticklabels(xlabels)
plt.show()

for i in xrange(num_items):
        print movie_names.loc[movies[top_words[i]]][0][:-7], ' |', 