In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Directory holding the MovieLens-1M files, resolved relative to the kernel's
# working directory.
ml1m_dir = os.path.join(os.getcwd(), "..", "datasets", "ml1m")

# ratings.dat: one "UserID::MovieID::Rating::Timestamp" record per line.
# The two-character separator is treated as a regex, hence engine='python'.
path = os.path.join(ml1m_dir, "ratings.dat")
ratings = pd.read_table(path, sep="::", names=["UserID", 'MovieID', 'Rating', 'Timestamp'],
                        header=None, engine='python').drop('Timestamp', axis=1)

# Reshape to a Series indexed by (UserID, MovieID):
#   pivot      -> UserID x MovieID matrix (NaN where no rating exists)
#   replace    -> a rating of 0 is treated as "no rating"
#   transpose + unstack -> flatten with UserID as the outer index level
#   dropna     -> keep only the pairs that actually have a rating
# Keyword arguments are used for pivot because positional ones are rejected by
# recent pandas releases.
ratings = (ratings.pivot(index='UserID', columns='MovieID', values='Rating')
           .replace(0, np.nan)
           .transpose()
           .unstack()
           .dropna())

# movies.dat: "MovieID::Title::Genre", Genre being a '|'-separated list.
path = os.path.join(ml1m_dir, "movies.dat")
movies = pd.read_table(path, sep="::", names=["MovieID", 'Title', 'Genre'],
                       header=None, engine='python')

# users.dat: "UserID::Gender::Age::Occupation::Zip-code".
path = os.path.join(ml1m_dir, "users.dat")
users = pd.read_table(path, sep="::", names=["UserID", "Gender", "Age", "Occupation", 'Zip-code'],
                      header=None, engine='python')

In [3]:
# Demographic pseudo-items appended to the movie catalogue: one per gender
# (empty suffix) plus one per gender/age-bracket combination. Each entry is
# [MovieID, Title, Genre] with an empty Genre.
age_suffixes = ["", " - Under 18", " - 18-24", " - 25-34",
                " - 35-44", " - 45-49", " - 50-55", " - 56+"]

# The pseudo-items receive the MovieIDs immediately following the current maximum.
mov_id = movies['MovieID'].max()

male_ages = []
for suffix in age_suffixes:
    mov_id = mov_id + 1
    male_ages.append([mov_id, "Male" + suffix, ""])

female_ages = []
for suffix in age_suffixes:
    mov_id = mov_id + 1
    female_ages.append([mov_id, "Female" + suffix, ""])

# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
# NOTE: this cell rebinds `movies`; running it twice appends the 16 rows twice.
movies = pd.concat(
    [movies, pd.DataFrame(male_ages + female_ages, columns=['MovieID', "Title", "Genre"])],
    ignore_index=True)

In [4]:
def get_gender_age_item(user_row):
    """Return the demographic pseudo-item title for one user record.

    `user_row` is any mapping-like object exposing "Gender" and "Age" keys
    (for example a row of the `users` frame). The result is the gender label
    ("Male" for "M", "Female" for "F") followed by the age-bracket suffix for
    the coded age. A gender or age code that is not recognised contributes an
    empty string, so the result may be partial or empty.
    """
    gender_labels = {"M": "Male", "F": "Female"}
    age_labels = {
        1: " - Under 18",
        18: " - 18-24",
        25: " - 25-34",
        35: " - 35-44",
        45: " - 45-49",
        50: " - 50-55",
        56: " - 56+",
    }

    gender_part = gender_labels.get(user_row["Gender"], "")
    age_part = age_labels.get(user_row["Age"], "")
    return gender_part + age_part
# Quick check: show the pseudo-item title for the `users` row labelled 1.
get_gender_age_item(users.loc[1])

'Male - 56+'

In [5]:
# Collect every distinct token found in the '|'-separated Genre column.
# A row whose Genre is the empty string contributes the empty token ''.
genres = set()
for genre_field in movies['Genre']:
    genres.update(genre_field.split('|'))

In [7]:
path = os.path.join(os.getcwd(),"..","datasets", "ml1m.dat")

# Map each MovieID / Title to the row label of its first occurrence in `movies`.
# Built once so the loops below do a dict lookup instead of scanning the whole
# frame for every user and every rating.
movie_index_by_id = {}
movie_index_by_title = {}
for movie_index, movie_id, title in zip(movies.index, movies["MovieID"], movies["Title"]):
    movie_index_by_id.setdefault(movie_id, movie_index)
    movie_index_by_title.setdefault(title, movie_index)

# Output layout:
#   line 1      : "<number of movies> <index>:1.0 <index>:1.0 ... \n"
#   other lines : "<token count> <index>:<weight> ... \n", one line per user,
#                 starting with the user's gender/age pseudo-item at weight 5.0
#                 followed by every movie the user rated, weighted by the rating.
# The `with` block guarantees the file is closed even if a lookup fails midway.
with open(path, "w") as fo:
    header_tokens = [str(movie_index) + ":" + str(1.0) for movie_index in movies.index]
    fo.write(str(movies.shape[0]) + " " + "".join(token + ' ' for token in header_tokens) + "\n")

    for user_index, user_row in users.iterrows():
        user_id = user_row['UserID']

        # A KeyError here means the pseudo-item title is missing from `movies`.
        gender_age = movie_index_by_title[get_gender_age_item(user_row)]
        tokens = [str(gender_age) + ":" + str(5.0)]

        # `ratings` is indexed by (UserID, MovieID); selecting a user leaves a
        # Series of that user's ratings keyed by MovieID.
        user_ratings = ratings.loc[user_id]
        for movie_id, rating in zip(user_ratings.index, user_ratings.values):
            tokens.append(str(movie_index_by_id[movie_id]) + ':' + str(rating))

        # Every token, including the last, is followed by a single space.
        fo.write(str(len(tokens)) + " " + " ".join(tokens) + " \n")

In [7]:
path = os.path.join(os.getcwd(), "..", "param", "ml1m", "beta.dat")
# One row of space-separated weights per topic; column labels are used below as
# row labels into `movies`.
beta = pd.read_table(path, sep=" ", header = None)

num_topics = 50          # topics (rows of beta) to list
movies_per_topic = 100   # highest-weighted movies shown for each topic

for topic in range(num_topics):
    # Column labels of beta ordered from highest to lowest weight for this topic.
    top_words = beta.loc[topic].sort_values(ascending=False).index

    # Single-digit topics get an extra space so the colons line up.
    pieces = [str(topic), " :" if topic < 10 else ":"]

    for i in range(movies_per_topic):
        movie_name = movies.loc[top_words[i]]["Title"]
        # Titles ending in ')' carry a 7-character " (YYYY)" suffix; drop it.
        if movie_name[-1] == ')':
            movie_name = movie_name[:-7]
        pieces.append(movie_name)
        pieces.append(' |')

    # The whole line is assembled first and printed with a single-argument
    # print(...), which behaves the same under Python 2 and Python 3.
    print(" ".join(pieces))
    print("")

0  : Terminator, The  | Die Hard  | Aliens  | Blade Runner  | Rocky  | Shining, The  | Jaws  | Untouchables, The  | Full Metal Jacket  | Godfather, The  | Raging Bull  | This Is Spinal Tap  | Godfather: Part II, The  | Alien  | Mad Max 2 (a.k.a. The Road Warrior)  | Deliverance  | Raiders of the Lost Ark  | Taxi Driver  | Platoon  | Monty Python and the Holy Grail  | Blazing Saddles  | Witness  | Star Wars: Episode IV - A New Hope  | Exorcist, The  | Clockwork Orange, A  | Apocalypse Now  | Silence of the Lambs, The  | E.T. the Extra-Terrestrial  | Blues Brothers, The  | Close Encounters of the Third Kind  | Poltergeist  | Terminator 2: Judgment Day  | Halloween  | Lethal Weapon  | Raising Arizona  | Midnight Express  | Airplane!  | Animal House  | Carrie  | Mad Max  | Omen, The  | Marathon Man  | Unforgiven  | Right Stuff, The  | Willy Wonka and the Chocolate Factory  | Goldfinger  | Fish Called Wanda, A  | Young Frankenstein  | Beetlejuice  | Deer Hunter, The  | Stand by Me  | Midnig

In [None]:
fig, ax = plt.subplots(figsize=(20,10))

topic = 29       # topic (row of beta) to inspect
num_items = 50   # number of highest-weighted items counted and listed
width = 1.0

# Fix the genre order once so the bars and their tick labels cannot disagree.
genre_labels = sorted(genres)
ind = np.arange(len(genre_labels))

# Rows of `movies` for the highest-weighted items of the topic. The lookup goes
# through movies.loc[...] exactly like the topic listing cell does; the earlier
# `movie_names` name was never defined in this notebook.
top_words = beta.loc[topic].sort_values(ascending=False).index
top_movies = movies.loc[top_words[:num_items]]

# Single-column frame (column label 0) holding the number of top items per genre.
genre_count = pd.DataFrame(np.zeros(len(genre_labels)), index = genre_labels)
for genre_field in top_movies["Genre"]:
    for gn in genre_field.split('|'):
        genre_count.loc[gn, 0] += 1

# Heights are passed as a 1-D array, and align='center' is explicit so that the
# ticks at `ind` sit under the bars whatever the matplotlib default alignment is.
ax.bar(ind, genre_count[0].values, width=width, color='r', align='center')

ax.set_ylabel('Count')
ax.set_xlabel('Genre')
ax.set_title('Genre Counts per Topic')
ax.set_xticks(ind)
ax.set_xticklabels(genre_labels)
plt.show()

# List the same items; a trailing " (YYYY)" is removed only when the title
# actually ends with ')', so pseudo-item titles are printed in full.
titles = []
for movie_name in top_movies["Title"]:
    if movie_name[-1] == ')':
        movie_name = movie_name[:-7]
    titles.append(movie_name)
print(" ".join(title + "  |" for title in titles))