In [1]:
import math
import os
import time

import numpy as np
import pandas as pd
import scipy.sparse as sparse

In [2]:
# Selects which data set the notebook works on: True -> paths under "ml-latest",
# False -> paths under "ml-latest-small" (see the path configuration that branches on it).
LARGE_DATASET = False

In [3]:
# Locations of the input CSV files and of the computed profiles.
#   movie_profiles_path - a single file holding all movie profiles
#   user_profiles_path  - directory prefix (note the trailing "/") for per-user files
# NOTE(review): the profile locations are absolute paths on one specific machine, and
# only the small variant reads its ratings from a "split/" sub-directory - confirm
# both are intended before running elsewhere.
if not LARGE_DATASET:
    movie_path = "ml-latest-small/movies.csv"
    tags_path = "ml-latest-small/tags.csv"
    ratings_path = "ml-latest-small/split/ratings_learning.csv"
    movie_profiles_path = "/media/karol/E8B625B3B6258364/calculated-small/movie_profiles"
    user_profiles_path = "/media/karol/E8B625B3B6258364/calculated-small/user_profiles/"
else:
    movie_path = "ml-latest/movies.csv"
    tags_path = "ml-latest/tags.csv"
    ratings_path = "ml-latest/ratings_learning.csv"
    movie_profiles_path = "/media/karol/E8B625B3B6258364/calculated/movie_profiles"
    user_profiles_path = "/media/karol/E8B625B3B6258364/calculated/user_profiles/"

In [4]:
def group(data_frame, group_by):
    """Group ``data_frame`` by ``group_by`` without sorting or re-indexing.

    Args:
        data_frame: the pandas DataFrame to group.
        group_by: column name(s) forwarded to ``DataFrame.groupby``.

    Returns:
        The GroupBy object created with ``as_index=False`` (group keys stay
        regular columns after aggregation) and ``sort=False`` (groups keep
        their order of first appearance).
    """
    groupby_options = {"as_index": False, "sort": False}
    return data_frame.groupby(group_by, **groupby_options)

In [5]:
def read_user_profile(userId):
    """Load the stored profile of a single user.

    The profile is read from ``<user_profiles_path><userId // 1000>/<userId>``.
    The file is parsed without a header row; columns 1, 2 and 3 are kept and
    returned under the names ``tag``, ``u_weight`` and ``userId``.

    Args:
        userId: integer id of the user whose profile should be loaded.

    Returns:
        A DataFrame with the columns ``tag``, ``u_weight`` and ``userId``.
    """
    bucket = str(userId // 1000)
    profile_file = user_profiles_path + bucket + "/" + str(userId)
    raw_profile = pd.read_csv(profile_file, header=None, usecols=[1, 2, 3])
    column_names = {1: "tag", 2: "u_weight", 3: "userId"}
    return raw_profile.rename(columns=column_names)

# Załadowanie oraz przygotowanie danych

In [6]:
# Load the raw inputs; the paths come from the configuration cell.
movies_data = pd.read_csv(movie_path)
tags_data = pd.read_csv(tags_path)
ratings_data = pd.read_csv(ratings_path)

# Lower-case every tag so that e.g. "Funny" and "funny" count as the same tag.
# str() is applied first so non-string values (such as NaN) are stringified
# rather than raising. One vectorised assignment replaces the per-row
# DataFrame.set_value loop (set_value was removed in pandas 1.0).
tags_data["tag"] = tags_data["tag"].map(lambda tag: str(tag).lower())

# Distinct ids of the users that appear in the ratings (a Series).
user_ids = ratings_data['userId'].drop_duplicates()

In [7]:
# Distinct ids of the movies that carry at least one tag (a Series).
mvs = tags_data.loc[:, 'movieId'].drop_duplicates()
# One-column DataFrame holding every distinct tag.
unique_tags = tags_data.loc[:, ['tag']].drop_duplicates()

In [None]:
# Load the stored movie profiles from movie_profiles_path and rename their
# 'weight' column to 'm_weight'. The with-block guarantees the file handle is
# closed once the CSV has been parsed.
with open(movie_profiles_path, 'r') as mf:
    movie_profiles = pd.read_csv(mf).rename(columns={'weight': 'm_weight'})

def get_movie_profile(movieId):
    """Return the rows of ``movie_profiles`` whose ``movieId`` equals ``movieId``.

    Args:
        movieId: id of the movie to look up.

    Returns:
        A DataFrame with the matching rows (empty if the movie has no profile).
    """
    belongs_to_movie = movie_profiles['movieId'] == movieId
    return movie_profiles[belongs_to_movie]

# Tagi: Obliczenie TF-IDF

In [10]:
# movie_tag_count: how many times a given tag was used to describe a given movie.
# count() tallies the non-null values per (movieId, tag) group; the userId tally
# is kept and exposed as 'tag_count'.
movie_tag_count = (
    group(tags_data, ['movieId', 'tag'])
    .count()
    .rename(columns={'userId': 'tag_count'})
    .loc[:, ['movieId', 'tag', 'tag_count']]
)

In [11]:
# Each (tag, movie) pair once, regardless of how many users applied the tag.
distinct_tags = tags_data.loc[:, ['tag', 'movieId']].drop_duplicates()

# tags_occurence: number of movies that were described with a given tag
tags_occurence = (
    group(distinct_tags, ['tag'])
    .count()
    .rename(columns={'movieId': 'tag_count'})
    .loc[:, ['tag', 'tag_count']]
)

In [12]:
# document_count: number of distinct movies that appear in the tag data
document_count = len(tags_data['movieId'].drop_duplicates())

# DF['IDF']: inverse document frequency, log10(N / number of movies using the tag).
# NOTE: DF is the same object as tags_occurence, so the IDF column is added to
# tags_occurence as well.
DF = tags_occurence
DF['IDF'] = np.log10(document_count / DF['tag_count'])

# TF: term frequency - how many times the tag was applied to the movie
TF = movie_tag_count.rename(columns={'tag_count': 'TF'})

# compute the tf-idf value of every (movie, tag) pair
TF_IDF = pd.merge(TF, DF, on='tag', how='left', sort=False)
TF_IDF['TF-IDF'] = TF_IDF['TF'] * TF_IDF['IDF']
# The axis is passed by keyword: the positional form drop(labels, 1) is rejected
# by pandas >= 2.0.
TF_IDF = TF_IDF.drop(['TF', 'IDF', 'tag_count'], axis=1)

# Tags: Normalize vectors to unit length

In [13]:
# Compute the Euclidean length of the tf-idf vector of every movie.
# (axis is passed by keyword throughout: drop(labels, 1) fails on pandas >= 2.0.)
V = TF_IDF.drop('tag', axis=1)
V['V'] = V['TF-IDF'] ** 2

# Sum of squares per movie, then the square root gives the vector length.
V = group(V, ['movieId']).sum().drop('TF-IDF', axis=1)
V['V'] = np.sqrt(V['V'])

# Divide every vector by its length to obtain unit-length vectors;
# this later simplifies computing the cosine distance.
TF_IDF = pd.merge(TF_IDF, V, on='movieId', how='left', sort=False)
TF_IDF['weight'] = TF_IDF['TF-IDF'] / TF_IDF['V']
TF_IDF = TF_IDF.drop(['V', 'TF-IDF'], axis=1)

The normalized movie profiles are saved to a file for later calculations.

In [14]:
# Persist the normalised movie profiles to movie_profiles_path (without the index).
# The with-block closes the file, so the data is fully flushed to disk before
# anything tries to read it back.
with open(movie_profiles_path, 'w') as f:
    TF_IDF.to_csv(f, index=False)

# Users: Calculate user profiles

In [19]:
# Read the ratings from ratings_path again (the same file that is loaded in the
# data-preparation cell), so this section starts from the unmodified ratings.
ratings_data = pd.read_csv(ratings_path)

In [20]:
# One-column DataFrame with the distinct user ids, in order of first appearance.
user_ids = ratings_data.loc[:, ['userId']].drop_duplicates()
# GroupBy handle over the ratings of each user.
users_ratings = group(ratings_data, ['userId'])

Obliczenie profilu użytkownika polega na zebraniu wszystkich filmów, które dany użytkownik ocenił. Następnie dla każdego z tych filmów pobierane są wszystkie jego tagi, a waga każdego taga jest mnożona przez różnicę między oceną tego filmu a średnią ocen tego użytkownika. Na koniec otrzymane wagi są sumowane osobno dla każdego taga.

In [21]:
# Build one profile file per user at <user_profiles_path><userId // 1000>/<userId>.
# A user's profile is, for every tag, the sum over the movies the user rated of
#   (movie's tag weight) * (user's rating of the movie - user's mean rating).
start = time.time()
n = 1000  # next user-id threshold at which progress is reported

# A single groupby pass hands over each user's ratings directly instead of
# re-filtering the whole ratings table once per user. sort=False keeps the users
# in order of first appearance.
for userId, ratings in ratings_data.groupby('userId', sort=False):
    directory = user_profiles_path + str(userId // 1000)
    os.makedirs(directory, exist_ok=True)

    # Centre the ratings on the user's mean so below-average ratings count negatively.
    mean = ratings['rating'].mean()
    user_data = ratings.drop(['timestamp', 'userId'], axis=1)
    user_data['uweight'] = user_data['rating'] - mean

    # Attach the tag weights of every rated movie and scale them by the centred rating.
    user_data1 = pd.merge(TF_IDF, user_data, on='movieId', how='inner', sort=False)
    user_data1['weight'] = user_data1['uweight'] * user_data1['weight']

    # Only 'tag' and 'weight' are needed, so the other columns are left out of the sum.
    user_profile = group(user_data1[['tag', 'weight']], ['tag']).sum()
    user_profile['userId'] = userId

    # No header row is written: read_user_profile parses these files with
    # header=None, so a header would be read back as a data row. The index is
    # kept as column 0 because the reader selects columns 1-3.
    with open(directory + '/' + str(userId), 'w') as f:
        user_profile.to_csv(f, header=False)

    # Report the time taken each time the user id passes the next threshold.
    if userId > n:
        n += 1000
        print(time.time() - start)
        start = time.time()
        print(userId)
end = time.time()
print(end - start)

5.594572305679321


Profile wszystkich 270 tys. użytkowników zapisane w postaci plików .csv zajmowały ponad 36 GB, a ich obliczenie trwało ponad 3,5 godziny.