In [1]:
import os
import pickle
import scipy
import sklearn
import csv
import pandas as pd
from tqdm import tqdm
import numpy as np

def load_item_dataset(path_to_dataset):
    dataset = []
    for movie_features in os.listdir(path_to_dataset):
        movie = {}
        
        
        
        movie_id = movie_features.lstrip('0')
        movie_id = movie_id[:-2]
        movie['movie_id'] = movie_id
        
        infile = open(f"{path_to_dataset}/{movie_features}", "rb")
        movie_data = pickle.load(infile)
        infile.close()       
        movie['data'] = movie_data
        
        
        
        dataset.append(movie)
    
    return dataset
    
def load_movielens(path_to_data):
    as_df = pd.read_csv(path_to_data)
    return as_df


def labels_to_string(movie):
    movie_words = ''
    for frame in movie['data']:
        word = frame[0][1]
        movie_words += f' {word}'
        
    return movie_words

def labels_to_list(movie):
    movie_labels = []
    for frame in movie['data']:
        word = frame[0][1].replace('_', '')
        movie_labels.append(word)
        
    return movie_labels

def words_to_vocab(words):
    vocab = {}
    for word in words:
        if word not in vocab.keys():
            vocab[word] = 1
        else:
            vocab[word] += 1
    return vocab

from sklearn.feature_extraction.text import CountVectorizer

def get_all_words(label_dataset):    
    all_words = []
    for movie in label_dataset:
        movie_words = labels_to_string(movie)            
        all_words.append(movie_words)
        
    return all_words

def create_label_features(movie):
  label_dict = dict()
  for frame in movie['data']:
    word = frame[0][1]
    if word not in label_dict.keys():
      label_dict[word] = np.array([frame[0][2]])
    else:
      label_dict[word] = np.append(label_dict[word], frame[0][2])

  for word in label_dict.keys():
    label_dict[word] = label_dict[word].mean()
  
  return {'movie_id':movie['movie_id'], 'labels':label_dict}
          

In [None]:
# Load datasets

#features_dataset = load_item_dataset('../data/features')
labels_dataset = load_item_dataset('D:/Masterprosjekt/Labels')
ratings_dataset = load_movielens('../data/ml-20m/ratings.csv')
ratings_dataset = ratings_dataset.drop(columns=['timestamp'])

In [5]:
pickle.dump(labels_dataset, (open('../data/labels_dataset_raw.p', 'wb')))
pickle.dump(ratings_dataset, (open('../data/ratings_dataset_raw.p', 'wb')))

In [None]:
# Labels confidence
labels_dataset = load_item_dataset('D:/Masterprosjekt/Labels')
labels_dataset = map(create_label_features, labels_dataset)

labels_dataset = pd.DataFrame.from_records(labels_dataset)

In [4]:
# Remove interactions (ratings) for all movies not in the labels dataset

unique = ratings_dataset.movieId.unique()
unique = list(map(str, unique))
# Find all movies not present in the ratings dataset
for movie in labels_dataset:
    if movie['movie_id'] in unique:
        unique.remove(movie['movie_id'])

# Remove all movies not present in the ratings dataset
for value in tqdm(unique):
    indexNames = ratings_dataset[ratings_dataset['movieId'] == int(value)].index
    ratings_dataset.drop(indexNames, inplace=True)


  0%|          | 0/26734 [00:00<?, ?it/s]  0%|          | 1/26734 [00:05<41:22:04,  5.57s/it]  0%|          | 2/26734 [00:17<55:32:21,  7.48s/it]  0%|          | 3/26734 [00:26<57:45:21,  7.78s/it]  0%|          | 4/26734 [00:33<56:39:45,  7.63s/it]  0%|          | 5/26734 [00:39<54:27:35,  7.33s/it]  0%|          | 6/26734 [00:45<49:32:04,  6.67s/it]  0%|          | 7/26734 [00:50<46:31:56,  6.27s/it]  0%|          | 8/26734 [00:56<45:58:16,  6.19s/it]  0%|          | 9/26734 [01:01<43:49:42,  5.90s/it]  0%|          | 10/26734 [01:06<41:50:50,  5.64s/it]  0%|          | 11/26734 [01:14<46:00:43,  6.20s/it]  0%|          | 12/26734 [01:19<45:04:47,  6.07s/it]  0%|          | 13/26734 [01:25<42:59:06,  5.79s/it]  0%|          | 13/26734 [01:27<50:04:40,  6.75s/it]


KeyboardInterrupt: 

In [49]:
# Remove all movies not present in labels dataset

unique = ratings_dataset.movieId.unique()

unique = set(map(str, unique))

In [61]:
print(len(labels_dataset))
for movie in labels_dataset:
    if movie['movie_id'] not in unique:
        print(movie['movie_id'])
        labels_dataset.remove(movie)


3254


In [63]:
full_vocab = {}


for movie in labels_dataset:
    vocab = words_to_vocab(labels_to_list(movie))
    movie['corpus'] = vocab


for corpus in (movie['corpus'] for movie in labels_dataset):
    for word in corpus.keys():
        if word not in full_vocab.keys():
            full_vocab[word] = 1
        else: 
            full_vocab[word] += corpus[word]


In [3]:
# Vectorize labels

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(get_all_words(labels_dataset)).toarray()

In [4]:
# Normalize vectors

import sklearn.preprocessing as preprocessing

scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
X_scaled = scaler.fit_transform(X)

In [26]:
# Load list of vectors for each movie into labels dataset in column named 'labels'
labels_dataset['labels'] = labels_dataset['labels'].astype('object')
i = 0
for movie in X_scaled:
    labels_dataset.at[i , 'labels'] = movie.tolist()
    i += 1

In [10]:
# Load vectors into labels dataset where each possible label has its own column

i = 0
for movie in X_scaled:
    y = 0
    for label in movie:
        labels_dataset[i][f'{y}'] = label
        y += 1
    i+=1

In [41]:
# Convert labels dataset into DataFrame

labels_dataset = pd.DataFrame.from_records(labels_dataset)
labels_dataset = labels_dataset.drop(columns='data')

In [65]:
for word in full_vocab.keys():
    labels_dataset[word] = 0

for index, row in labels_dataset.iterrows():
    for word in labels_dataset.keys():
        if word in row['corpus'].keys():
            # labels_dataset.at[index, word] = row['corpus'][word]
            labels_dataset.at[index, word] = 1
            
labels_dataset = labels_dataset.drop(columns='corpus')

In [126]:
pickle.dump(labels_dataset, (open('../data/labels_dataset.p', 'wb')))
pickle.dump(ratings_dataset, (open('../data/ratings_dataset.p', 'wb')))


In [2]:
infile = open("../data/labels_dataset.p", "rb")
labels_dataset = pickle.load(infile)
infile.close()
infile = open("../data/ratings_dataset.p", "rb")
ratings_dataset = pickle.load(infile)
infile.close()

In [67]:
# Split ratings dataset into train and test

from sklearn.model_selection import train_test_split

train, test = train_test_split(ratings_dataset, test_size=0.2)

In [68]:
# Create lightfm Dataset
# Create ID mappings between users and movies

from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((row['userId'] for index, row in ratings_dataset.iterrows()), 
            (row['movieId'] for index, row in ratings_dataset.iterrows()))

In [69]:
print(labels_dataset.shape)

(3254, 985)


In [70]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 610, num_items 3254.


In [77]:
columns = labels_dataset.columns
possible_features = []
for i in range(len(columns)):
    if(columns[i]) != 'movie_id':
        possible_features.append(f'{columns[i]}:0')
        possible_features.append(f'{columns[i]}:1')
        
dataset.fit_partial(item_features = possible_features) 

In [27]:
# Add item features to the lightfm Dataset when labels are given as a 
# list for each movie in labels_dataset['labels']

dataset.fit_partial(
            item_features = (row['labels'] for index, row in labels_dataset.iterrows())
        )

TypeError: unhashable type: 'list'

In [20]:
# Add item features to the lightfm Dataset when each possible label 
# has its own column in labels_dataset

for column in labels_dataset:
    if column != 'movie_id': 
        dataset.fit_partial(
            items = (row['movie_id'] for index, row in labels_dataset.iterrows()),
            item_features = (row[str(column)] for index, row in labels_dataset.iterrows())
        )


In [78]:
# Build the interaction matrix for the lightfm Dataset

(interactions, weights) = dataset.build_interactions(((row['userId'], row['movieId'], row['rating']) for index, row in ratings_dataset.iterrows()))

print(repr(interactions))

<610x3254 sparse matrix of type '<class 'numpy.int32'>'
	with 43381 stored elements in COOrdinate format>


In [125]:
pickle.dump(dataset, (open('../data/lightfm_dataset.p', 'wb')))
pickle.dump(interactions, (open('../data/lightfm_interactions.p', 'wb')))
pickle.dump(weights, (open('../data/lightfm_weights.p', 'wb')))

In [3]:
infile = open("../data/lightfm_dataset.p", "rb")
dataset = pickle.load(infile)
infile.close()
infile = open("../data/lightfm_interactions.p", "rb")
interactions = pickle.load(infile)
infile.close()
infile = open("../data/lightfm_weights.p", "rb")
weights = pickle.load(infile)
infile.close()

In [None]:
# Build the item features matrix for the lightfm Dataset when labels are given as a 
# list for each movie in labels_dataset['labels']

item_features = dataset.build_item_features(((row['movie_id'], row['labels'])
                                              for index, row in labels_dataset.iterrows()))

In [20]:
columns = labels_dataset.columns
item_tuples = []
for i, r in labels_dataset.iterrows():
    features = []
    movie_id = r['movie_id']
    for y in range(len(columns)):
        if(columns[y]) != 'movie_id':
            features.append(f'{columns[y]}:{labels_dataset.at[i, columns[y]]}')
    
    item_tuples.append((int(movie_id), features))


In [21]:
# Build the item features matrix for the lightfm Dataset when each possible label 
# has its own column in labels_dataset 

item_features = dataset.build_item_features(item_tuples, normalize=False)

In [53]:
pickle.dump(item_features, open('../data/item_features.p', 'wb'))

In [109]:
item_features.todense()

matrix([[1., 0., 0., ..., 0., 1., 0.],
        [0., 1., 0., ..., 0., 1., 0.],
        [0., 0., 1., ..., 0., 1., 0.],
        ...,
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)

In [110]:
from sklearn.model_selection import train_test_split

interactions_csr = interactions.tocsr()

train, test = train_test_split(interactions_csr, test_size=0.2)
train = train.tocoo()
test = test.tocoo()

In [42]:
from lightfm.cross_validation import random_train_test_split

train, test = random_train_test_split(interactions, test_percentage=0.2)

In [43]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

# Set the number of threads; you can increase this
# ify you have more physical cores available.
NUM_THREADS = 2
NUM_COMPONENTS = 30
NUM_EPOCHS = 3
ITEM_ALPHA = 1e-6

model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

%time model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

Wall time: 2.68 s


In [44]:
train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
print('Rating based train AUC: %s' % train_auc)
train_precision = precision_at_k(model, 
                                 train,
                                 num_threads=NUM_THREADS).mean()
print('Rating based train precision: %s' % train_precision)

Rating based train AUC: 0.9324894
Rating based train precision: 0.40786886


In [45]:
test_auc = auc_score(model, test, train_interactions=train, num_threads=NUM_THREADS).mean()
print('Rating based test AUC: %s' % test_auc)
test_precision = precision_at_k(model, 
                                 test, 
                                 train_interactions=train,
                                 num_threads=NUM_THREADS).mean()
print('Rating based test precision: %s' % test_precision)

Rating based test AUC: 0.9090621
Rating based test precision: 0.17487438


In [47]:
# Set biases to zero
model.item_biases *= 0.0

test_auc = auc_score(model, test, num_threads=NUM_THREADS).mean()
test_precision = precision_at_k(model, 
                                 test, 
                                 train_interactions=train,
                                 num_threads=NUM_THREADS).mean()
print('Rating based test AUC: %s' % test_auc)
print('Rating based test precision: %s' %test_precision)

Rating based test AUC: 0.89363015
Rating based test precision: 0.14288108


In [48]:
# Define a new model instance
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model = model.fit(train,
                item_features=item_features,
                epochs=NUM_EPOCHS,
                num_threads=NUM_THREADS)

In [49]:
# Don't forget the pass in the item features again!
train_auc = auc_score(model,
                      train,
                      item_features=item_features,
                      num_threads=NUM_THREADS).mean()
print('Hybrid training set AUC: %s' % train_auc)
train_precision = precision_at_k(model, 
                                 train, 
                                 item_features=item_features,
                                 num_threads=NUM_THREADS).mean()
print('Hybrid training set precision: %s' % train_precision)

Hybrid training set AUC: 0.85032
Hybrid training set precision: 0.18606557


In [51]:
test_auc = auc_score(model,
                    test,
                    train_interactions=train,
                    item_features=item_features,
                    num_threads=NUM_THREADS).mean()
print('Hybrid test set AUC: %s' % test_auc)
test_precision = precision_at_k(model, 
                                 test, 
                                 train_interactions=train,
                                 item_features=item_features,
                                 num_threads=NUM_THREADS,
                                k=10).mean()
print('Hybrid test set precision: %s' % test_precision)

Hybrid test set AUC: 0.75393724
Hybrid test set precision: 0.054606363


In [None]:
train_precision = precision_at_k(model, train, item_features=item_features, k=10).mean()
test_precision = precision_at_k(model, test, item_features=item_features, k=10).mean()

train_auc = auc_score(model, train, item_features=item_features).mean()
test_auc = auc_score(model, test, item_features=item_features).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))


In [18]:
infile = open('Output.txt', 'w')
for columns in labels_dataset.columns:
    if columns != 'movie_id':
        infile.write(rf"row['{columns}'], ")
infile.close()

In [56]:
infile = open("../data/labels_dataset.p", "rb")
test = pickle.load(infile)
infile.close()

test.columns

Index(['movie_id', 'bowtie', 'spotlight', 'vendingmachine', 'firescreen',
       'sax', 'seatbelt', 'matchstick', 'planetarium', 'abaya',
       ...
       'carbonara', 'manholecover', 'jay', 'tigerbeetle', 'cricket',
       'soupbowl', 'pizza', 'fig', 'gartersnake', 'ruffedgrouse'],
      dtype='object', length=985)