In [None]:
'''
Bygge datasett (dtype=vocabulary) med interaction matrix, 
item feature matrix, feature list.

Interaction matrix: 
    dtype = csr matrix 
    index = userId, columns = movieId, values = rating(weight)
    Se på hvordan weights fungerer og normaliser rating utifra det
      create_interaction_matrix() fra recsys cookbook:
          Mulighet for å velge threshold - hva som regnes som pos/neg
Item feature matrix:
    dtype = csr matrix
    index = movieId, columns = features/labels, values = binært? normalisert?
Item features:
    liste over alle item features/labels
    Må korrespondere med kolonnene i item feature matrix

'''

'\nBygge datasett (dtype=vocabulary) med interaction matrix, \nitem feature matrix, feature list.\n\nInteraction matrix: \n    dtype = csr matrix \n    index = userId, columns = movieId, values = rating(weight)\n    Se på hvordan weights fungerer og normaliser rating utifra det\n      create_interaction_matrix() fra recsys cookbook:\n          Mulighet for å velge threshold - hva som regnes som pos/neg\nItem feature matrix:\n    dtype = csr matrix\n    index = movieId, columns = features/labels, values = binært? normalisert?\nItem features:\n    liste over alle item features/labels\n    Må korrespondere med kolonnene i item feature matrix\n\n'

In [None]:
!pip install lightfm
!pip install pandas==1.1.0 as pd



In [None]:
'''
for data profiling
import sys
!{sys.executable} -m pip install -U pandas-profiling[notebook]
!jupyter nbextension enable --py widgetsnbextension
'''

'\nfor data profiling\nimport sys\n!{sys.executable} -m pip install -U pandas-profiling[notebook]\n!jupyter nbextension enable --py widgetsnbextension\n'

In [None]:
# Mount Google Drive so the data files under /content/drive are reachable.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted - confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Necessary imports
import pandas as pd
import numpy as np
import pickle
from pandas_profiling import ProfileReport
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix
from sklearn.preprocessing import MinMaxScaler
from lightfm.cross_validation import random_train_test_split

In [None]:
data_dir = '/content/drive/My Drive/Master/Data'

# Load item features and interaction data
# Item features:
#    dtype: dict
#    keys: movie_id, data
#    data: list of label lists, one per frame in a movie
#          (3 tuples with labels per frame)
#          [[('feat1_id', 'feat_name', 'confidence'), (feat2_id, ...)], ...]

# NOTE: pickle.load can execute arbitrary code while loading. Only load
# pickle files that you created yourself or otherwise trust.

# Item features
with open(f'{data_dir}/labels_dataset_raw (1).p', 'rb') as infile:
  item_features = pickle.load(infile)

# Subtitle features (loaded here to ensure the same movies in the dsets)
with open(f'{data_dir}/subs_df.p', 'rb') as infile:
  subs_df = pickle.load(infile)

# Alternative loader for the pickled ratings (kept for reference, disabled):
# infile = open(f'{data_dir}/ratings_dataset_raw.p', 'rb')
# interactions_df = pickle.load(infile)
# infile.close()
# infile = open(f'{data_dir}/subs_df.p', 'rb')
# subs_df = pickle.load(infile)
# infile.close()
# movies_df = pd.read_csv(f'{data_dir}/movies (1).csv')


# Movielens 1M Dataset
# The files use "::" as separator; a multi-character separator requires the
# python parsing engine.
movies_df = pd.read_csv(f'{data_dir}/movies.dat', 
                        sep="::", 
                        usecols = [0, 1], 
                        names = ['movieId', 'title'], 
                        engine = 'python')
interactions_df = pd.read_csv(f'{data_dir}/ratings.dat', 
                        sep="::", 
                        usecols = [0, 1, 2], 
                        names = ['userId', 'movieId', 'rating'], 
                        engine = 'python')


In [None]:
'''

infile = open(f'{data_dir}/labels_dataset_raw (1).p', 'rb')
item_features = pickle.load(infile)
infile.close()
interactions_df = pd.read_csv(f'{data_dir}/ratings.csv')
movies_df = pd.read_csv(f'{data_dir}/movies (2).csv')
'''

"\n\ninfile = open(f'{data_dir}/labels_dataset_raw (1).p', 'rb')\nitem_features = pickle.load(infile)\ninfile.close()\ninteractions_df = pd.read_csv(f'{data_dir}/ratings.csv')\nmovies_df = pd.read_csv(f'{data_dir}/movies (2).csv')\n"

In [None]:
# Container for every artefact that is pickled at the end of the notebook
# (feature matrices, interaction matrices, train/test splits, lookups).
final_dataset = {}
# Global label vocabulary (label -> counter); filled through add_to_vocab().
full_vocab = {}

In [None]:

# Takes the frame data of one movie: a list of frames, where every frame is
# a list of label tuples and index 1 of a tuple holds the label name.
# Returns the name of the first label of every frame, in frame order.
def labels_to_list(data):
  return [frame[0][1] for frame in data]
# Counts the words of a list.
# Returns a dict that maps every distinct word to its number of occurrences;
# keys appear in order of first occurrence.
def words_to_vocab(words):
  counts = {}
  for token in words:
    counts[token] = counts.get(token, 0) + 1
  return counts

# Register one more occurrence of `word` in the global `full_vocab` counter
# (a new word starts at 1).
def add_to_vocab(word):
  full_vocab[word] = full_vocab.get(word, 0) + 1

# Creates item features based on confidence.
#
# movie: dict with the keys 'movie_id' and 'data'; 'data' is a list of frames
#        and every frame is a list of label tuples whose index 1 is the label
#        name and index 2 its confidence.
# Only the first label of each frame is used.
# Returns {'movieId': <movie id>, 'labels': {label name: mean confidence}}.
def create_confidence_features(movie):
  # Gather the confidences per label in plain lists first; growing a numpy
  # array with np.append copies the whole array on every call.
  confidences = dict()
  for frame in movie['data']:
    top_label = frame[0]
    confidences.setdefault(top_label[1], []).append(top_label[2])

  label_dict = {word: np.array(values).mean()
                for word, values in confidences.items()}

  return {'movieId':movie['movie_id'], 'labels':label_dict}

def not_in_list(list1, list2):
  # Lazily yields, for every element of list1, True when the element does
  # not occur in list2 and False otherwise (the result is a map iterator).
  def _is_missing(element):
    return element not in list2
  return map(_is_missing, list1)


In [None]:
# Create DataFrame with label features and their respective confidence:
# one row per movie with the columns 'movieId' and 'labels'
# (label name -> mean confidence).

conf_features_df = pd.DataFrame.from_records(
    [create_confidence_features(movie) for movie in item_features]
)
conf_features_df['movieId'] = conf_features_df['movieId'].astype(int)

In [None]:
# convert item features (frequency) to df
item_features_df = pd.DataFrame.from_records(item_features)
item_features_df.rename(columns={'movie_id':'movieId'}, inplace=True)
# movies_df.drop(columns='genres', inplace=True)
item_features_df['movieId'] = item_features_df['movieId'].astype(int)
# Show the first rows (head must be called; printing the attribute only shows
# the bound method).
item_features_df.head()

In [None]:
# 10M dataset:
# Drop half of the movies in the interactions matrix (because of hardware restrictions...)

interactions_movies = pd.DataFrame(interactions_df.movieId.unique()).rename(columns={0:'movieId'})
# Seeded generator so the same movies are dropped on every run.
SUBSAMPLE_SEED = 42
rand = np.random.default_rng(SUBSAMPLE_SEED)
n_movies = len(interactions_movies)
# Sample row positions WITHOUT replacement: drawing with replacement yields
# duplicate positions, so fewer than half of the movies would be dropped.
to_drop = rand.choice(n_movies, size=n_movies // 2, replace=False)
interactions_movies.drop(index=to_drop, inplace=True)
# Keep only the interactions of the remaining movies.
interactions_df = pd.merge(interactions_df, interactions_movies, on='movieId')
len(interactions_df.movieId.unique())

6471

In [None]:
# Include only overlapping movies

# Unique ids of the movies that have label features
features_movies = pd.DataFrame(item_features_df.movieId.unique()).rename(columns={0:'movieId'})

# pd.merge is called without `how`, i.e. with pandas' default inner join, so
# despite the "union" in its name this frame holds the movie ids present in
# BOTH the feature data and the interaction data.
features_union_interactions = pd.merge(features_movies, interactions_movies, on='movieId')

# Restrict all three frames to those shared movies; item_features_df
# additionally gets the columns of movies_df (the title) joined in.
interactions_df = pd.merge(interactions_df, features_union_interactions, on='movieId')
item_features_df = pd.merge(pd.merge(item_features_df, features_union_interactions, on='movieId'), movies_df, on='movieId')
conf_features_df = pd.merge(conf_features_df, features_union_interactions, on='movieId')

In [None]:
item_features_df

Unnamed: 0,movieId,data,title
0,89,"[[(n02883205, bow_tie, 0.09820818), (n02951585...",Nick of Time (1995)
1,92,[],Mary Reilly (1996)
2,94,"[[(n01930112, nematode, 0.045145374), (n042865...",Beautiful Girls (1996)
3,99,"[[(n04589890, window_screen, 0.12671871), (n04...",Heidi Fleiss: Hollywood Madam (1995)
4,100,"[[(n04266014, space_shuttle, 0.58021516), (n03...",City Hall (1996)
...,...,...,...
3522,64921,"[[(n04266014, space_shuttle, 0.14096697), (n03...",Arabian Nights (1942)
3523,64926,"[[(n03424325, gasmask, 0.12416885), (n03110669...","Battle of Russia, The (Why We Fight, 5) (1943)"
3524,64944,"[[(n04239074, sliding_door, 0.13018622), (n034...",Face of a Fugitive (1959)
3525,65025,"[[(n04023962, punching_bag, 0.06733158), (n026...",Double Dynamite (1951)


In [None]:
# Persist the filtered frames. The context managers close (and thereby flush)
# every file, even if pickling fails halfway.
with open('/content/drive/My Drive/Master/Data/1Minteractions_df.p', 'wb') as outfile:
  pickle.dump(interactions_df, outfile)
with open('/content/drive/My Drive/Master/Data/1Mitem_features_df.p', 'wb') as outfile:
  pickle.dump(item_features_df, outfile)
with open('/content/drive/My Drive/Master/Data/1Mconf_features_df.p', 'wb') as outfile:
  pickle.dump(conf_features_df, outfile)
with open('/content/drive/My Drive/Master/Data/1Msubs_df.p', 'wb') as outfile:
  pickle.dump(subs_df, outfile)

In [None]:
# Reload the intermediate frames written by the previous cell.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(f'/content/drive/My Drive/Master/Data/1Mitem_features_df.p', 'rb') as infile:
  item_features_df = pickle.load(infile)
with open(f'/content/drive/My Drive/Master/Data/1Minteractions_df.p', 'rb') as infile:
  interactions_df = pickle.load(infile)
with open(f'/content/drive/My Drive/Master/Data/1Mconf_features_df.p', 'rb') as infile:
  conf_features_df = pickle.load(infile)
with open(f'/content/drive/My Drive/Master/Data/1Msubs_df.p', 'rb') as infile:
  subs_df = pickle.load(infile)
# interactions_df.drop(columns='timestamp', inplace=True)

In [None]:
def create_interaction_matrix(df,user_col, item_col, rating_col, norm= False, threshold = None):
    '''
    @author: Aayush Agrawal
    @source: https://github.com/aayushmnit/cookbook/blob/master/recsys.py
    
    Function to create an interaction matrix dataframe from transactional type interactions
    Required Input -
        - df = Pandas DataFrame containing user-item interactions
        - user_col = column name containing user's identifier
        - item_col = column name containing item's identifier
        - rating col = column name containing user feedback on interaction with a given item
        - norm (optional) = True if a normalization of ratings is needed
        - threshold (required if norm = True) = value above which the rating is favorable
    Expected output - 
        - Pandas dataframe with user-item interactions ready to be fed in a recommendation algorithm
          (index = users, columns = items, 0 where a user has no rating for
          an item; ratings of the same user/item pair are summed).
          With norm = True the values are 1 where the summed rating is
          strictly above the threshold and 0 elsewhere.
    Raises -
        - TypeError if norm = True and no threshold is given
    '''
    if norm and threshold is None:
        raise TypeError('threshold is required when norm=True')
    interactions = (
        df.groupby([user_col, item_col])[rating_col]
        .sum()
        .unstack()
        .reset_index()
        .fillna(0)
        .set_index(user_col)
    )
    if norm:
        # Vectorised comparison instead of an element-wise applymap.
        interactions = (interactions > threshold).astype(int)
    return interactions


def create_user_dict(interactions):
    '''
    @author: Aayush Agrawal
    @source: https://github.com/aayushmnit/cookbook/blob/master/recsys.py

    Build a lookup from user id to row position in the interaction matrix.
    Required Input - 
        interactions - dataset created by create_interaction_matrix
                       (its index holds the user ids)
    Expected Output -
        user_dict - dict with the user id (index label) as key and its
                    0-based position in the index as value
    '''
    return {user: position
            for position, user in enumerate(list(interactions.index))}

def create_item_dict(df,id_col,name_col):
    '''
    Build a lookup from item id to item name.
    Required Input -
        - df = Pandas DataFrame with one row per item
        - id_col = column name containing the item's identifier
        - name_col = column name containing the item's name
    Expected Output -
        item_dict - dict with the item id as key and the item name as value;
                    when an id occurs several times the last row wins
    '''
    # Pair the two columns positionally. This does not depend on the frame's
    # index (label-based per-row lookups break when the index is not unique)
    # and avoids one lookup per row.
    return dict(zip(df[id_col].values, df[name_col].values))

In [None]:
# interactions_df_split = np.array_split(interactions_df, 100)


In [None]:
'''
interactions_temp = []
# Create interaction matrix as df, where only ratings 4+ are considered positive
for df in interactions_df_split:
  interactions_temp.append(create_interaction_matrix(df, 'userId', 'movieId', 'rating', norm=True, threshold=3.4))
'''

In [None]:
# Binary user x movie matrix: 1 where the (summed) rating is above 3.9, else 0.
interaction_matrix = create_interaction_matrix(interactions_df, 'userId', 'movieId', 'rating', norm=True, threshold=3.9)
# Same matrix with the rating values kept (norm defaults to False; 0 = no rating).
interaction_matrix_nonbinary = create_interaction_matrix(interactions_df, 'userId', 'movieId', 'rating')

In [None]:
# Share of non-zero cells in the binary interaction matrix, in percent.
# (Although it is labelled "Sparsity", the number is the fraction of cells
# that ARE filled.)
interactions_dense = csr_matrix(interaction_matrix).todense()
sparsity = (
    float(np.count_nonzero(interactions_dense))
    / (interactions_dense.shape[0] * interactions_dense.shape[1])
    * 100
)
print('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 0.86%


In [None]:
'''
interaction_matrix = None
for df in interactions_temp:
  if interaction_matrix is None:
    interaction_matrix = df
  else:
    interaction_matrix.append(df)
'''

# Create user dict (user id -> row position in interaction_matrix)
user_dict = create_user_dict(interaction_matrix)

# Create item dict (movieId -> title, taken from item_features_df)
item_dict = create_item_dict(item_features_df, 'movieId', 'title')

In [None]:
item_features_df

Unnamed: 0,movieId,data,title
0,89,"[[(n02883205, bow_tie, 0.09820818), (n02951585...",Nick of Time (1995)
1,92,[],Mary Reilly (1996)
2,94,"[[(n01930112, nematode, 0.045145374), (n042865...",Beautiful Girls (1996)
3,99,"[[(n04589890, window_screen, 0.12671871), (n04...",Heidi Fleiss: Hollywood Madam (1995)
4,100,"[[(n04266014, space_shuttle, 0.58021516), (n03...",City Hall (1996)
...,...,...,...
3522,64921,"[[(n04266014, space_shuttle, 0.14096697), (n03...",Arabian Nights (1942)
3523,64926,"[[(n03424325, gasmask, 0.12416885), (n03110669...","Battle of Russia, The (Why We Fight, 5) (1943)"
3524,64944,"[[(n04239074, sliding_door, 0.13018622), (n034...",Face of a Fugitive (1959)
3525,65025,"[[(n04023962, punching_bag, 0.06733158), (n026...",Double Dynamite (1951)


In [None]:
# Change the item feature dataframe so that every label gets its own column;
# the value in each column is the number of frames of the particular film
# whose first label is that label.

# itertuples() yields (index, movieId, data, title): movie[0] is the row
# index and movie[2] the per-frame label data.
for movie in item_features_df.itertuples():
  corpus = words_to_vocab(labels_to_list(movie[2])) 
  for key in corpus.keys():
    # Add to an existing, non-NaN cell; otherwise create/overwrite the cell
    # (assigning to a new key creates the column, NaN for all other rows).
    if key in item_features_df.columns and not np.isnan(item_features_df.at[movie[0], key]):
      item_features_df.at[movie[0], key] += corpus[key]
      add_to_vocab(key)
    else:
      item_features_df.at[movie[0], key] = corpus[key]
      add_to_vocab(key)
    # Both branches call add_to_vocab once, so full_vocab is incremented once
    # per (movie, label) pair, not once per frame.


# The raw frame data is no longer needed; reset_index() adds the old index
# as an 'index' column.
item_features_df.drop(columns='data', inplace=True)
item_features_df = item_features_df.reset_index()

In [None]:
# Convert item_features to csr matrix
item_features_df['movieId'] = item_features_df['movieId'].astype(int)
# Rows are ordered by movieId. NOTE(review): the frame already has an 'index'
# column from the reset_index() of the previous cell, so this second
# reset_index() presumably stores the old index as 'level_0' - the drop
# below relies on that; confirm when the cells are re-run or reordered.
item_features_df = item_features_df.sort_values('movieId').reset_index().drop(['index', 'title'], axis=1)
# Labels that never occur in a movie count as 0.
item_features_df.replace(np.nan, 0, inplace=True)
# Only the label columns go into the matrix (rows = movies in movieId order).
item_features_csr = csr_matrix(item_features_df.drop(['movieId', 'level_0'], axis=1).values)

In [None]:
# Normalize item features: scale every label column to the range [0, 1].

scaler = MinMaxScaler()
# toarray() returns a plain ndarray; todense() returns np.matrix, which
# recent scikit-learn versions refuse as input.
item_features_csr = csr_matrix(scaler.fit_transform(item_features_csr.toarray()))

In [None]:
'''
Remove users with less than 5 interactions???
'''

'''
users = []
interaction_matrix = interaction_matrix
for user, content in interaction_matrix.iterrows():
  if content.sum() < 5:
    interaction_matrix.drop(index=user, inplace=True)
    
user_dict = create_user_dict(interaction_matrix)

'''

'\nusers = []\ninteractions_matrix = interactions_matrix\nfor user, content in interactions_matrix.iterrows():\n  if content.sum() < 5:\n    interactions_matrix.drop(index=user, inplace=True)\n    \nuser_dict = create_user_dict(interactions_matrix)\n\n'

In [None]:
'''
Split method that enables specification of minimum user-item
interactions for the users that are moved to the test set
'''


def train_test_split(interactions, split_count, fraction=0.2):
  
  '''
  Move `split_count` interactions of selected users from train to test.

  source: https://www.ethanrosenthal.com/2016/10/19/implicit-mf-part-1/
  interactions : scipy.sparse matrix (CSR)
                Interactions between users and items.     
  split_count : int         
                Number of user-item-interactions per user to move 
                from training to test set.     
  fraction : float         
              Fraction of users to split off some of their 
              interactions into test set. Only users with at least
              2 * split_count interactions are eligible. If None (or 0),
              then all users are considered.

  Returns (train, test, user_index): two CSR matrices with the shape of
  `interactions` and the row indices of the users that were split.
  Raises ValueError when fewer eligible users exist than `fraction` asks for.
  '''
  
  train = interactions.copy().tocoo()
  test = lil_matrix(train.shape)
    
  if fraction:
    try:
      # Draw, without replacement, from the users that have at least
      # 2 * split_count interactions.
      user_index = np.random.choice(
          np.where(np.bincount(train.row) >= split_count * 2)[0], 
          replace=False,
          size=np.int32(np.floor(fraction * train.shape[0]))
      ).tolist()
    except ValueError:
      # np.random.choice raises ValueError when it cannot draw enough users.
      print(('Not enough users with > {} '
            'interactions for fraction of {}')\
            .format(2 * split_count, fraction))
      raise
  else:
    user_index = range(train.shape[0])
      
  train = train.tolil()

  for user in user_index:
    # Pick the interactions of this user that go to the test set.
    test_ratings = np.random.choice(interactions.getrow(user).indices, 
                                    size=split_count, 
                                    replace=False)
    train[user, test_ratings] = 0.
    # These are just 1.0 right now
    test[user, test_ratings] = interactions[user, test_ratings]
  
  
  # Test and training are truly disjoint
  assert(train.multiply(test).nnz == 0)
  return train.tocsr(), test.tocsr(), user_index



In [None]:
# Run if only users with >= 10 interactions should be moved to test set
'''
interaction_matrix_csr = csr_matrix(interaction_matrix)
train, test, user_index = train_test_split(interaction_matrix_csr, 5)
'''

'\ninteractions_matrix_csr = csr_matrix(interactions_matrix)\ntrain, test, user_index = train_test_split(interactions_matrix_csr, 5)\n'

In [None]:
# Run if num of interactions should not be considered when splitting
# A fixed seed makes the splits reproducible across kernel restarts.
# NOTE(review): the binary and the non-binary matrix are split independently,
# so their test sets are not guaranteed to contain the same user/movie pairs.
SPLIT_SEED = 42
train, test = random_train_test_split(csr_matrix(interaction_matrix), test_percentage=0.2,
                                      random_state=np.random.RandomState(SPLIT_SEED))
train_nonbinary, test_nonbinary = random_train_test_split(csr_matrix(interaction_matrix_nonbinary), test_percentage=0.2,
                                                          random_state=np.random.RandomState(SPLIT_SEED))

In [None]:
# Collect everything built so far in the dataset dict that gets pickled.
final_dataset.update({
    'item_features': item_features_csr,
    'interactions': csr_matrix(interaction_matrix),
    'train': train,
    'test': test,
    'full_vocab': full_vocab,
    'user_dict': user_dict,
    'item_dict': item_dict,
    'train_nonbinary': train_nonbinary,
    'test_nonbinary': test_nonbinary,
    # the long-format ratings frame (one row per user/movie rating)
    'interactions_df_nonbinary': interactions_df,
})

In [None]:
'''
Add confidence features and subtitle features to full vocab
'''

In [None]:
conf_features_df

Unnamed: 0,movieId,labels
0,89,"{'bow_tie': 0.1502389, 'spotlight': 0.11706595..."
1,92,{}
2,94,"{'nematode': 0.045145374, 'spotlight': 0.04144..."
3,99,"{'window_screen': 0.13067606, 'English_foxhoun..."
4,100,"{'space_shuttle': 0.32458422, 'bearskin': 0.61..."
...,...,...
3522,64921,"{'space_shuttle': 0.16505651, 'digital_clock':..."
3523,64926,"{'gasmask': 0.09172113, 'chain': 0.13236615, '..."
3524,64944,"{'sliding_door': 0.1034712, 'military_uniform'..."
3525,65025,"{'punching_bag': 0.06733158, 'cinema': 0.28765..."


In [None]:
# Change the confidence feature dataframe so that every label has its own
# column; the value in each column is the entry stored for that label in the
# movie's 'labels' dict (the mean confidence from create_confidence_features).

# itertuples() yields (index, movieId, labels): movie[0] is the row index and
# movie[2] the labels dict of the movie.
for movie in conf_features_df.itertuples():
  corpus = movie[2]
  for key in corpus.keys():
      conf_features_df.at[movie[0], key] = corpus[key]


# The dict column is no longer needed; reset_index() adds the old index as an
# 'index' column.
conf_features_df.drop(columns='labels', inplace=True)
conf_features_df = conf_features_df.reset_index()

In [None]:
# Convert the confidence features to a csr matrix
conf_features_df['movieId'] = conf_features_df['movieId'].astype(int)
# Rows are ordered by movieId. NOTE(review): 'level_0' presumably exists
# because the previous cell's reset_index() already created an 'index'
# column - confirm when the cells are re-run or reordered.
conf_features_df = conf_features_df.sort_values('movieId').reset_index().drop(['index'], axis=1)
# Labels that do not occur in a movie get confidence 0.
conf_features_df.replace(np.nan, 0, inplace=True)
conf_features_csr = csr_matrix(conf_features_df.drop(['movieId', 'level_0'], axis=1).values)

In [None]:
final_dataset['conf_features'] = conf_features_csr

In [None]:
subs_vocab = subs_df.drop(columns='movieId').sum(axis=0).to_dict()

In [None]:
# Convert the subtitle features to a csr matrix (rows ordered by movieId).
subs_df['movieId'] = subs_df['movieId'].astype(int)
# NOTE(review): dropping 'level_0' below only works if subs_df already had an
# 'index' column before this reset_index() - the schema of the pickled frame
# is not visible here; TODO confirm.
# NOTE(review): subs_df is not merged with the overlapping-movie frame in the
# cells shown here - verify that its rows match the other feature matrices.
subs_df = subs_df.sort_values('movieId').reset_index().drop(['index'], axis=1)
subs_df.replace(np.nan, 0, inplace=True)
subs_csr = csr_matrix(subs_df.drop(['movieId', 'level_0'], axis=1).values)

In [None]:
# Scale every subtitle feature column to the range [0, 1].
scaler = MinMaxScaler()
# toarray() returns a plain ndarray; todense() returns np.matrix, which
# recent scikit-learn versions refuse as input.
subs_csr = csr_matrix(scaler.fit_transform(subs_csr.toarray()))

In [None]:
final_dataset['subs_features'] = subs_csr
final_dataset['subs_vocab'] = subs_vocab

In [None]:
# Pandas profile report for interactions
'''
report = ProfileReport(interactions_df, title='Pandas profiling report', minimal=False)
report.to_file('/content/drive/My Drive/Master/profiler_interactions.html')
'''

"\nreport = ProfileReport(interactions_df, title='Pandas profiling report', minimal=False)\nreport.to_file('/content/drive/My Drive/Master/profiler_interactions.html')\n"

In [None]:
# Pandas profile report for item features
'''
item_features_norm_df = pd.DataFrame(item_features_csr.todense())
report = ProfileReport(item_features_norm_df, title='Pandas profiling report', minimal=True)
report.to_file('/content/drive/My Drive/Master/profiler_items.html')
'''

"\nitem_features_norm_df = pd.DataFrame(item_features_csr.todense())\nreport = ProfileReport(item_features_norm_df, title='Pandas profiling report', minimal=True)\nreport.to_file('/content/drive/My Drive/Master/profiler_items.html')\n"

In [None]:
# Persist the dataset; the context manager closes (and flushes) the file.
# NOTE(review): the file is named "10M" although the MovieLens 1M files are
# loaded above - confirm the intended file name.
with open('/content/drive/My Drive/Master/Data/10Mdatasets.p', 'wb') as outfile:
  pickle.dump(final_dataset, outfile)

In [None]:
'''

The next part will build a dataset for recommendations based on genre

'''

'\n\nThe next part will build a dataset for recommendations based on genre\n\n'

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/content/drive/My Drive/Master/Data/10Mdatasets.p', 'rb') as infile:
  final_dataset = pickle.load(infile)

movies_df = pd.read_csv(f'{data_dir}/10Mmovies.dat', 
                        sep="::", 
                        usecols = [0, 1, 2], 
                        names = ['movieId', 'title', 'genres'], 
                        engine = 'python')

# Keep only the movies that occur in the interaction data.
movies_df = movies_df.merge(pd.DataFrame(data=interactions_df.
                                         movieId.unique()).
                            rename(columns={0:'movieId'}), on='movieId')
# "Action|Thriller" -> ['Action', 'Thriller'] (str.split already returns a list)
movies_df.genres = movies_df.genres.apply(lambda x: x.split('|'))
movies_df

Unnamed: 0,movieId,title,genres
0,89,Nick of Time (1995),"[Action, Thriller]"
1,92,Mary Reilly (1996),"[Drama, Horror, Thriller]"
2,94,Beautiful Girls (1996),"[Comedy, Drama, Romance]"
3,99,Heidi Fleiss: Hollywood Madam (1995),[Documentary]
4,100,City Hall (1996),"[Drama, Thriller]"
...,...,...,...
3522,64921,Arabian Nights (1942),"[Action, Adventure]"
3523,64926,"Battle of Russia, The (Why We Fight, 5) (1943)","[Documentary, War]"
3524,64944,Face of a Fugitive (1959),[Western]
3525,65025,Double Dynamite (1951),"[Comedy, Musical]"


In [None]:
# genre -> number of movies carrying that genre
all_genres = {}

# One column per genre: 1 where the movie has the genre (new columns are
# created on first use), 0 everywhere else after the NaN replacement.
for movie in movies_df.itertuples():
  for genre in movie.genres:
    movies_df.at[movie.Index, genre] = 1
    all_genres[genre] = all_genres.get(genre, 0) + 1
movies_df.drop(columns=['title', 'genres'], inplace=True)
movies_df.replace(np.nan, 0, inplace=True)
movies_df

Unnamed: 0,movieId,Action,Thriller,Drama,Horror,Comedy,Romance,Documentary,Adventure,Crime,Children,Musical,Mystery,War,Film-Noir,Fantasy,Sci-Fi,Western,Animation,IMAX
0,89,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,92,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,94,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,99,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3522,64921,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3523,64926,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3524,64944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3525,65025,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
movie_genres_csr = csr_matrix(movies_df.drop('movieId', axis=1).values)

In [None]:
final_dataset['genre_features'] = movie_genres_csr
final_dataset['genre_dict'] = all_genres

In [None]:
# Persist the dataset; the context manager closes (and flushes) the file.
with open('/content/drive/My Drive/Master/Data/10Mdatasets.p', 'wb') as outfile:
  pickle.dump(final_dataset, outfile)

In [None]:
'''

The next part will build a dataset for recommendations based on tags

'''

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/content/drive/My Drive/Master/Data/1Mdatasets.p', 'rb') as infile:
  final_dataset = pickle.load(infile)

# "::" is a multi-character separator, which needs the python parsing engine;
# naming it explicitly avoids the parser fallback warning.
tags_df = pd.read_csv(f'{data_dir}/10Mtags.dat', sep='::', usecols=[1,2],
                      names=['movieId', 'tag'], engine='python')
tags_df.head()

  """


<bound method NDFrame.head of        movieId              tag
0         4973       excellent!
1         1747         politics
2         1747           satire
3         2424  chick flick 212
4         2424            hanks
...        ...              ...
95575     1377           Gothic
95576     2424      chick flick
95577     3033           comedy
95578     3081           Gothic
95579     7438          Western

[95580 rows x 2 columns]>

In [None]:
final_dataset['item_dict']

In [None]:
# Keep only the tags of movies that are part of the dataset.
tags_df = tags_df.merge(pd.DataFrame(data=final_dataset['item_dict'].keys()).
                            rename(columns={0:'movieId'}).astype(int), on='movieId')

# Case-insensitive tags
tags_df['tag'] = tags_df['tag'].astype(str).str.lower()

tags_df.head()


<bound method NDFrame.head of        movieId                                tag
0         3033                              spoof
1         3033                          star wars
2         3033                             comedy
3         3033                         john candy
4         3033                       rick moranis
...        ...                                ...
20758     2836  main character doesn't masturbate
20759     1518                                fdf
20760      694                       payback time
20761     1404                     must see again
20762     2815                          nostalgia

[20763 rows x 2 columns]>

In [None]:
print(len(tags_df['movieId'].unique()))
print(len(final_dataset['item_dict'].keys()))

1514
1514


In [None]:

# Every movie of the dataset needs at least one row in tags_df; movies
# without any tag get a single row with the empty tag ''.
tagged_ids = set(tags_df['movieId'].values)
missing_ids = list(dict.fromkeys(
    int(movieId) for movieId in final_dataset['item_dict'].keys()
    if int(movieId) not in tagged_ids))
if missing_ids:
  # One concat instead of one DataFrame.append per movie (append copies the
  # whole frame every time and no longer exists in pandas 2).
  tags_df = pd.concat(
      [tags_df,
       pd.DataFrame([[movie_id, ''] for movie_id in missing_ids],
                    columns=tags_df.columns)],
      ignore_index=True)


In [None]:
# One-hot encode the tags: one row per (movie, tag) pair and one 'tag_<text>'
# column per distinct tag ('tag_' is the empty tag of untagged movies).
tags_df['tag'] = tags_df['tag'].astype('category')
trans_tags = pd.get_dummies(tags_df)
trans_tags

Unnamed: 0,movieId,tag_,"tag_""damn dirty apes""","tag_""it's not an american story","tag_""oh yah""","tag_""rosebud""","tag_""show me the money.""","tag_""underaged sex""=child porn",tag_'and just what do you think you're doing mr. smeee?',tag_'carrie i don't wanna rain on your parade but that's not fruit punch',tag_'hey dan do you mind having rabbit stew for a pick-me-up?',tag_'i can't get that theme song out of my head',tag_'i guess repression does have it's moments after all',tag_'oh spock! oh jim! oh bones!' wtf!? et tu herr doktor? heil!,tag_'oh the whore-ror!',tag_'sexy',tag_'stop banging the doorbell i'm landscaping my kitchen sink!',tag_(s)vcd,tag_*good* musicals,tag_*reps*,tag_007,tag_007 (series),tag_007 rocks,tag_05.03.06,tag_05.04.06,tag_06.04.06,tag_09.05.06,tag_1,tag_1-4-2007,tag_1.5,tag_10,tag_12-1-2007,tag_123,tag_12th century,tag_15.02.06,tag_15th century,tag_16mm,tag_17th century,tag_1800s,tag_1890s,...,tag_wwii,tag_wwii comedy-drama,tag_x-mas movie,tag_xenomorph,tag_xenophobia,tag_xenophobic?,tag_xmas,tag_xmas theme,tag_xmas theme(?),tag_yahoo top pick,tag_yakuza,tag_yeah,tag_yellow,tag_you complete me,tag_you mean it's over inflated!,tag_you might laugh your head out.,tag_you only got 2 bullets. i have a sword. you will miss. i won't.,tag_you're gonna die clown,tag_you've seen them all.,tag_young age classic,tag_young anakin,tag_young kids,tag_young women's favorate,tag_your out of ur element donnie!!!!,tag_ytutyki,tag_yul brynner,tag_yuppies,tag_zaz,tag_zibri studio,tag_ziegfeld follies,tag_zim,tag_zither,tag_zombie,tag_zombies,tag_zoo,tag_zooey deschanel,tag_.1,tag_ \t\t\t\t\tcharacter based on real person:saddam hussein,tag_â¡olivier martinez!,tag_ã‰rase una vez en amã©rica
0,3033,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,3033,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3033,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3033,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,3033,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21038,3920,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21039,3934,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21040,3940,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21041,3943,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Collapse to one row per movie: sum the one-hot tag columns of all rows of a
# movie, then remove the column of the empty placeholder tag.
trans_tags = (
    trans_tags
    .set_index('movieId')
    .groupby(level='movieId')
    .sum()
    .drop(columns='tag_')
)


In [None]:
#tags_df = tags_df.set_index(tags_df['tag'])
#tags_df.drop(columns='movieId', inplace=True)
# Share of non-zero cells in the movie x tag matrix, in percent.
# (Although it is labelled "Sparsity", the number is the fraction of cells
# that ARE filled.)
matrix = csr_matrix(trans_tags)
dense_matrix = matrix.todense()
sparsity = (
    float(np.count_nonzero(dense_matrix))
    / (dense_matrix.shape[0] * dense_matrix.shape[1])
    * 100
)
print('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 0.04%


In [None]:
# Turn the movieId index back into a column and keep only the tag columns
# for the sparse movie x tag matrix.
trans_tags = trans_tags.reset_index()
movie_tags_csr = csr_matrix(trans_tags.drop(columns=['movieId']).values)
final_dataset['tags'] = movie_tags_csr
# The context manager closes (and flushes) the file.
with open('/content/drive/My Drive/Master/Data/1Mdatasets.p', 'wb') as outfile:
  pickle.dump(final_dataset, outfile)

In [None]:
'''

Create interaction matrix that includes negative values
for negative ratings

'''

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/content/drive/My Drive/Master/Data/10Mdatasets.p', 'rb') as infile:
  final_dataset = pickle.load(infile)
# Long-format ratings frame stored earlier in the dataset dict.
interactions_df = final_dataset['interactions_df_nonbinary']

In [None]:
def create_interaction_matrix_neg(df,user_col, item_col, rating_col, norm= False, threshold = None):
  '''
  @author: Aayush Agrawal
  @source: https://github.com/aayushmnit/cookbook/blob/master/recsys.py

  Function to create an interaction matrix dataframe from transactional type interactions,
  with negative values for unfavorable ratings.
  Required Input -
      - df = Pandas DataFrame containing user-item interactions
      - user_col = column name containing user's identifier
      - item_col = column name containing item's identifier
      - rating col = column name containing user feedback on interaction with a given item
      - norm (optional) = True if a normalization of ratings is needed
      - threshold (required if norm = True) = value above which the rating is favorable
  Expected output - 
      - Pandas dataframe with user-item interactions ready to be fed in a recommendation algorithm
        (index = users, columns = items, 0 where a user has no rating).
        With norm = True the values are
           1  for a rating strictly above the threshold,
          -1  for a positive rating up to and including the threshold,
           0  otherwise.
  Raises -
      - TypeError if norm = True and no threshold is given
  '''
  if norm and threshold is None:
    raise TypeError('threshold is required when norm=True')
  interactions = df.groupby([user_col, item_col])[rating_col] \
          .sum().unstack().reset_index(). \
          fillna(0).set_index(user_col)
  if norm:
    # Vectorised instead of an element-wise applymap (which is very slow on
    # a large matrix).
    favorable = interactions > threshold
    # A rating equal to the threshold is an existing but not favorable
    # rating, so it has to be -1 rather than fall through to 0.
    unfavorable = (interactions > 0) & ~favorable
    interactions = favorable.astype(int) - unfavorable.astype(int)
  return interactions

In [None]:
interactions_neg_matrix = create_interaction_matrix_neg(interactions_df, 'userId', 'movieId', 'rating', norm=True, threshold=3.9)

KeyboardInterrupt: ignored

In [None]:
# A fixed seed makes the split reproducible across kernel restarts.
train_neg, test_neg = random_train_test_split(csr_matrix(interactions_neg_matrix), test_percentage=0.2,
                                              random_state=np.random.RandomState(42))

In [None]:
# NOTE(review): 'interactions_neg' stores the pandas DataFrame returned by
# create_interaction_matrix_neg, whereas 'interactions' holds a csr matrix -
# confirm that downstream code expects the DataFrame here.
final_dataset['interactions_neg'] = interactions_neg_matrix
final_dataset['train_neg'] = train_neg
final_dataset['test_neg'] = test_neg

In [None]:
# Persist the dataset; the context manager closes (and flushes) the file.
with open('/content/drive/My Drive/Master/Data/10Mdatasets.p', 'wb') as outfile:
  pickle.dump(final_dataset, outfile)