In [1]:
import sys
import os
import pandas as pd
import numpy as np
import re
import itertools
import tqdm
from rs_datasets import MovieLens

In [2]:
import pandas as pd
import numpy as np
import re
import itertools
import tqdm

import seaborn as sns
import tqdm
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from nltk.tokenize import TreebankWordTokenizer, WhitespaceTokenizer

import nltk
nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt')
words = set(nltk.corpus.words.words())
words = set([w.lower() for w in words])

from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download("wordnet")

from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))


from nltk.tokenize import sent_tokenize

import gensim
from gensim.downloader import load
from gensim.models import Word2Vec
w2v_model = gensim.downloader.load('word2vec-google-news-300')

from typing import Dict, List, Optional, Tuple


def title_prep(title: str) -> str:
    title = re.sub(r'\s+', r' ', title)
    title = re.sub(r'($\s+|^\s+)', '', title)
    title = title.lower()
    
    return title

def extract_year(title: str) -> Optional[str]:
    one_year = re.findall(r'\(\d{4}\)', title)
    two_years = re.findall(r'\(\d{4}-\d{4}\)', title)
    one_year_till_today = re.findall(r'\(\d{4}[-–]\s?\)', title)
    if len(one_year) == 1:
        return int(one_year[0][1:-1])
    
    elif len(two_years) == 1:
        return round((int(two_years[0][1:5]) + int(two_years[0][6:-1]))/2)
    
    elif len(one_year_till_today) == 1:
        return int(one_year_till_today[0][1:5])
    else:
        return np.nan
    
def genres_processing(movies: pd.DataFrame) -> pd.DataFrame:   
    genre_lists = [set(item.split('|')).difference(set(['(no genres listed)'])) for item in movies['genres']]
    genre_lists = pd.DataFrame(genre_lists)
    
    genre_dict = {token: idx for idx, token in enumerate(set(itertools.chain.from_iterable([item.split('|') 
                for item in movies['genres']])).difference(set(['(no genres listed)'])))}
    genre_dict = pd.DataFrame(genre_dict.items())
    genre_dict.columns = ['genre', 'index']
    
    dummy = np.zeros([len(movies), len(genre_dict)])
    
    for i in range(dummy.shape[0]):
        for j in range(dummy.shape[1]):
            if genre_dict['genre'][j] in list(genre_lists.iloc[i, :]):
                dummy[i, j] = 1
    
    df_dummy = pd.DataFrame(dummy, columns = ['genre' + str(i) for i in range(dummy.shape[1])])
    
    movies_return = pd.concat([movies, df_dummy], 1)
    return movies_return

def fill_null_years(movies: pd.DataFrame) -> pd.DataFrame:
    
    df_movies = movies.copy()
    genres_columns = [item for item in movies.columns.tolist() if item[:5]=='genre' and item !='genres']
    df_no_year = movies[movies.year.isna()][['item_id', *genres_columns]]

    years_mean = {}
    for i in df_no_year.index:
    
        row = np.asarray(df_no_year.loc[i, :][genres_columns])
        years = []
        for j in np.asarray(movies[['year', *genres_columns]]):
            if np.sum(row == j[1:]) == len(genres_columns):
                try:
                    years.append(int(j[0]))
                except:
                    pass
            
        years_mean[i] = round(np.mean(years))
    
    for i in years_mean:
        df_movies.loc[i, 'year'] = years_mean[i]
    df_movies.year=df_movies.year.astype('int')
    return df_movies

def clean_text(text: str) -> str:
    text = re.sub("[^a-zA-Z]", " ",text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+$", "", text)
    text = re.sub(r"^\s+", "", text)
    text = text.lower()

    return text


def procces_text(text):
    lemmatizer = WordNetLemmatizer() 

    text = [word for word in nltk.word_tokenize(text) if not word in stop_words]
    text = [lemmatizer.lemmatize(token) for token in text]
    text = [word for word in text if word in words]

    text = " ".join(text)
    
    return text

def string_embedding(string: str) -> np.ndarray:
    arr = string.split(' ')
    vec = 0
    cnt = 0
    for i in arr:
        try:
            vec += w2v_model[i]
            cnt += 1
        except:
            pass
    if cnt == 0:
        vec = np.zeros((300, 1))
    else:
        vec /= cnt
    return vec


def data_processing(df_movie: pd.DataFrame, 
                    df_rating: pd.DataFrame, 
                    df_tags: pd.DataFrame
) -> List[pd.DataFrame]:
    
    print("------------------------ Movie processing ------------------------")
    #Extraction of the movies' years and transform genres lists to genres vector
    df_movies_procc = df_movie.copy()
    df_movies_procc.title = df_movies_procc.title.apply(title_prep) #title processing
    df_movies_procc['year'] = df_movies_procc.title.apply(extract_year) #year processing
    df_movies_procc = genres_processing(df_movies_procc) #genres processing
    df_movies_procc = fill_null_years(df_movies_procc) #fillimg null year values
    
    #Creating rating_avg column 
    print("------------------------ Rating processing ------------------------")
    df_movies_procc = pd.merge(df_movies_procc, df_rating.groupby('item_id', as_index=False).rating.mean(), on='item_id', how='left')
    df_movies_procc.rating = df_movies_procc.rating.fillna(0.0)
    df_movies_procc = df_movies_procc.rename(columns={'rating' : 'rating_avg'})
    df_movies_clean = df_movies_procc.drop(['title', 'genres'], axis=1)[['item_id', 'year', 'rating_avg', *['genre' + str(i) for i in range(19)]]]
    
    print("------------------------ Tags processing ------------------------")
    df_tags_ = df_tags.drop(df_tags[df_tags.tag.isna()].index)
    df_movie_tags = df_tags_.sort_values(by=['item_id', 'timestamp'])[['item_id', 'tag', 'timestamp']]    
    df_movie_tags['clean_tag'] = df_movie_tags.tag.apply(lambda x : procces_text(clean_text(x)))
    df_movie_tags = df_movie_tags[df_movie_tags.clean_tag.str.len()!=0]
    
    print("------------------------ Tags embedding ------------------------")
    #tags text gathering
    docs_movie_tags = df_movie_tags.sort_values(["item_id", "timestamp"]).groupby("item_id", as_index=False).agg({"clean_tag":lambda x: " ".join(x)})
    df_movies_tags = pd.concat([docs_movie_tags.item_id, pd.DataFrame(docs_movie_tags.clean_tag.apply(string_embedding).to_list(), columns = ['w2v_' + str(i) for i in range(300)])], axis = 1)
    df_movies_clean = pd.merge(df_movies_clean, df_movies_tags, on = "item_id", how = "left").fillna(0.0)
    
    print("------------------------ Users processing ------------------------")
    #users procc
    df_users = df_rating.copy()
    df_users = df_users.groupby(by=['user_id'], as_index=False).rating.mean().rename(columns = {'rating' : 'rating_avg'})
    df_users_genres = pd.merge(df_movies_clean[['item_id', *df_movies_clean.columns[3:22]]], pd.merge(df_rating, df_users, on = 'user_id')[['user_id', 'item_id']],
        on = 'item_id')

    df_users_genres = df_users_genres.groupby(by = ['user_id'], as_index = False)[df_movies_clean.columns[3:22]].mean()
    df_users_genres = pd.merge(df_users_genres, df_users, on = 'user_id')
    df_pairs = pd.merge(df_rating, df_users, on = 'user_id')[['user_id', 'item_id']]
    
    print("------------------------ Users embedding ------------------------")
    users_id = []
    vect_space = []
    for Id in tqdm.tqdm(df_pairs.user_id.unique()):
        movie_list = df_pairs[df_pairs.user_id == Id].item_id.tolist()
        vect = np.asarray(df_movies_clean[df_movies_clean.item_id.isin(movie_list)][[*df_movies_clean.columns[22:]]].mean().tolist())
        users_id.append(Id)
        vect_space.append(vect)
        
    df_users_w2v = pd.DataFrame(vect_space, columns = ['w2v_' + str(i) for i in range(len(df_movies_clean.columns[22:]))])
    df_users_w2v['user_id'] = users_id
    df_users_clean = pd.merge(df_users_genres, df_users_w2v, on = 'user_id')
    df_rating_clean = df_rating[['user_id', 'item_id', 'rating', 'timestamp']]

    df_movies_clean = df_movies_clean.rename(columns={'item_id': 'item_idx'})
    df_users_clean = df_users_clean.rename(columns={'user_id': 'user_idx'})
    df_rating_clean = df_rating_clean.rename(columns={'item_id': 'item_idx', 'user_id': 'user_idx', 'rating': 'relevance'})
    
    return [df_movies_clean, df_users_clean, df_rating_clean]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/syudosaev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/syudosaev/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /home/syudosaev/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/syudosaev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
data = MovieLens("20m", read_genome=True)
df_movie = data.items
df_rating = data.ratings
df_tags = data.tags

In [4]:
cat_dict_movies = pd.Series(df_movie.item_id.astype("category").cat.codes.values, index=df_movie.item_id).to_dict()
cat_dict_users = pd.Series(df_rating.user_id.astype("category").cat.codes.values, index=df_rating.user_id).to_dict()

In [5]:
df_movie.item_id = df_movie.item_id.apply(lambda x: cat_dict_movies[x])

df_rating.item_id = df_rating.item_id.apply(lambda x: cat_dict_movies[x])
df_rating.user_id = df_rating.user_id.apply(lambda x: cat_dict_users[x])

df_tags.item_id = df_tags.item_id.apply(lambda x: cat_dict_movies[x])
df_tags.user_id = df_tags.user_id.apply(lambda x: cat_dict_users[x])

In [6]:
QUANTILES = [0.5, 0.75]
df_rating = df_rating.sort_values(by='timestamp').reset_index(drop=True)
quantiles_values = [df_rating.timestamp.quantile(i) for i in QUANTILES]
print(quantiles_values)

df_rating_train = df_rating[df_rating.timestamp <= quantiles_values[0]]
print(f"DataFrame size: {df_rating_train.shape}")

df_rating_val = df_rating[(df_rating.timestamp <= quantiles_values[1])]
print(f"DataFrame size: {df_rating_val.shape}")

df_rating_test = df_rating.copy()
print(f"DataFrame size: {df_rating_test.shape}")

[1103555886.0, 1225642317.5]
DataFrame size: (10000132, 4)
DataFrame size: (15000197, 4)
DataFrame size: (20000263, 4)


In [7]:
df_items_train, df_users_train, df_rating_train = data_processing(df_movie, df_rating_train, df_tags)

------------------------ Movie processing ------------------------


  movies_return = pd.concat([movies, df_dummy], 1)


------------------------ Rating processing ------------------------
------------------------ Tags processing ------------------------
------------------------ Tags embedding ------------------------
------------------------ Users processing ------------------------
------------------------ Users embedding ------------------------


100%|██████████| 80650/80650 [57:57<00:00, 23.19it/s]  


In [9]:
df_items_train.to_csv("train/items.csv", index=False)
df_users_train.to_csv("train/users.csv", index=False)
df_rating_train.to_csv("train/rating.csv", index=False)

In [10]:
df_items_val, df_users_val, df_rating_val = data_processing(df_movie, df_rating_val, df_tags)
df_rating_val = df_rating_val[df_rating_val.timestamp > quantiles_values[0]]
df_items_val.to_csv("val/items.csv", index=False)
df_users_val.to_csv("val/users.csv", index=False)
df_rating_val.to_csv("val/rating.csv", index=False)

------------------------ Movie processing ------------------------


  movies_return = pd.concat([movies, df_dummy], 1)


------------------------ Rating processing ------------------------
------------------------ Tags processing ------------------------
------------------------ Tags embedding ------------------------
------------------------ Users processing ------------------------
------------------------ Users embedding ------------------------


100%|██████████| 106573/106573 [1:27:41<00:00, 20.26it/s] 


In [14]:
df_items_test, df_users_test, df_rating_test = data_processing(df_movie, df_rating_test, df_tags)

------------------------ Movie processing ------------------------


  movies_return = pd.concat([movies, df_dummy], 1)


------------------------ Rating processing ------------------------
------------------------ Tags processing ------------------------
------------------------ Tags embedding ------------------------
------------------------ Users processing ------------------------
------------------------ Users embedding ------------------------


100%|██████████| 138493/138493 [2:06:56<00:00, 18.18it/s]  


In [None]:
df_rating_test = df_rating_test[df_rating_test.timestamp > quantiles_values[1]]
df_items_test.to_csv("test/items.csv", index=False)
df_users_test.to_csv("test/users.csv", index=False)
df_rating_test.to_csv("test/rating.csv", index=False)