<a href="https://colab.research.google.com/github/Ang3lino/recomenderSys/blob/master/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocess
En este notebook tomamos un conjunto de datos de valoraciones de de usuarios sobre peliculas, como es costumbre no todos los usuarios han valorado todas las peliculas. Los indices de las peliculas no son contiguos.

La complejidad computacional con los filtros colaborativos es $O(m\times n^2)$, por lo que el preprocesamiento de datos creara un subconjunto usando los usuarios y los articulos mas usados y se crearan objetos con el modulo pickle pues se interesa en saber los articulos que compro el usuario $i$, los usuarios que compraron el articulo $a$ y que valoracion le dio el usuario $u$ al articulo $a$, expresado como $r_{u,a}$ reduciendo el espacio usado por la matriz y reduciendo la complejidad a $O(\Omega)$ donde $\Omega$ es unicamente el conjunto de usuarios que ha valorado algo.

El dataset puede ser descargado de: 
https://www.kaggle.com/grouplens/movielens-20m-dataset/download .

In [0]:

import numpy as np
import pandas as pd

import os
import pickle

# Counter: Coleccion capaz de contar el numero de ocurrencias de un objeto sobre otra coleccion
from collections import Counter, defaultdict
from tqdm import tqdm  # modulo cuya finalidad es dar un feedback del progreso de algun procedimiento

In [12]:
!pip install tqdm --upgrade

Requirement already up-to-date: tqdm in /usr/local/lib/python3.6/dist-packages (4.43.0)


In [0]:
def import_dataset(fdir: str, fname: str, colab=True) -> pd.DataFrame:
  # import the dataset from the cloud
  if colab:
    from google.colab import drive  
    drive.mount('/content/drive')
  fabspath = os.path.join(fdir, fname)
  return pd.read_csv(fabspath)

In [0]:
tqdm.pandas()  # con esto podremos mostrar el progreso del metodo pandas.apply como pandas.progress_apply


In [17]:

fdir = "drive/My Drive/petroleo/movielens-20m-dataset"
fname = 'rating.csv'
df = import_dataset(fdir, fname)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [0]:
def sequential_indexes(ints: iter):
    '''Dado algun iterable de enteros se retorna un diccionario donde se les asigna id donde estos son secuenciales.
    [2,2,2,2,3,3,5,7,11] -> [0, 1, 2, 3, 4]'''
    uniq = set(ints)
    mapper = {}
    count = 0
    for n in uniq:
        if not n in mapper:
            mapper[n] = count 
            count += 1
    return mapper 

def check_dataframe_indexes(df):
    '''Funcion de ayuda para determinar si la funcion sequential_indexes funciona adecuadamente. '''
    print(min(df.userId), max(df.userId), len(set(df.userId)))
    print(min(df.movieId), max(df.movieId), len(set(df.movieId)))  # movieId is not sequential

In [0]:
def update_indexes(df, col_name) -> None:
    '''Usando la funcion sequential_indexes actualizamos los indices del dataframe. Siento que no es necesario este paso.'''
    mapper = sequential_indexes(df.loc[:, col_name])
    print(f'Updating indexes for column {col_name}.')
    # rows_count = df.shape[0]
    # for i in tqdm(range(rows_count)):  # forma alternativa, mas lenta
    #     df.at[i, col_name] = mapper[df.at[i, col_name]]
    df.loc[:, col_name] = df.progress_apply(lambda row: mapper[row[col_name]], axis=1)  # para cada iteracion se trata a la fila, especificando axis=1
    print(f'Indexes for {col_name} have been updated succesfully.')

def shrink_df(df: pd.DataFrame, user_name: str, article_name: str, user_count: int, article_count: int) -> pd.DataFrame:
    '''Funcion que retorna un subconjunto donde cada tupla es tal que tanto el usuario como el articulo es el mas comun.'''
    user_id_counts = Counter(df[user_name])
    article_id_counts = Counter(df[article_name])
    common_users = [x for x, count in user_id_counts.most_common(user_count)]
    common_articles = [x for x, count in article_id_counts.most_common(article_count)]
    intersection = df[user_name].isin(common_users) & df[article_name].isin(common_articles)
    small_df = df[intersection].copy()
    return small_df


In [0]:
def get_small_df(df: pd.DataFrame, user_name: str, article_name: str, user_count:int, article_count:int) -> pd.DataFrame:
    small_df_name = f'small_ratings_{user_count}_{article_count}.csv'
    fabspath = os.path.join(fdir, small_df_name)
    if os.path.exists(fabspath):
        small_df = pd.read_csv(fabspath)
    else:
        # particularmente en este dataset, solo se resta una unidad para hacer los indices de 0...n-1
        df[user_name] = df[user_name] - 1  
        update_indexes(df, article_name)

        # let's remove timestamp since we don't take into consideration for the algorithm
        df = df.drop(columns=['timestamp'])  
        small_df = shrink_df(df, user_name, article_name, user_count, article_count)

        # despues de actualizar el dataframe se habra que corregir los indices
        update_indexes(small_df, user_name)
        update_indexes(small_df, article_name)
        small_df.to_csv(os.path.join(fdir, small_df_name))
    return small_df

In [0]:

user_name, article_name = 'userId', 'movieId'
user_count, article_count = 4096, 512
small_df = get_small_df(df, user_name, article_name)

In [0]:
def save_dict(fname: str, default_dictionary: defaultdict) -> None:
    # fdir, user_count, article_count
    fname = f'{fname}_{user_count}_{article_count}.json'
    with open(os.path.join(fdir, fname), 'wb') as f:
        pickle.dump(default_dictionary, f)

def build_mappers(df: pd.DataFrame, user_label: str, article_label: str, rating_label: str) -> None:
    user2art, art2user = defaultdict(list), defaultdict(list)
    user_art2rat = dict()
    for i, row in tqdm(df.iterrows(), total=df.shape[0]):
        u, a = row[user_label], row[article_label]
        user2art[u].append(a)
        art2user[a].append(u)
        user_art2rat[(u, a)] = row[rating_label] 
    save_dict('user2article', user2art)
    save_dict('article2user', art2user)
    save_dict('user_article2rating', user_art2rat)
    print('Elementos salvados satisfactoriamente.')



In [29]:
build_mappers(small_df, 'userId', 'movieId', 'rating')



100%|██████████| 1266269/1266269 [02:22<00:00, 8896.40it/s]


Elementos salvados satisfactoriamente.
