In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from IPython.display import display

In [None]:
def _to_datetime(data, column_name="updated_at"):
    """
    Converts column to datetime format
    
    Args:
        data (pandas.dataframe) : data on which said action needs to be performed.
        column_name (str) : column which needs to be converted to datetime
        
    Returns:
            data (pandas.dataframe) : updated data with datetime column.
    """
    data[column_name] = pd.to_datetime(data[column_name])

    return data

In [None]:
def _sort_values(data, column_name="updated_at"):

    """
    Sort values according to a specific column
    
    Args:
        data (pandas.dataframe) : data on which said action needs to be performed.
        column_name (str) : column on which the dataframe needs to be sorted.
        
    Returns:
            data (pandas.dataframe) : updated data with sorted rows.
    """
    data = data.sort_values(column_name).reset_index(drop=True)

    return data

In [None]:
def _user_cleanup(data, user_column_name="user_id", threshold=10):

    """
    Clean-up the data by dropping all users that have not read books more than the threshold
    
    Args:
        data (pandas.dataframe) : data on which said action needs to be performed.
        user_column_name (str) : column which signifies the user_id.
        threshold (int) : number of books the user should have read.
        
    Returns:
            data (pandas.dataframe) : updated data without the unnecessary rows.
    """

    x = data[user_column_name].value_counts() > threshold
    y = x[x].index  # user_ids
    data = data[data["user_id"].isin(y)]

    return data

In [None]:
def _final_merge(
    user_data,
    metadata,
    metadata_columns=["pratilipi_id", "category_name"],
    merge_column="pratilipi_id",
):

    """
    Merging the userdata and metadata on a specified merge_column.
    
    Args:
        user_data (pandas.dataframe) : user interaction data that has been cleaned up, sorted.
        metadata (pandas.dataframe) : metadata without the column 'updated_at'
        metadata_columns (list) : the columns you want to use from the metadata table during the final merge.
        merge_column (str) : column on which you want to merge the two dataframes.
        
    Returns:
            complete_data (pandas.dataframe) : final compiled data after merging user_data and metadata.
    """

    complete_data = user_data.merge(metadata[metadata_columns], on=merge_column)

    return complete_data

In [None]:
def _drop_duplicates(data, columns=["user_id", "pratilipi_id"], keep="last"):

    """
    Helper function to help drop columns or a list of columns from a dataframe
    
    Args:
        data (pandas.dataframe) : data on which said action needs to be performed.
        columns (list) : all columns we want to drop from the dataframe
        keep (str) : specifies whether you want to keep the first of last instance of the duplicate value.
        
    Returns:
            data (pandas.dataframe) : updated data with the columns we want. 
    """
    data = data.drop_duplicates(columns, keep=keep).reset_index(drop=True)

    return data

In [None]:
def _label_encode(data, value="category_id", encode_column="category_name"):
    """
    Add an integer-encoded version of a categorical column.

    A new scikit-learn LabelEncoder is fitted on `encode_column` and its
    output is written to column `value` of the dataframe that was passed in.

    Args:
        data (pandas.dataframe) : dataframe holding the categorical column; modified in place.
        value (str) : name of the column that receives the encoded labels.
        encode_column (str) : name of the column to encode.

    Returns:
            data (pandas.dataframe) : the same dataframe object with the encoded column added.
    """
    encoder = preprocessing.LabelEncoder()
    encoded_labels = encoder.fit_transform(data[encode_column])
    data[value] = encoded_labels
    return data

In [None]:
def _prune_data(
    complete_data,
    index="pratilipi_id",
    column="user_id",
    value="category_id",
    threshold=10,
):

    """
    Helper function to prune the complete data where only categories which entries more than the threshold are to be kept.
    
    Args:
        complete_data (pandas.dataframe) : data on which said action needs to be performed.
        index (str) : index for the pivot table.
        column (str) : columns we want for the pivot table.
        value (str) : values to be used in the pivot table.
        threshold (int) : minimum number of categories.
        
    Returns:
            final_val (pandas.dataframe) : Final dataframe with all the values we want.
    """

    count_val = complete_data.groupby(index)[value].count().reset_index()

    count_val.rename(columns={value: "count_{}".format(value)}, inplace=True)

    final_val = complete_data.merge(count_val, on=index)

    final_val = final_val[final_val["count_{}".format(value)] >= threshold]
    final_val.drop_duplicates([column, index], inplace=True)

    return final_val

In [None]:
def _get_pivot(
    final_val,
    threshold=216521472,
    index="pratilipi_id",
    column="user_id",
    value="category_id",
):

    """
    Helper function to pivot the final value table.
    
    Args:
        final_val (pandas.dataframe) : Final dataframe with all the values we want.
        threshold (int) : due to memory limitation I was forced to use this constraint.
        index (str) : index for the pivot table.
        column (str) : columns we want for the pivot table.
        value (str) : values to be used in the pivot table.
        
    Returns:
            table_pivot (pandas.dataframe) : pivot of the table for us to find K nearest neighbors.
    """

    table_pivot = final_val.iloc[:threshold].pivot_table(
        columns=column, index=index, values=value
    )

    table_pivot.fillna(0, inplace=True)

    return table_pivot

In [None]:
def _get_most_popular(complete_data):

    """
    A function to get a list of the most popular pratilipis from each category. Used incase we aren't able to find a nearest neighbor.
    
    Args:
        complete_data (pandas.dataframe) : data on which said action needs to be performed.
        
    Returns:
            popular_pratilipis (dict) : a dictionary with a list of pratilipis descending according to their popularity.
    """

    categs = complete_data["category_name"].unique().tolist()

    popular_pratilipis = {}

    for category_name in categs:

        data = complete_data[complete_data["category_name"] == category_name]

        pratilipi_counts = (
            data.groupby("pratilipi_id")["category_name"].count().reset_index()
        )
        pratilipi_counts.rename(
            columns={"category_name": "count_pratilipi"}, inplace=True
        )

        data = data.merge(pratilipi_counts, on="pratilipi_id")
        data = data.sort_values("count_pratilipi", ascending=False).reset_index(
            drop=True
        )

        list_pratilipi = data["pratilipi_id"].unique().tolist()

        popular_pratilipis.update({category_name: list_pratilipi})

    return popular_pratilipis

In [None]:
def _get_sparse(table_pivot):

    """
    Helper function to get a sparse matrix of the pivot table to be able to fit in the KNN algorithm.
    
    Args:
        table_pivot (pandas.dataframe) : pivot of the table for us to find K nearest neighbors.
        
    Returns:
            table_sparse (csr matrix) : a sparse matrix of the pivot table.
    """

    table_sparse = csr_matrix(table_pivot)

    return table_sparse

In [None]:
def get_dataframe(filepath):
    """
    Load a csv into a dataframe, discarding a leftover "Unnamed: 0" column if present.

    Args:
        filepath (str) : path to the csv file.

    Returns:
            sample (pandas.dataframe) : csv contents as a dataframe.
    """
    sample = pd.read_csv(filepath)

    # "Unnamed: 0" is what pandas names a header-less first column, typically
    # an index that was written out with the csv.
    if "Unnamed: 0" in sample.columns:
        sample = sample.drop(columns="Unnamed: 0")

    return sample

In [None]:
def process_data(
    user_data,
    metadata,
    datetime_column="updated_at",
    user_threshold=10,
    user_column_name="user_id",
    test_size=0.25,
    merge_column="pratilipi_id",
    duplicate_columns=["user_id", "pratilipi_id"],
    metadata_columns=["pratilipi_id", "category_name"],
    keep="last",
    label_column="category_name",
    index="pratilipi_id",
    column="user_id",
    value="category_id",
    category_threshold=10,
    table_threshold=1000000,
):
    """
    Run the whole preparation pipeline and return everything needed to fit and evaluate the model.

    Steps, in order: parse and sort user_data by date, drop inactive users,
    de-duplicate metadata, merge both frames, de-duplicate and re-sort the
    merged data, split it chronologically (no shuffling) into train and test,
    label-encode the train categories, prune rarely seen items, pivot and
    convert the pivot to a sparse matrix. Progress messages and a few
    dataframe previews are printed/displayed along the way.

    Args:
        user_data (pandas.dataframe) : raw user interaction data.
        metadata (pandas.dataframe) : item metadata; an "updated_at" column is dropped if present.
        datetime_column (str) : name of the user_data column that needs to be converted to datetime.
        user_threshold (int) : users must have more than this many rows to be kept.
        user_column_name (str) : name of the column that holds the user id.
        test_size (float) : fraction of the merged data used as test data (the most recent rows).
        merge_column (str) : column on which the two dataframes are merged.
        duplicate_columns (list) : columns that identify a duplicated row in the merged data.
        metadata_columns (list) : the metadata columns carried into the merged data.
        keep (str) : whether to keep the "first" or "last" instance of a duplicate.
        label_column (str) : name of the column to label-encode.
        index (str) : index for the pivot table.
        column (str) : columns for the pivot table.
        value (str) : name of the encoded column, used as the pivot values.
        category_threshold (int) : minimum number of train rows an `index` value needs to be kept.
        table_threshold (int) : maximum number of rows fed to the pivot (memory constraint).

    Returns:
            table_pivot (pandas.dataframe) : pivot of the train data used to find K nearest neighbors.
            table_sparse (csr matrix) : a sparse matrix of the pivot table.
            test_data (pandas.dataframe) : chronological tail of the merged data, for testing.
            complete_data (pandas.dataframe) : merged, de-duplicated and sorted data (train + test).
            metadata (pandas.dataframe) : de-duplicated metadata without "updated_at".
            user_data (pandas.dataframe) : sorted user data restricted to active users.

    """

    separator = "---------------------------------------"

    # Only user_data is converted and sorted: metadata's timestamp column is
    # discarded below and never used.
    print("converting columns to datetime")
    user_data = _to_datetime(user_data, datetime_column)
    print(separator)

    print("sorting dataframe according to date")
    user_data = _sort_values(user_data, datetime_column)
    print(separator)

    print("cleaning up user data")
    user_data = _user_cleanup(user_data, user_column_name, user_threshold)
    print(separator)

    print("dropping duplicate pratilipi ids from metadata")
    metadata = _drop_duplicates(metadata, ["pratilipi_id"], keep)

    print("dropping updated_at column from metadata")
    # errors="ignore" so metadata that already lacks the column does not
    # raise KeyError.
    metadata = metadata.drop(columns="updated_at", errors="ignore")
    print(separator)

    print("merging user data and metadata on pratilipi_id")
    complete_data = _final_merge(user_data, metadata, metadata_columns, merge_column)
    print(separator)

    print("dropping duplicates on complete data")
    complete_data = _drop_duplicates(complete_data, duplicate_columns, keep)
    print(separator)

    print("sorting complete data according to date")
    complete_data = _sort_values(complete_data, datetime_column)
    print(separator)

    print("splitting train and test data")
    # shuffle=False keeps the date ordering, so the test split is the most
    # recent slice of the data.
    train_data, test_data = train_test_split(
        complete_data, test_size=test_size, shuffle=False
    )
    print(separator)

    display(train_data.head())

    print("encoding labels")
    # The encoder is fitted on the train split only; test_data keeps the raw
    # category names.
    train_data = _label_encode(train_data, value, label_column)
    print(separator)

    display(train_data.head())

    print("pruning data")
    final_val = _prune_data(train_data, index, column, value, category_threshold)
    print(separator)

    display(final_val.head())

    print("getting pivot table")
    table_pivot = _get_pivot(final_val, table_threshold, index, column, value)
    print(separator)

    print("get sparse table")
    table_sparse = _get_sparse(table_pivot)
    print(separator)

    display(test_data.head())

    return table_pivot, table_sparse, test_data, complete_data, metadata, user_data

In [None]:
def fit(table_sparse, n_neighbors=10, algorithm="brute"):
    """
    Fit a scikit-learn NearestNeighbors model on the sparse pivot matrix.

    Args:
        table_sparse (csr matrix) : a sparse matrix of the pivot table.
        n_neighbors (int) : default number of neighbors returned by later queries.
        algorithm (str) : neighbor-search algorithm passed to NearestNeighbors.

    Returns:
            model (sklearn.neighbors._unsupervised.NearestNeighbors) : fitted model used to look up the K nearest books.
    """
    knn = NearestNeighbors(algorithm=algorithm, n_neighbors=n_neighbors)
    # fit() returns the estimator itself, so the fitted model is handed back directly.
    return knn.fit(table_sparse)

In [None]:
def _get_user_prediction_stats(
    user_id, popular_pratilipis, test_data, train_pivot, metadata, model
):
    """
    Predict for one user and score those predictions against the user's test rows.

    Args:
        user_id (int) : user id.
        popular_pratilipis (dict) : per-category list of pratilipis, most popular first.
        test_data (pandas.dataframe) : split of the combined data used for testing.
        train_pivot (pandas.dataframe) : pivot table the model was fitted on.
        metadata (pandas.dataframe) : metadata containing information about pratilipis.
        model (sklearn.neighbors._unsupervised.NearestNeighbors) : fitted nearest-neighbour model.

    Returns:
            c_p (float) : percentage of predictions whose category the user has read.
            p_p (float) : percentage of predictions whose pratilipi_id the user has read.
            final_pred (list) : the top pratilipi predictions (empty when nothing could be predicted).
    """
    books_pred, user_category, user_pratilipi = _get_pred_dict(
        user_id, popular_pratilipis, test_data, train_pivot, model
    )

    # Nothing was suggested: report zero scores and no predictions.
    if len(books_pred) == 0:
        return 0.0, 0.0, []

    final_pred = _get_predictions(books_pred)
    c_p, p_p = _evaluate_predictions(
        final_pred, metadata, user_category, user_pratilipi
    )
    return c_p, p_p, final_pred

In [None]:
def _predict(model, table_pivot, book_id):

    """
    Helper function to predict K nearest pratilipis from the specified book_id.
    
    Args:
        model (sklearn.neighbors._unsupervised.NearestNeighbors) : model that can be used to predict K nearest books.
        table_pivot (pandas.dataframe) : pivot of the table for us to find K nearest neighbors.
        book_id (int) : pratilipi id of the book you want to use to predict its neighbors.
        
    Returns:
            distances (list) : list of distances of the prediction from the actual value.
            suggestions (list) : list of all suggestion.
    """

    distances, suggestions = model.kneighbors(
        table_pivot.loc[table_pivot.index == book_id].values.reshape(1, -1)
    )

    return distances, suggestions

In [None]:
def _get_pred_dict(user_id, popular_pratilipis, test_data, train_pivot, model):

    """
    Helper function to get a dictionary of all predicted pratilipis and their distances from the actual pratilipi.
    
    Args:
        user_id (int) : user id.
        popular_pratilipis (dict) : a dictionary with a list of pratilipis descending according to their popularity.
        test_data (pandas.dataframe) : split of the combined data to be used for testing.
        train_pivot (pandas.dataframe) : pivot of the table for us to find K nearest neighbors.
        model (sklearn.neighbors._unsupervised.NearestNeighbors) : model that can be used to predict K nearest books.

    Returns:
            books_pred (dict) : a dictionary of all predicted pratilipis and their distances from the actual pratilipi.
            user_category (list) : list of all the categories that the user has read.
            user_pratilipi (list) : list of all the pratilipis the user has read.
    """

    user = test_data.loc[test_data["user_id"] == user_id]
    user = user.sort_values("updated_at", ascending=False).reset_index(drop=True)
    user_pratilipi = user["pratilipi_id"].tolist()
    user_category = user["category_name"].tolist()
    used_category = {}
    unique_ids = train_pivot.index.unique().tolist()

    books_pred = {}

    i = 0
    for idd in user_pratilipi:
        if idd in unique_ids:
            distances, suggestions = _predict(model, train_pivot, idd)

            suggested_id = train_pivot.index[suggestions[0][1]]
            distance = distances[0][1]
        else:
            if user_category[i] in used_category:
                n = used_category[user_category[i]] + 1
            else:
                n = 0

            used_category.update({user_category[i]: n})

            suggested_id = popular_pratilipis.get(user_category[i])[n]
            distance = 100.0

        books_pred.update({suggested_id: distance})

        i += 1

    return books_pred, user_category, user_pratilipi

In [None]:
def _get_predictions(books_pred):

    """
    Helper function to get the top K predictions from the dictionary.
    
    Args:
        books_pred (dict) : a dictionary of all predicted pratilipis and their distances from the actual pratilipi.
    
    Returns:
            final_pred (list) : list of top K predictions.
    """

    final_pred = []
    i = 0
    for w in sorted(books_pred, key=books_pred.get, reverse=False):
        if i < 5:
            final_pred.append(w)
        else:
            break
        i += 1

    return final_pred

In [None]:
def _evaluate_predictions(final_pred, metadata, user_category, user_pratilipi):

    """
    A function to evaluate the predictions made by the model.
    
    Args:
        final_pred (list) : list of top K predictions.
        metadata (pandas.dataframe) : updated metadata containing information about pratilipis.
        user_category (list) : list of all the categories that the user has read.
        user_pratilipi (list) : list of all the pratilipis the user has read.
        
    Returns:
            category_match_percentage (list) : list of all category matches percentage.
            pratilipi_match_percentage (float) : list of all pratilipi matches percentage.
    """

    total = len(final_pred)
    category_match = 0
    pratilipi_match = 0
    category_match_percentage = 0.0
    pratilipi_match_percentage = 0.0
    if total != 0:
        for pred in final_pred:
            category = metadata.loc[metadata["pratilipi_id"] == pred][
                "category_name"
            ].tolist()[0]
            if category in user_category:
                category_match += 1
            if pred in user_pratilipi:
                pratilipi_match += 1

        category_match_percentage = 100 * (category_match / total)
        pratilipi_match_percentage = 100 * (pratilipi_match / total)

    return category_match_percentage, pratilipi_match_percentage

In [None]:
def _add_to_test_data(test_data, categ_perc, pratilipi_perc):

    """
    Helper function to add the evaluation metrics as a column to the test data to be able to evaluate on a per user basis.
    
    Args:
        test_data (pandas.dataframe) : split of the combined data to be used for testing.
        categ_perc (list) : list of all category matches percentage.
        pratilipi_perc (float) : list of all pratilipi matches percentage.
        
    Returns:
            test_data (pandas.dataframe) : updated test data with all evaluation metrics.
    """

    test_data["category_match_percentage"] = categ_perc
    test_data["pratilipi_match_percentage"] = pratilipi_perc

    return test_data

In [None]:
def predict_and_evaluate(test_data, complete_data, metadata, train_pivot, model):
    """
    Predict and evaluate for every row of the test dataset.

    The per-user statistics are computed once for each distinct user and
    reused for all of that user's rows, then written to `test_data` as the
    "category_match_percentage" and "pratilipi_match_percentage" columns.

    Args:
        test_data (pandas.dataframe) : data that was split off for testing.
        complete_data (pandas.dataframe) : processed and compiled data, used to build the popularity fallback.
        metadata (pandas.dataframe) : metadata containing information about the pratilipis.
        train_pivot (pandas.dataframe) : pivot table the model was fitted on.
        model (sklearn.neighbors._unsupervised.NearestNeighbors) : fitted nearest-neighbour model.


    Returns:
            test_data (pandas.dataframe) : test data with both evaluation metric columns added.

    """

    user_ids = test_data["user_id"].tolist()

    popular_pratilipis = _get_most_popular(complete_data)

    # A user's statistics depend only on the user id, so each user is
    # evaluated once rather than once per test row.
    stats_by_user = {}
    for u_id in user_ids:
        if u_id not in stats_by_user:
            cp, pp, _ = _get_user_prediction_stats(
                u_id, popular_pratilipis, test_data, train_pivot, metadata, model
            )
            stats_by_user[u_id] = (cp, pp)

    categ_match_perc = [stats_by_user[u_id][0] for u_id in user_ids]
    pratilipi_match_perc = [stats_by_user[u_id][1] for u_id in user_ids]

    test_data = _add_to_test_data(test_data, categ_match_perc, pratilipi_match_perc)

    return test_data