In [533]:
import math

import pandas as pd
import numpy as np

In [534]:
data = pd.read_csv("rating_final.csv", sep=";")
data = data[["userID", "placeID", "rating"]]

In [535]:
data.head()

Unnamed: 0,userID,placeID,rating
0,1077,135085,3
1,1077,135038,3
2,1077,132825,3
3,1077,135060,2
4,1068,135104,2


## Create utility matrix

In [536]:
# Create the utility matrix
umatrix = data.pivot(index="userID", columns="placeID", values="rating")

# Count number of reviews for users
num_of_reviews = umatrix.count(axis=1)

In [537]:
undefined_values = umatrix.isna().sum().sum()
total_values = umatrix.shape[0] * umatrix.shape[1]

sparsity = int(undefined_values / total_values * 10000)
print(f"{sparsity/100.0}% empty")

93.52% empty


## Matrix factorization

In [538]:
# Replace NaN with 0
umatrix.fillna(0, inplace=True)

umatrix

placeID,132560,132561,132564,132572,132583,132584,132594,132608,132609,132613,...,135080,135081,135082,135085,135086,135088,135104,135106,135108,135109
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0
1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
1005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1134,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
1135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


## Helper functions
Cosine similarity $ similarity = cos(0) = \frac{A \cdot B}{|{A}||{B}|}$ the only related Linear Algebra concept that I actually understand and can write myself

In [539]:
def cos_similarity(u, v):
    dot_product = np.dot(u, v)
    return dot_product / np.linalg.norm(u) * np.linalg.norm(v)

In [540]:
def find_similar(user_id, util_matrix):
    """
    Courtesy of
    https://machinelearningmastery.com/using-singular-value-decomposition-to-build-a-recommender-system/
    minor changes made
    """

    # Factorize using Numpy SVD
    u, s, v = np.linalg.svd(util_matrix, full_matrices=False)

    # Get index of userID
    reset_index = util_matrix.reset_index()
    index_to_user = reset_index.copy()["userID"]
    target = reset_index[reset_index["userID"] == user_id].index.values

    for i in range(0, util_matrix.shape[0]):
        if util_matrix.iloc[i].name == user_id:
            target = i

    highest_similarity = -np.inf
    highest_sim_col = -1
    # Iterate through columns
    for col in range(0,v.shape[0]):
        if col == target:
            continue

        # Get the similarity of the target col and current
        similarity = cos_similarity(v[target], v[col])
        if similarity > highest_similarity:
            highest_similarity = similarity
            highest_sim_col = col
    return index_to_user[highest_sim_col]

In [541]:
most_similar = find_similar(1002, umatrix)
print("most similar user %s" % most_similar)

most similar user 1129


In [542]:
class Recommender:

    def __init__(self, util_matrix):
        self.util_matrix = util_matrix
        self.max_recommendations = 5

    def cf_recommend(self, user_id):
        """
        :param user_id: id of the user to recommend to
        :return: set of recommendations
        """
        recommendations_ = list()

        # Find most similar user
        most_similar_user = find_similar(user_id, self.util_matrix)

        # Get rows for both, a bit hacky but n-th user has the id n + 1001
        user_row    = self.util_matrix.loc[user_id].values.flatten().tolist()
        similar_row = self.util_matrix.loc[most_similar_user].values.flatten().tolist()

        # Loop through rows 3 times
        for i in range(0,3):
            for j in range(0,len(user_row)):
                # Append to list according to the following rules:
                # 1. If user row cell j already contains values, we should not recommend
                # 2. First append the items with greatest reviews, this is why we loop 3 times
                if user_row[j] == 0 and similar_row[j] > (3-i):
                    recommendations_.append(j)

        # Truncate recommendation list
        recommendations_ = recommendations_[0:self.max_recommendations]
        return recommendations_

Some users don't get recommendations

In [543]:
recommender = Recommender(umatrix)
recommendations = recommender.cf_recommend(1063)
print(recommendations)

[]


I'm not sure what causes this, but it might be a symptom of the greedy way the algorithm above picks the most similar user. A perfect match (cosine similarity of 1) would mean there's nothing to recommend
because both users have rated the exact same places exactly the same. Even if the rating isn't exact, as long as both have reviewed the same places, there is no recommendation done.

A less similar user could l be close enough to make good recommendations, but still differ, making recommendations.

## Cascade recommender
My initial idea was to tackle the cold start problem of collaborative filtering, by augmenting the users row with content based recommendations if a user only had a 1-3 reviews.

But after finally getting my recommender to work a week before the deadline I decieded this would be too difficult.

I still wanted to implement some sort of hybrid recommender and found the Cascade model to work well for what I so far had made.

![Graph for Cascade hybrid recommender](cascade.jpeg "Cascade hybrid recommender")
courtesy of: https://medium.com/analytics-vidhya/7-types-of-hybrid-recommendation-system-3e4f78266ad8

The idea is simple: Switch RS given a certain state or condition. As I discovered that some rows don't get any recommendations, it would be nice to have a "backup" RS.

Innitially I thought this was a switching model, but I realized that the switching happens after an initial recommendation, making this in fact a Cascade hybrid (or close to atleast).

# THE FOLLOWING BLOCK CONTAINS THE SAME CODE AS `rs.ipynb`, IT'S ONLY HERE BECAUSE THIS WOULD BE MORE PORTABLE THAN IMPORTING A LIBRARY TO IMPORT NOTEBOOK FILES INTO OTHERS

In [544]:
data = pd.read_csv("geoplaces2.csv", sep=";")
# Remove unwanted features
data = data[["placeID", "alcohol", "smoking_area", "dress_code", "price", "ambience"]]
from sklearn.preprocessing import LabelEncoder

# One hot encoding for smoking area
data.join(pd.get_dummies(data["smoking_area"], prefix='smoking_area', columns=["no_smoking"]))

# Label encoding for alcohol
alcoholEncoder = LabelEncoder()
alcoholEncoder.fit(["no_alcohol", "wine_beer", "full_bar"])
data["alcohol"] = alcoholEncoder.transform(data["alcohol"])

# Label encoding for smoking area
smokingEncoder = LabelEncoder()
smokingEncoder.fit(["none", "area", "bar", "permitted"])
data["smoking_area"] = smokingEncoder.transform(data["smoking_area"])

# Label encoding for dress code
dressEncoder = LabelEncoder()
dressEncoder.fit(["casual", "informal", "formal"])
data["dress_code"] = dressEncoder.transform(data["dress_code"])

# Label encoding for price
priceEncoder = LabelEncoder()
priceEncoder.fit(["low", "medium", "high"])
data["price"] = priceEncoder.transform(data["price"])

# Label encoding for ambience
ambienceEncoder = LabelEncoder()
ambienceEncoder.fit(["quiet", "familiar"])
data["ambience"] = ambienceEncoder.transform(data["ambience"])
def find_similar_places(df, place_id, max_error):
    matches = []
    # Get first row with matching placeID
    target_row = df[df['placeID'] == place_id].iloc[0]
    for index, row in df.iterrows():

        # Assert that everything is the same length
        if len(row) != len(target_row):
            return []

        # Skip self
        if row["placeID"] == place_id:
            continue

        sigma = 0
        # Add values
        for idx, dp in enumerate(target_row):
            # Skip placeID
            if idx == 0:
                continue

            error = abs(target_row[idx] - row[idx])
            sigma += error

        # Append rows that do not differ
        if sigma < max_error:
            matches.append(row)

    return matches

# Finishing up the Cascade recommender
Here I'm inheriting the original Recommender, and initializing it. It then attempts a recommendation with the parent Recommender, if this returns an empty list, it uses the content based recommender instead

In [545]:
class CascadeRecommender(Recommender):
    def __init__(self, util_matrix):
        super().__init__(util_matrix)

    def recommend(self, user_id):
        all_place_ids = self.util_matrix.columns.values.tolist()
        recs = Recommender.cf_recommend(self, user_id)
        # Switch to content based recommender if collaborative did not return anything
        selected_recs = []
        if len(recs) == 0:
            # Get user row
            user_row = self.util_matrix.loc[user_id].values.flatten().tolist()

            # Get indeces of non-zero values, i.e. places rated by user
            indeces = [i for i, e in enumerate(user_row) if e != 0]

            # Find entry with highest rating
            highest_rating = 0
            highest_rated_place = 0

            # Find index of highest rated place by user, we don't want to make content based recommendations on items the user didn't like
            for i in indeces:
                if user_row[i] > highest_rating:
                    highest_rated_place = i
                    highest_rating = user_row[i]

            # Use content based recommender
            selected_recs = find_similar_places(data, all_place_ids[highest_rated_place], 1)

            # In this implementation we don't care about specifics of the places, omit everything but placeID
            selected_recs = [i["placeID"] for i in selected_recs]

            # Truncate recommendation list
            selected_recs = selected_recs[0:self.max_recommendations]
        else:
            selected_recs = list()
            for i in recs:
                selected_recs.append(all_place_ids[i])

        return selected_recs

In [530]:
hybrid = CascadeRecommender(umatrix)
print(hybrid.recommend(1063))

[134999, 135082, 135070, 135086, 135042]


# Summary
Here I have created a recommender system that creates a utility matrix and factorizes it. It then checks for similarity between the hidden features of V and returns the closest ones.

A Recommender class serves as an interface for the collaborative RS, nicely only taking the utility matrix as a parameter at initialization. It's then inherited by a CascadeRecommender class that can switch over to content recommendation, in case the collaborative RS fails to make a recommendation.