In [14]:
import numpy as np
import pandas as pd
import re

In [15]:
# Path of the raw review dump that the loading cell reads line by line.
DATASET_PATH = "./Data2.csv"

# Names given to the parsed fields, in the order they are zipped with the
# regex groups of each line.
COLUMNS = [
    "id_review",
    "rating",
    "review_title",
    "review_text",
    "user_pseudo",
    "user_location",
    "hotel_id",
    "date_stayed",
    "date_review",
]

In [17]:
# Layout of one review line: nine fields separated by ";;", captured in the
# same order as COLUMNS. The pieces below are concatenated into one pattern.
RE = re.compile(
    r"^(\d+);;"     # id_review: one or more digits
    r"(\d\.\d);;"   # rating: digit, literal dot, digit (dot escaped so "4x0" is rejected)
    r"“(.*)”;;"     # review_title: text wrapped in curly quotes
    r"(.*);;"       # review_text
    r"(.*);;"       # user_pseudo (may be empty)
    r"(.*);;"       # user_location (may be empty)
    r"(\d*);;"      # hotel_id: digits, may be empty
    r"(.*);;"       # date_stayed
    r"(.*)$"        # date_review
)


In [19]:
# Read the dump line by line: lines accepted by RE become one record each,
# everything else is only counted in broken_lines.
with open(DATASET_PATH, 'r', encoding="utf8") as f:
    data = []
    broken_lines = 0
    for line in f:
        match = RE.fullmatch(line.strip())
        if match is None:
            # The line does not follow the expected ";;"-separated layout.
            broken_lines += 1
            continue

        fields = match.groups()
        if len(fields) != len(COLUMNS):
            # The number of captured groups disagrees with the column list.
            broken_lines += 1
            continue

        data.append(dict(zip(COLUMNS, fields)))

# One row per successfully parsed line; every value is still a string here.
df = pd.DataFrame.from_records(data)
print(f"Chargement des données fini, {len(data)} correctly parsed lines, {broken_lines} incorrectly parsed lines")


Chargement des données fini, 878554 correctly parsed lines, 14 incorrectly parsed lines


In [22]:
# Every parsed field is a string, so a plain dropna() never removes anything:
# a missing hotel id shows up as '' rather than NaN. Convert the rating to a
# number (unparseable values become NaN) and then keep only the rows that
# have both a usable rating and a non-blank hotel id.
df = (
    df.assign(rating=pd.to_numeric(df['rating'], errors='coerce'))
      .loc[lambda d: d['rating'].notna()
           & d['hotel_id'].notna()
           & d['hotel_id'].astype(str).str.strip().ne('')]
      .copy()  # independent frame, so the later column assignments are unambiguous
)

## Encoding user

In [23]:
# Reviews without a pseudonym all share the "Anon" placeholder. The parsed
# fields are strings, so a missing pseudonym is '' (or whitespace) rather than
# NaN; both cases are mapped to the placeholder.
df['user_pseudo'] = (
    df['user_pseudo']
    .fillna("Anon")
    .replace(r'^\s*$', "Anon", regex=True)
)

In [24]:
from sklearn import preprocessing

# Encoder that assigns one integer label to each distinct user pseudonym.
le_user = preprocessing.LabelEncoder()
le_user.fit(df['user_pseudo'].to_numpy())



LabelEncoder()

In [25]:
# Replace the pseudonyms with their integer labels from le_user.
# NOTE(review): this overwrites the raw column in place, so the cell is
# presumably meant to run only once per kernel session - confirm.
df['user_pseudo'] = le_user.transform(df['user_pseudo'].to_numpy())

In [26]:
# Number of distinct user labels learned by le_user.
USER_COUNT = le_user.classes_.shape[0]

## Encoding hotel

In [27]:
# Encoder that assigns one integer label to each distinct hotel id.
le_hotel = preprocessing.LabelEncoder()
le_hotel.fit(df['hotel_id'].to_numpy())


LabelEncoder()

In [28]:
# Replace the hotel ids with their integer labels from le_hotel.
# NOTE(review): this overwrites the raw column in place, so the cell is
# presumably meant to run only once per kernel session - confirm.
df['hotel_id'] = le_hotel.transform(df['hotel_id'].to_numpy())

In [29]:
# Number of distinct hotel labels learned by le_hotel.
HOTEL_COUNT = le_hotel.classes_.shape[0]


## Score harmonization

In [30]:
# Any rating that is still missing is replaced by the sentinel -1.
# NOTE(review): -1 lies below the (0, 5) rating_scale given to the Reader
# later in this notebook - confirm the sentinel is intended.
df['rating'] = df['rating'].fillna(value=-1)

## Matrix factorization

In [31]:
from surprise import SVD,NMF,SVDpp
from surprise import Dataset
from surprise import Reader
from surprise import NormalPredictor



from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy




## Test: SVD vs NMF accuracy

In [33]:



# Three-column view of the reviews: rating, encoded user and encoded hotel.
rating_df = pd.DataFrame({
    "rating": df['rating'],
    "user_pseudo": df["user_pseudo"],
    "hotel_id": df["hotel_id"],
})

In [35]:
# Loading from a DataFrame still needs a Reader, but only its rating_scale
# parameter is required.
reader = Reader(rating_scale=(0, 5))

# Column order matters: user id, item id, then rating.
# Note that this rebinds `data`, which previously held the parsed records.
data = Dataset.load_from_df(df.loc[:, ['user_pseudo', 'hotel_id', 'rating']], reader)


In [36]:
# Fixed seed so that the split and both factorizations give the same RMSE on
# every run (each of them is randomly initialised otherwise).
RANDOM_STATE = 42

# Random train/test split; the test set holds 15% of the ratings.
trainset, testset = train_test_split(data, test_size=.15, random_state=RANDOM_STATE)

algos = [(SVD(random_state=RANDOM_STATE), "SVD"), (NMF(random_state=RANDOM_STATE), "NMF")]

for algo, name in algos:
    algo.fit(trainset)
    predictions = algo.test(testset)

    # accuracy.rmse prints its own "RMSE: ..." line and returns the value,
    # which is printed again next to the algorithm name.
    print(name, accuracy.rmse(predictions))

RMSE: 1.0546
SVD 1.054635216935152
RMSE: 1.2316
NMF 1.2316333193779874


## Test: top-N recommendations

In [16]:

def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm. Each item must unpack
            into five values: (uid, iid, true rating, estimate, details).
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict (a defaultdict(list)) where keys are user (raw) ids and values are
    lists of tuples [(raw item id, rating estimation), ...] holding at most n
    entries, sorted by decreasing estimation.
    '''
    # Imported here because the notebook never imports defaultdict elsewhere;
    # without it this function raised NameError on its first call.
    from collections import defaultdict

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and keep the n highest ones.
    # Only existing keys are reassigned, so iterating over items() is safe.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n




In [None]:
# Train on every available rating. The seed keeps the learned factors, and
# therefore the recommendations below, identical from one run to the next.
trainset = data.build_full_trainset()
algo = SVD(random_state=42)
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
# NOTE(review): the anti-testset holds every unseen (user, item) pair, so it
# is presumably very large on the full dataset - confirm it fits in memory.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])
