# Matrix factorization

## Loading the data

In [30]:
import numpy as np
import pandas as pd
import re
from surprise import SVD,NMF,SVDpp
from surprise import Dataset
from surprise import Reader
from surprise import NormalPredictor



from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy




In [31]:
DATASET_PATH = "../Data2.csv"
COLUMNS = ["id_review", "rating", "review_title", "review_text", "user_pseudo", "user_location", "hotel_id", "date_stayed", "date_review"]

In [32]:
RE = re.compile(r"^(\d+);;(\d.\d);;“(.*)”;;(.*);;(.*);;(.*);;(\d*);;(.*);;(.*)$")


In [33]:
with open(DATASET_PATH, 'r', encoding="utf8") as f:
    data = []
    broken_lines = 0
    for line in f:
        match = RE.fullmatch(line.strip())
        if match:
            fields = match.groups()

            # if the correct number of fields where found
            if len(fields) == len(COLUMNS):
                data.append({column: field for column, field in zip(COLUMNS, fields)})
            else:
                broken_lines += 1
                #print("Not the correct number of match on this line:")
                #print(line)
        else:
            broken_lines += 1
            #print("No match on this line:")
            #print(line)
            
    
  # print(f"{len(data)} correctly parsed lines, {broken_lines} incorrectly parsed lines")
#df = pd.read_csv(DATASET_ARCHIVE_PATH, sep='\;\;', names=COLUMNS, header=None, error_bad_lines=False)
#clear_output()
df = pd.DataFrame.from_records(data)
print(f"Chargement des données fini, {len(data)} correctly parsed lines, {broken_lines} incorrectly parsed lines")


Chargement des données fini, 878554 correctly parsed lines, 14 incorrectly parsed lines


In [34]:
df = df.dropna(subset=['rating','hotel_id'])
df.sample()

Unnamed: 0,id_review,rating,review_title,review_text,user_pseudo,user_location,hotel_id,date_stayed,date_review
684790,87788597,2.0,Do not stay here!!!!,My husband and I stayed here for the weekend t...,Angie2003,"Charleston, West Virginia",94171,November 2010,"November 21, 2010"


## Encoding user

In [35]:
df['user_pseudo'] = df['user_pseudo'].fillna("Anon")

In [36]:
from sklearn import preprocessing

le_user  = preprocessing.LabelEncoder()
le_user.fit(df['user_pseudo'].values)

le_user.transform(["Anon"])

array([19989])

In [37]:
df['user_pseudo'] = le_user.transform(df['user_pseudo'].values)
df['user_pseudo']  = df['user_pseudo'].astype(int)

In [38]:
USER_COUNT = len(le_user.classes_)

In [39]:
df.sample()

Unnamed: 0,id_review,rating,review_title,review_text,user_pseudo,user_location,hotel_id,date_stayed,date_review
322362,122024343,5.0,Great hotel; excellent service,We did a two-night weekend getaway to celebrat...,252441,"Milwaukee, Wisconsin",293203,December 2011,"December 23, 2011"


# Encoding hotel

In [40]:
le_hotel =  preprocessing.LabelEncoder()
le_hotel.fit(df['hotel_id'].values)


LabelEncoder()

In [41]:
df['hotel_id'] = le_hotel.transform(df['hotel_id'].values)
df['hotel_id'] = df['hotel_id'].astype(int) 

In [42]:
HOTEL_COUNT = len(le_hotel.classes_)


## Matrix factorization Algorithms


In [14]:



rating_df = pd.DataFrame()
rating_df["rating"] = df['rating']
rating_df["user_pseudo"] = df["user_pseudo"]
rating_df["hotel_id"] = df["hotel_id"]

rating_df.sample(30)

Unnamed: 0,rating,user_pseudo,hotel_id
392836,5.0,JenBenG,224279
474506,1.0,angelkfire,108158
210010,5.0,Lovetoroadtrip82,93507
495283,4.0,scotsroadwarrior,498304
280057,3.0,BRH949,87618
337464,4.0,Hannah F,77852
770007,5.0,annasmom88,225873
678279,5.0,roadie80,100582
739839,5.0,dbtraveler58,123022
116428,2.0,LKou,1214829


In [15]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(0, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['user_pseudo', 'hotel_id', 'rating']], reader)


 ## Intrinsec measure

In [16]:
# sample random trainset and testset
# test set is made of 15% of the ratings.
trainset, testset = train_test_split(data, test_size=.15)

algos = [(SVD(),"SVD"),(NMF(),"NMF")]

for algo,name in algos:
    algo.fit(trainset)
    predictions = algo.test(testset)
    print(name,"Test",accuracy.rmse(predictions))
    predictions = algo.test(trainset.build_testset())
    print(name,"Train",accuracy.rmse(predictions))

RMSE: 1.0551
SVD Test 1.0551149350601863
RMSE: 0.7276
SVD Train 0.7275992564041589
RMSE: 1.2278
NMF Test 1.227826425916374
RMSE: 0.3931
NMF Train 0.3931116842867549


## Model creation

In [17]:
from joblib import dump, load

In [18]:
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)
algo.predict(19989,2)

Prediction(uid=19989, iid=2, r_ui=None, est=3.9362281658270293, details={'was_impossible': False})

In [19]:
dump(algo, 'svd.joblib') 

['svd.joblib']

In [43]:
dump(le_hotel,'le_hotel.joblib')

['le_hotel.joblib']

In [44]:
dump(le_user,'le_user.joblib')

['le_user.joblib']