# Preprocessing Data

Import Library

In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from datetime import datetime
from sortedcontainers import SortedList

Read Csv File dan cek apakah file nya sudah terbaca

In [2]:
url = 'https://drive.google.com/file/d/1rOPR2Xg4g5P4jxKejzZ1mx20L2Pwb4Ln/view?usp=sharing'
file_id = url.split('/')[-2]
dwn_url = 'https://drive.google.com/uc?id=' + file_id
df = pd.read_csv(dwn_url, sep=",")
df.head()

Unnamed: 0,user_id,place_id,rating
0,1,2,3
1,1,7,5
2,1,8,5
3,1,12,2
4,1,13,5


Cek apakah terdapat nilai kosong pada kolom

In [3]:
df.isna().sum()

user_id     0
place_id    0
rating      0
dtype: int64

Melihat informasi df

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   user_id   3000 non-null   int64
 1   place_id  3000 non-null   int64
 2   rating    3000 non-null   int64
dtypes: int64(3)
memory usage: 70.4 KB


userId dari 1...3000, convert jadi userId dari 0...2999

In [5]:
userID = df.user_id.values
print(userID.min())
placeID = df.place_id.values
print(placeID.min())

1
1


Cek apakah terdapat Id yang tidak muncul dalam urutan

In [6]:
def missingIdNum(data):
    return [i for i in range(1,len(data)) if data[i-1]!=i]

place = np.sort(list(set(df.place_id.values)))
user = np.sort(list(set(df.user_id.values)))

print('ID place tidak ada yang hilang? {}'.format(len(missingIdNum(place))==0))
print('ID user tidak ada yang hilang? {}'.format(len(missingIdNum(user))==0))

ID place tidak ada yang hilang? True
ID user tidak ada yang hilang? True


Membuat mapping untuk placeId

In [7]:
unique_place_ids = set(df.place_id.values) # convert to array -> set (untuk menghapus data duplikat)
place2idx = {}
count = 0
for place_id in unique_place_ids:
    place2idx[place_id] = count
    count+=1
df['place_idx'] = df.apply(lambda row: place2idx[row.place_id], axis=1)

unique_user_ids = set(df.user_id.values) # convert to array -> set (untuk menghapus data duplikat)
user2idx = {}
count = 0
for user_id in unique_user_ids:
    user2idx[user_id] = count
    count+=1
df['user_idx'] = df.apply(lambda row: user2idx[row.user_id], axis=1)

Drop kolom placeName dan placeId, lalu swap kolom place_idx dengan rating

In [8]:
df = df.drop(columns=['place_id','user_id'])

In [9]:
cols = list(df.columns)
a, b = cols.index('rating'), cols.index('user_idx')
cols[a], cols[b] = cols[b], cols[a]
df = df[cols]
df

Unnamed: 0,user_idx,place_idx,rating
0,0,1,3
1,0,6,5
2,0,7,5
3,0,11,2
4,0,12,5
...,...,...,...
2995,99,57,1
2996,99,58,5
2997,99,64,4
2998,99,68,5


In [10]:
df.columns = ['userID', 'placeID', 'rating']

df.head()

Unnamed: 0,userID,placeID,rating
0,0,1,3
1,0,6,5
2,0,7,5
3,0,11,2
4,0,12,5


In [11]:
print('Original dataset size:', len(df))

Original dataset size: 3000


In [12]:
N = df.userID.max() + 1 # Banyaknya user
M = df.placeID.max() + 1 # Banyaknya tempat wisata
print("Banyak user:", N)
print("Banyak tempat wisata:", M)

# Menghitung banyak nya suatu user disebutkan dalam kolom userID -> semakin banyak, maka user tsb sering merating tempat wisata
user_ids_count = Counter(df.userID)
# Menghitung banyak nya suatu tempat wisata disebutkan dalam kolom placeID -> semakin banyak, maka tempat wisata tsb sering dirating oleh users
place_ids_count = Counter(df.placeID)
# Bentuknya menjadi tuple {key:value}

Banyak user: 100
Banyak tempat wisata: 75


In [13]:
# Banyaknya user dan tempat yang akan disimpan (digunakan untuk small DS)
n = 75
m = 75

user_ids = [u for u, c in user_ids_count.most_common(n)]
place_ids = [m for m, c in place_ids_count.most_common(m)]
# u,m: key ; c: value

In [14]:
'''
Membuat small df
- userID dan placeID digabungkan menggunakan bitwise operator &
- row diambil jika user di user_ids <=> merating tempat wisata di place_ids
'''
df_small = df[df.userID.isin(user_ids) & df.placeID.isin(place_ids)].copy()
df_small

Unnamed: 0,userID,placeID,rating
0,0,1,3
1,0,6,5
2,0,7,5
3,0,11,2
4,0,12,5
...,...,...,...
2245,74,61,5
2246,74,62,3
2247,74,66,3
2248,74,68,2


Bikin jadi sequential id nya

In [15]:
unique_user_ids = set(df_small.userID.values)
new_user_id_map = {}
i = 0
for old in unique_user_ids:
    new_user_id_map[old] = i
    i+=1
print("Banyak user:", i)

unique_place_ids = set(df_small.placeID.values)
new_place_id_map = {}
j = 0
for old in unique_place_ids:
    new_place_id_map[old] = j
    j+=1
print("Banyak tempat wisata:",j)

df_small.loc[:,'userID'] = df_small.apply(lambda row: new_user_id_map[row.userID], axis=1)
df_small.loc[:,'placeID'] = df_small.apply(lambda row: new_place_id_map[row.placeID], axis=1)

Banyak user: 75
Banyak tempat wisata: 75


In [16]:
df_small

Unnamed: 0,userID,placeID,rating
0,0,1,3
1,0,6,5
2,0,7,5
3,0,11,2
4,0,12,5
...,...,...,...
2245,74,61,5
2246,74,62,3
2247,74,66,3
2248,74,68,2


In [17]:
print("max user id:", df_small.userID.max())
print("max tempat wisata id:", df_small.placeID.max())
print("small df size:", len(df_small))
df_small

max user id: 74
max tempat wisata id: 74
small df size: 2250


Unnamed: 0,userID,placeID,rating
0,0,1,3
1,0,6,5
2,0,7,5
3,0,11,2
4,0,12,5
...,...,...,...
2245,74,61,5
2246,74,62,3
2247,74,66,3
2248,74,68,2


In [18]:
N = df_small.userID.max()+1
M = df_small.placeID.max()+1

count = 0
for i in df_small.rating:
    if i==0 or i =='0':
        count+=1
print(count)
print("Banyak user:", N)
print("Banyak tempat wisata:", M)

0
Banyak user: 75
Banyak tempat wisata: 75


In [19]:
df_small = shuffle(df_small)
df_small.reset_index(inplace=True, drop=True)
df_small

Unnamed: 0,userID,placeID,rating
0,22,59,5
1,71,15,5
2,4,16,5
3,64,62,4
4,60,37,4
...,...,...,...
2245,12,69,4
2246,51,24,5
2247,1,30,5
2248,47,42,4


Split into data train and data test


In [20]:
# 80% menjadi df train, 20% df test
cutoff = int(0.7*len(df_small))
df_train = df_small.iloc[:cutoff]
df_test = df_small.iloc[cutoff:]

In [21]:
df_train.sort_values(by=['userID'], inplace=True)
df_test.sort_values(by=['userID'], inplace=True)

print(df_train.shape)
print(df_test.shape)

print(df_train)
print(df_test)

(1575, 3)
(675, 3)
      userID  placeID  rating
355        0       47       3
1350       0       38       3
589        0       67       5
1295       0       40       5
1393       0       30       5
...      ...      ...     ...
446       74        9       4
1197      74       62       3
1544      74       49       2
13        74        0       1
1258      74       25       5

[1575 rows x 3 columns]
      userID  placeID  rating
1614       0       51       2
2179       0       49       5
2003       0       44       4
2071       0       50       5
1743       0        7       5
...      ...      ...     ...
1721      74       24       5
1981      74       48       2
1646      74       16       1
2173      74       68       2
2027      74        3       4

[675 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.sort_values(by=['userID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.sort_values(by=['userID'], inplace=True)


# DATASET INFO

In [22]:
df_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2250 entries, 0 to 2249
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   userID   2250 non-null   int64
 1   placeID  2250 non-null   int64
 2   rating   2250 non-null   int64
dtypes: int64(3)
memory usage: 52.9 KB


# Final Project

Import Library

In [23]:
import numpy as np
import configparser
import math
import warnings
from sklearn.metrics import mean_squared_error
import pandas as ps

# GROUP CLASS

Kelas 'Grup' bertanggung jawab untuk menghasilkan grup acak dengan ukuran berbeda: kecil, sedang, dan besar dan melakukan evaluasi terhadap berbagai metode AF, BF, dan WBF yang digunakan untuk merekomendasikan grup ini.

In [24]:
class Group:
    def __init__(self, members, candidate_items, ratings):
        # member ids
        self.members = sorted(members)

        # List of items yang dapat direkomendasikan.
        # List ini seharusnya belum didatangi oleh anggota grup mana pun
        self.candidate_items = candidate_items # menyimpan daftar item yang dapat direkomendasikan

        self.actual_recos = [] # menyimpan rekomendasi aktual yang dibuat atau diprediksi 
        self.false_positive = [] # Atribut ini menginisialisasi suatu daftar kosong, digunakan untuk menyimpan item yang keliru dianggap sebagai rekomendasi positif
        
        # Atribut ini menghitung jumlah peringkat yang diberikan oleh setiap anggota dalam suatu kelompok atau tim
        # menghitung jumlah peringkat yang tidak nol untuk setiap anggota
        self.ratings_per_member = [np.size(ratings[member].nonzero()) for member in self.members]
        

        # AF
        self.grp_factors_af = []
        self.bias_af = 0
        self.precision_af = 0
        self.recall_af = 0
        self.reco_list_af = []

        # BF
        self.grp_factors_bf = []
        self.bias_bf = 0
        self.precision_bf = 0
        self.recall_bf = 0
        self.reco_list_bf = []

        # WBF
        self.grp_factors_wbf = []
        self.bias_wbf = 0
        self.precision_wbf = 0
        self.recall_wbf = 0
        self.weight_matrix_wbf = []
        self.reco_list_wbf = []


Kelas 'Config' berfungsi sebagai pembaca konfigurasi yang bertanggung jawab membaca parameter konfigurasi dan menangani pengaturan konfigurasi untuk sistem rekomendasi.

In [25]:
#Configuration reader.
class Config:
    def __init__(self, config_file_path):
        self.config_file_path = config_file_path

        configParser = configparser.RawConfigParser()
        configParser.read(config_file_path)

        # Bali tourism dataset, 80 - 20 train/test ratio, present in data directory
        url_trainingData = 'https://drive.google.com/file/d/1-1canNjW1tvxFBfwivedp1O3WRT0vBQB/view?usp=sharing'
        url_testData = 'https://drive.google.com/file/d/1--4dU905tmg_rc9AagfoqxVyV2K4UCsq/view?usp=sharing'

        fileTrain_id = url_trainingData.split('/')[-2]
        dwnTrain_url = 'https://drive.google.com/uc?id=' + fileTrain_id
        fileTest_id = url_testData.split('/')[-2]
        dwnTest_url = 'https://drive.google.com/uc?id=' + fileTest_id

        self.training_file = ps.read_csv(dwnTrain_url)
        self.testing_file = ps.read_csv(dwnTest_url)

        self.small_grp_size = int(configParser.get('Config', 'small_grp_size'))
        self.medium_grp_size = int(configParser.get('Config', 'medium_grp_size'))
        self.large_grp_size = int(configParser.get('Config', 'large_grp_size'))

        self.max_iterations_mf = int(configParser.get('Config', 'max_iterations_mf'))
        self.lambda_mf = float(configParser.get('Config', 'lambda_mf'))
        self.learning_rate_mf = float(configParser.get('Config', 'learning_rate_mf'))

        self.num_factors = int(configParser.get('Config', 'num_factors'))

        #AF (after factorization)
        self.rating_threshold_af = float(configParser.get('Config', 'rating_threshold_af'))
        self.num_recos_af = int(configParser.get('Config', 'num_recos_af'))

        #BF (before factorization)
        self.rating_threshold_bf = float(configParser.get('Config', 'rating_threshold_bf'))
        self.num_recos_bf = int(configParser.get('Config', 'num_recos_bf'))

        #WBF (weighted before factorization)
        self.rating_threshold_wbf = float(configParser.get('Config', 'rating_threshold_wbf'))
        self.num_recos_wbf = int(configParser.get('Config', 'num_recos_wbf'))

        self.is_debug = configParser.getboolean('Config', 'is_debug')

# Membuat daftar tempat wisata yang dapat direkomendasikan :


Fungsi di sini memiliki tujuan untuk mencari item yang belum didatangi oleh semua anggota dalam suatu kelompok. Fungsi ini dapat digunakan untuk menyaring item yang dapat direkomendasikan kepada kelompok tersebut

In [26]:
@staticmethod
def find_candidate_items(ratings, members):
    if len(members) == 0: return [] # pengecekan apakah ada members kosong

    # Membuat daftar indeks item yang belum didatangi oleh anggota pertama dalam daftar anggota 
    unwatched_items = np.argwhere(ratings[members[0]] == 0) 

    # Fungsi kemudian melakukan iterasi melalui setiap anggota selain anggota pertama
    for member in members:
        # daftar indeks item yang belum didatangi oleh anggota saat ini
        cur_unwatched = np.argwhere(ratings[member] == 0)
        # mengupdate dengan irisan (intersection) dari unwatched_items dan cur_unwatched
        unwatched_items = np.intersect1d(unwatched_items, cur_unwatched)

    # Fungsi mengembalikan daftar indeks item yang belum ditonton oleh semua anggota dalam kelompok.
    return unwatched_items

# fungsi ini ditambahkan sebagai metode pada kelas Group
Group.find_candidate_items = find_candidate_items
# Mencari item yang dapat direkomendasikan kepada anggota kelompok tersebut

Fungsi di sini memiliki tujuan untuk mencari item yang belum dievaluasi (belum mendapatkan rating) oleh semua anggota dalam suatu kelompok. Fungsi ini dapat digunakan untuk menyaring item yang belum mendapatkan rating dari anggota kelompok tersebut.

In [27]:
@staticmethod
def non_testable_items(members, ratings):
    # Menginisialisasi suatu daftar indeks yang sesuai dengan item yang belum diuji oleh anggota pertama dalam grup
    # untuk mencari indeks item yang belum mendapatkan rating (nilai nol) oleh anggota pertama dalam daftar anggota
    non_eval_items = np.argwhere(ratings[members[0]] == 0) 
    
    # melakukan iterasi melalui setiap anggota selain anggota pertama
    for member in members:

        # Mencari item yang belum dievaluasi oleh anggota tersebut
        cur_non_eval_items = np.argwhere(ratings[member] == 0)

        #mengambil irisan dengan indeks item yang belum diuji oleh anggota pertama dan anggota saat ini.
        non_eval_items = np.intersect1d(non_eval_items, cur_non_eval_items) 

    # mengembalikan daftar indeks item yang belum dievaluasi oleh semua anggota dalam kelompok
    return non_eval_items

Group.non_testable_items = non_testable_items
# mencari item yang belum dievaluasi oleh anggota kelompok tersebut.

# GENERATING GROUPS

Membuat grup dari pengguna yang tersedia. Untuk evaluasi yang lebih baik dari pendekatan rekomendasi, harus memastikan bahwa ada cukup item untuk diuji. Jadi menetapkan testable_threshold menjadi 10, yang pada dasarnya berarti bahwa setidaknya ada 10 tempat wisata dalam dataset pengujian yang telah diberi rating oleh setidaknya satu anggota grup.

Fungsi yang didefinisikan di sini memiliki tujuan untuk menghasilkan sejumlah kelompok dengan anggota yang dipilih secara acak, dan memastikan bahwa setiap kelompok memiliki jumlah item yang dapat diuji (testable items) yang memenuhi batas tertentu (testable_threshold).

In [28]:
@staticmethod
def generate_groups(cfg, ratings, test_ratings, num_users, count, size, disjoint = True):
    avbl_users = [i for i in range(num_users)] # Daftar pengguna yang tersedia, awalnya berisi semua indeks pengguna
    groups = [] #  daftar yang akan berisi kelompok-kelompok yang dihasilkan
    testable_threshold = 10 #Batas minimum jumlah item yang dapat diuji dalam suatu grup

    iter_idx = 0
    while iter_idx in range(count):
        group_members = np.random.choice(avbl_users, size = size, replace = False) # Memilih anggota-anggota acak dari pengguna yang tersedia
        candidate_items = Group.find_candidate_items(ratings, group_members) # mendapatkan item-item yang dapat direkomendasikan kepada grup
        non_eval_items = Group.non_testable_items(group_members, test_ratings) # mendapatkan item-item yang tidak dapat dievaluasi oleh grup
        testable_items = np.setdiff1d(candidate_items, non_eval_items) # Menghitung item-item yang dapat diuji dengan mengambil perbedaan dari candidate_items, non_eval_items
        # Fungsi mencari item yang dapat diuji dengan mengambil perbedaan antara item yang dapat direkomendasikan dan item yang belum dapat diuji

        # Fungsi memeriksa apakah ada item yang dapat direkomendasikan dan apakah jumlah item yang dapat diuji memenuhi batas testable_threshold.
        if len(candidate_items) != 0 and len(testable_items) >= testable_threshold:
            groups += [Group(group_members, candidate_items, ratings)] # membuat kelompok baru dengan anggota dan item yang sesuai 
            avbl_users = np.setdiff1d(avbl_users, group_members) #  anggota kelompok tersebut dihapus dari daftar pengguna yang tersedia
            iter_idx += 1

    # mengembalikan daftar kelompok yang telah dibuat.
    return groups

Group.generate_groups = generate_groups

# PREDICTION
Sekarang grup telah dibentuk, ini adalah metode untuk memprediksi tempat wisata!
Threshold untuk rating yang diprediksi untuk suatu item menjadi 4.

Fungsi yang didefinisikan di sini memiliki tujuan untuk menghasilkan rekomendasi aktual (actual recommendations) dan mengidentifikasi item-item yang disarankan tetapi seharusnya tidak disarankan (false positive). 

In [29]:
def generate_actual_recommendations(self, ratings, threshold):
    # Mendapatkan item-item yang tidak dapat dievaluasi oleh anggota grup menggunakan metode non_testable_items dari kelas Group
    non_eval_items = Group.non_testable_items(self.members, ratings) 

    # Inisialisasi dengan indeks item yang direkomendasikan oleh anggota pertama dalam grup atau belum dinilai (rating 0)
    items = np.argwhere(np.logical_or(ratings[self.members[0]] >= threshold, ratings[self.members[0]] == 0)).flatten() 

    #  Inisialisasi dengan indeks item yang tidak direkomendasikan atau memiliki peringkat di antara 0 dan ambang batas oleh anggota pertama
    fp = np.argwhere(np.logical_and(ratings[self.members[0]] > 0, ratings[self.members[0]] < threshold)).flatten() 

    for member in self.members:
        # Mendapatkan item yang direkomendasikan oleh anggota saat ini atau belum dinilai (rating 0).
        cur_items = np.argwhere(np.logical_or(ratings[member] >= threshold, ratings[member] == 0)).flatten()
        # Menggabungkan indeks item yang tidak direkomendasikan atau memiliki peringkat di antara 0 dan ambang batas oleh anggota saat ini dengan yang telah ditemukan sebelumnya (fp)
        fp = np.union1d(fp, np.argwhere(np.logical_and(ratings[member] > 0, ratings[member] < threshold)).flatten())
        # Mengambil irisan dari indeks item yang direkomendasikan oleh anggota pertama dan anggota saat ini.
        items = np.intersect1d(items, cur_items) 

    #Menghapus item-item yang tidak dapat dievaluasi dari daftar item yang direkomendasikan.
    items = np.setdiff1d(items, non_eval_items) 

    # Menetapkan daftar item yang sebenarnya direkomendasikan oleh anggota grup ke dalam atribut actual_recos dari objek kelas
    self.actual_recos = items
    # Menetapkan daftar item yang salah direkomendasikan 
    self.false_positive = fp 

Group.generate_actual_recommendations  = generate_actual_recommendations


# EVALUATION

Tiga fungsi berikut digunakan untuk evaluasi masing-masing dari ketiga metode AF, BF, dan WBF.
Metode evaluasi yang digunakan adalah Precision dan Recall untuk berbagai ukuran grup.

In [66]:
def evaluate_af(self, is_debug=False):
    tp = float(np.intersect1d(self.actual_recos, self.reco_list_af).size)
    fp = float(np.intersect1d(self.false_positive, self.reco_list_af).size)

    try:
        self.precision_af = tp / (tp + fp)
    except ZeroDivisionError:
        self.precision_af = np.NaN

    try:
        self.recall_af = tp / self.actual_recos.size
    except ZeroDivisionError:
        self.recall_af = np.NaN

    if is_debug:
        print('tp: ', tp)
        print('fp: ', fp)
        print('precision_af: ', self.precision_af)
        print('recall_af: ', self.recall_af)

    return self.precision_af, self.recall_af, tp, fp
Group.evaluate_af = evaluate_af

In [81]:
def evaluate_bf(self, is_debug=False):
    tp = float(np.intersect1d(self.actual_recos, self.reco_list_bf).size)
    fp = float(np.intersect1d(self.false_positive, self.reco_list_bf).size)

    try:
        self.precision_bf = tp / (tp + fp)
    except ZeroDivisionError:
        self.precision_bf = np.NaN

    try:
        self.recall_bf = tp / self.actual_recos.size
    except ZeroDivisionError:
        self.recall_bf = np.NaN

    if is_debug:
        print('tp: ', tp)
        print('fp: ', fp)
        print('precision_bf: ', self.precision_bf)
        print('recall_bf: ', self.recall_bf)

    return self.precision_bf, self.recall_bf, tp, fp
Group.evaluate_bf = evaluate_bf

In [82]:
def evaluate_wbf(self, is_debug=False):
    tp = float(np.intersect1d(self.actual_recos, self.reco_list_wbf).size)
    fp = float(np.intersect1d(self.false_positive, self.reco_list_wbf).size)

    try:
        self.precision_wbf = tp / (tp + fp)
    except ZeroDivisionError:
        self.precision_wbf = np.NaN

    try:
        self.recall_wbf = tp / self.actual_recos.size
    except ZeroDivisionError:
        self.recall_wbf = np.NaN

    if is_debug:
        print('tp: ', tp)
        print('fp: ', fp)
        print('precision_bf: ', self.precision_wbf)
        print('recall_bf: ', self.recall_wbf)

    return self.precision_wbf, self.recall_wbf, tp, fp
Group.evaluate_wbf = evaluate_wbf

# Aggregator Class :

Kelas ini bertanggung jawab untuk menentukan cara yang berbeda untuk mengagregasi faktor untuk anggota grup.

In [45]:
class Aggregators:
    def __init__(self):
        pass

    #pass ratings or factors as input
    @staticmethod
    def average(arr): # menghitung rata-rata dari matriks input
        return np.average(arr, axis = 0, weights = None)

    @staticmethod
    def average_bf(arr): # menghitung rata-rata dari matriks input, tetapi mengabaikan nilai nol (0) dalam perhitungannya
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            arr[arr == 0] = np.nan
            return np.nanmean(arr, axis=0)

    @staticmethod
    def weighted_average(arr, weights): # Menghitung rata-rata terbobot dari matriks input menggunakan bobot yang diberikan.
        return np.average(arr, axis = 0, weights = weights)


# GroupRec Class :

Ini adalah class utama yang bertanggung jawab untuk membaca data, menentukan metode untuk pendekatan, dan mengevaluasinya.

In [46]:
# overflow warnings should be raised as errors
np.seterr(over='raise')

class GroupRec:
    def __init__(self):
        self.cfg = Config(r"config.conf")

        # training and testing matrices
        self.ratings = None
        self.test_ratings = None

        self.groups = []

        # read data into above matrices
        self.read_data()

        self.num_users = self.ratings.shape[0]
        self.num_items = self.ratings.shape[1]

        # predicted ratings matrix based on factors.
        self.predictions = np.zeros((self.num_users, self.num_items))

        # output after svd factorization
        # initialize all unknowns with random values from -1 to 1
        self.user_factors = np.random.uniform(-1, 1, (self.ratings.shape[0], self.cfg.num_factors))
        self.item_factors = np.random.uniform(-1, 1, (self.ratings.shape[1], self.cfg.num_factors))

        self.user_biases = np.zeros(self.num_users)
        self.item_biases = np.zeros(self.num_items)

        # global mean of ratings a.k.a mu
        self.ratings_global_mean = 0

    # add list of groups
    def add_groups(self, groups):
        self.groups = groups

    # remove groups
    def remove_groups(self, groups):
        self.groups = []

# Read Data

Menggunakan library 'pandas' untuk membaca data test dan data train dari file csv. Setelah itu membuat matriks rating item * pengguna di sini.

In [47]:
# read training and testing data into matrices
def read_data(self):
  column_headers = ['userID', 'placeID', 'rating']

  url_trainingData = r'https://drive.google.com/file/d/1-1canNjW1tvxFBfwivedp1O3WRT0vBQB/view?usp=sharing'
  url_testData = r'https://drive.google.com/file/d/1--4dU905tmg_rc9AagfoqxVyV2K4UCsq/view?usp=sharing'


  fileTrain_id = url_trainingData.split('/')[-2]
  dwnTrain_url = 'https://drive.google.com/uc?id=' + fileTrain_id
  print('Reading training data from...')
  training_data = ps.read_csv(dwnTrain_url)

  fileTest_id = url_testData.split('/')[-2]
  dwnTest_url = 'https://drive.google.com/uc?id=' + fileTest_id
  print('Reading testing data...')
  testing_data = ps.read_csv(dwnTest_url)

  num_users = max(training_data.userID.unique())
  num_items = max(training_data.placeID.unique())

  self.ratings = np.zeros((num_users, num_items))
  self.test_ratings = np.zeros((num_users, num_items))


  for row in training_data.itertuples(index=False):
      self.ratings[row.userID - 1, row.placeID - 1] = row.rating

  for row in testing_data.itertuples(index=False):
      self.test_ratings[row.userID - 1, row.placeID - 1] = row.rating

GroupRec.read_data = read_data

# Matrix Factorization :

Memfaktorkan matriks rating. Menggunakan gradient descent untuk meminimalkan kesalahan.

In [48]:
def sgd_factorize(self):
    #solve for these for matrix ratings
    ratings_row, ratings_col = self.ratings.nonzero()
    num_ratings = len(ratings_row)
    learning_rate = self.cfg.learning_rate_mf
    regularization = self.cfg.lambda_mf

    self.ratings_global_mean = np.mean(self.ratings[np.where(self.ratings != 0)])

    print('Doing matrix factorization...')
    try:
        for iter in range(self.cfg.max_iterations_mf):
            print('Iteration: ', iter)
            rating_indices = np.arange(num_ratings)
            np.random.shuffle(rating_indices)

            for idx in rating_indices:
                user = ratings_row[idx]
                item = ratings_col[idx]

                pred = self.predict_user_rating(user, item)
                error = self.ratings[user][item] - pred

                self.user_factors[user] += learning_rate \
                                            * ((error * self.item_factors[item]) - (regularization * self.user_factors[user]))
                self.item_factors[item] += learning_rate \
                                            * ((error * self.user_factors[user]) - (regularization * self.item_factors[item]))

                self.user_biases[user] += learning_rate * (error - regularization * self.user_biases[user])
                self.item_biases[item] += learning_rate * (error - regularization * self.item_biases[item])

            self.sgd_mse()

    except FloatingPointError:
        print('Floating point Error: ')
GroupRec.sgd_factorize = sgd_factorize


def sgd_mse(self):
    self.predict_all_ratings() # Membuat prediksi untuk semua peringkat 

    # Mengambil peringkat yang telah diprediksi dan rating aktual dari data pelatihan
    predicted_training_ratings = self.predictions[self.ratings.nonzero()].flatten() 
    # Mengambil peringkat yang telah diprediksi dan rating aktual dari data pengujian.
    actual_training_ratings = self.ratings[self.ratings.nonzero()].flatten() 

    predicted_test_ratings = self.predictions[self.test_ratings.nonzero()].flatten()
    actual_test_ratings = self.test_ratings[self.test_ratings.nonzero()].flatten()

    training_mse = mean_squared_error(predicted_training_ratings, actual_training_ratings)
    print('training mse: ', training_mse)
    test_mse = mean_squared_error(predicted_test_ratings, actual_test_ratings)
    print('test mse: ', test_mse)
GroupRec.sgd_mse = sgd_mse


def predict_user_rating(self, user, item): # menghitung prediksi peringkat untuk suatu pengguna pada suatu item
    prediction = self.ratings_global_mean + self.user_biases[user] + self.item_biases[item]
    prediction += self.user_factors[user, :].dot(self.item_factors[item, :].T)
    return prediction
GroupRec.predict_user_rating = predict_user_rating

def predict_group_rating(self, group, item, method): 
    if (method == 'af'):
        factors = group.grp_factors_af; bias_group = group.bias_af
    elif (method == 'bf'):
        factors = group.grp_factors_bf; bias_group = group.bias_bf
    elif (method == 'wbf'):
        factors = group.grp_factors_wbf; bias_group = group.bias_wbf

    return self.ratings_global_mean + bias_group + self.item_biases[item] \
                                    + np.dot(factors.T, self.item_factors[item])
GroupRec.predict_group_rating = predict_group_rating

def predict_all_ratings(self): # menghitung prediksi peringkat untuk suatu grup pada suatu item
    for user in range(self.num_users):
        for item in range(self.num_items):
            self.predictions[user, item] = self.predict_user_rating(user, item)
GroupRec.predict_all_ratings = predict_all_ratings


After Factorization (AF) Method Definition.....

In [49]:
#AF method
def af_runner(self, groups = None, aggregator = Aggregators.average):
    #if groups is not passed, use self.groups
    if (groups is None):
        groups = self.groups

    #calculate factors
    for group in groups:
        member_factors = self.user_factors[group.members, :]
        member_biases = self.user_biases[group.members]

        #aggregate the factors
        if (aggregator == Aggregators.average):
            group.grp_factors_af = aggregator(member_factors)
            group.bias_af = aggregator(member_biases)
        elif (aggregator == Aggregators.weighted_average):
            group.grp_factors_af = aggregator(member_factors, weights = group.ratings_per_member)
            group.bias_af = aggregator(member_biases, weights = group.ratings_per_member)

        #predict ratings for all candidate items
        group_candidate_ratings = {}
        for idx, item in enumerate(group.candidate_items):
            cur_rating = self.predict_group_rating(group, item, 'af')

            if (cur_rating > self.cfg.rating_threshold_af):
                group_candidate_ratings[item] = cur_rating

        #sort and filter to keep top 'num_recos_af' recommendations
        group_candidate_ratings = sorted(group_candidate_ratings.items(), key=lambda x: x[1], reverse=True)[:self.cfg.num_recos_af]

        group.reco_list_af = np.array([rating_tuple[0] for rating_tuple in group_candidate_ratings])

GroupRec.af_runner = af_runner

Before Factorization(BF) Method.....

In [50]:
def bf_runner(self, groups=None, aggregator=Aggregators.average_bf):
    
    # aggregate user ratings into virtual group
    # calculate factors of group
    lamb = self.cfg.lambda_mf

    for group in groups:
        all_attractions = np.arange(len(self.ratings.T))
        watched_items = sorted(list(set(all_attractions) - set(group.candidate_items)))

        group_rating = self.ratings[group.members, :]
        agg_rating = aggregator(group_rating)
        s_g = []
        for j in watched_items:
            s_g.append(agg_rating[j] - self.ratings_global_mean - self.item_biases[j])

        # creating matrix A : contains rows of [item_factors of items in watched_list + '1' vector]
        A = np.zeros((0, self.cfg.num_factors))

        for item in watched_items:
            A = np.vstack([A, self.item_factors[item]])
        v = np.ones((len(watched_items), 1))
        A = np.c_[A, v]

        factor_n_bias = np.dot(np.linalg.inv(np.dot(A.T, A) + lamb * np.identity(self.cfg.num_factors + 1)), np.dot(A.T, s_g))
        group.grp_factors_bf = factor_n_bias[:-1]
        group.bias_bf = factor_n_bias[-1]

        # Making recommendations on candidate list :
        group_candidate_ratings = {}
        for idx, item in enumerate(group.candidate_items):
            cur_rating = self.predict_group_rating(group, item, 'bf')

            if (cur_rating > self.cfg.rating_threshold_bf):
                group_candidate_ratings[item] = cur_rating

        # sort and filter to keep top 'num_recos_bf' recommendations
        group_candidate_ratings = sorted(group_candidate_ratings.items(), key=lambda x: x[1], reverse=True)[
                                  :self.cfg.num_recos_bf]

        group.reco_list_bf = np.array([rating_tuple[0] for rating_tuple in group_candidate_ratings])

GroupRec.bf_runner = bf_runner

Weighted Before Factorization Method (WBF).....

In [80]:
def wbf_runner(self, groups=None, aggregator=Aggregators.average_bf):
    # aggregate user ratings into virtual group
    # calculate factors of group
    lamb = self.cfg.lambda_mf
    for group in groups:
        all_attractions = np.arange(len(self.ratings.T))
        watched_items = sorted(list(set(all_attractions) - set(group.candidate_items)))

        group_rating = self.ratings[group.members, :]
        agg_rating = aggregator(group_rating)
        s_g = []
        for j in watched_items:
            s_g.append(agg_rating[j] - self.ratings_global_mean - self.item_biases[j])

        # creating matrix A : contains rows of [item_factors of items in watched_list + '1' vector]
        A = np.zeros((0, self.cfg.num_factors))  # 3 is the number of features here = K

        for item in watched_items:
            A = np.vstack([A, self.item_factors[item]])
        v = np.ones((len(watched_items), 1))
        A = np.c_[A, v]

        wt = []
        for item in watched_items:
            rated = np.argwhere(self.ratings[:, item] != 0)  # list of users who have rated this place
            watched = np.intersect1d(rated, group.members)  # list of group members who have watched this place
            std_dev = np.std(self.ratings[:, item][self.ratings[:, item] != 0])  # std deviation for the rating of the item
            wt += [len(watched) / float(len(group.members)) * 1 / (1 + std_dev)]  # list containing diagonal elements
        W = np.diag(wt)  # diagonal weight matrix

        factor_n_bias = np.dot(np.linalg.inv(np.dot(np.dot(A.T, W),A) + lamb * np.identity(self.cfg.num_factors + 1)),
                               np.dot(np.dot(A.T, W), s_g))
        group.grp_factors_wbf = factor_n_bias[:-1]
        group.bias_wbf = factor_n_bias[-1]

        # Making recommendations on candidate list :
        group_candidate_ratings = {}
        for idx, item in enumerate(group.candidate_items):
            cur_rating = self.predict_group_rating(group, item, 'wbf')

            if (cur_rating > self.cfg.rating_threshold_wbf):
                group_candidate_ratings[item] = cur_rating

        # sort and filter to keep top 'num_recos_wbf' recommendations
        group_candidate_ratings = sorted(group_candidate_ratings.items(), key=lambda x: x[1], reverse=True)[
                                  :self.cfg.num_recos_wbf]

        group.reco_list_wbf = np.array([rating_tuple[0] for rating_tuple in group_candidate_ratings])

GroupRec.wbf_runner = wbf_runner

Evaluating methods......

In [52]:
def evaluation(self):
    # For AF
    af_precision_list = []
    af_recall_list = []
    print("\n#########-------For AF-------#########")
    for grp in self.groups:
        grp.generate_actual_recommendations(self.test_ratings, self.cfg.rating_threshold_af)
        (precision, recall, tp, fp) = grp.evaluate_af()
        af_precision_list.append(precision)
        af_recall_list.append(recall)

    af_mean_precision = np.nanmean(np.array(af_precision_list))
    af_mean_recall = np.nanmean(np.array(af_recall_list))
    print('\nAF method: mean precision: ', af_mean_precision)
    print('AF method: mean recall: ', af_mean_recall)

    # For BF
    bf_precision_list = []
    bf_recall_list = []
    print("\n#########-------For BF-------#########")
    for grp in self.groups:
        grp.generate_actual_recommendations(self.test_ratings, self.cfg.rating_threshold_bf)
        (precision, recall, tp, fp) = grp.evaluate_bf()
        bf_precision_list.append(precision)
        bf_recall_list.append(recall)

    bf_mean_precision = np.nanmean(np.array(bf_precision_list))
    bf_mean_recall = np.nanmean(np.array(bf_recall_list))
    print('\nBF method: mean precision: ', bf_mean_precision)
    print('BF method: mean recall: ', bf_mean_recall)

    # For WBF
    wbf_precision_list = []
    wbf_recall_list = []
    print("\n#########-------For WBF-------#########")
    for grp in self.groups:
        grp.generate_actual_recommendations(self.test_ratings, self.cfg.rating_threshold_wbf)
        (precision, recall, tp, fp) = grp.evaluate_wbf()
        wbf_precision_list.append(precision)
        wbf_recall_list.append(recall)

    wbf_mean_precision = np.nanmean(np.array(wbf_precision_list))
    wbf_mean_recall = np.nanmean(np.array(wbf_recall_list))
    print('\nWBF method: mean precision: ', wbf_mean_precision)
    print('WBF method: mean recall: ', wbf_mean_recall)
GroupRec.evaluation = evaluation

Running all our proposed methods and evaluating them altogether.

In [53]:
def run_all_methods(self, groups):
    if (groups is None):
        groups = self.groups
    #PS: could call them without passing groups as we have already added groups to grouprec object
    self.af_runner(groups, Aggregators.weighted_average)
    self.bf_runner(groups, Aggregators.average_bf)
    self.wbf_runner(groups, Aggregators.average_bf)

    #evaluation
    self.evaluation()
GroupRec.run_all_methods = run_all_methods

# MAIN PROGRAM

Pertama, faktorisasi matriks dengan metode Stochastic Gradient Descent (SGD). Jumlah iterasi diambil dari config dan MSE selama iterasi dilaporkan. Di sini hanya melakukan 3 iterasi dalam demo ini sehingga mse(error) lebih tinggi. Untuk hasil, saya telah melakukan lebih banyak iterasi untuk mendapatkan mse yang lebih rendah.

In [77]:
gr = GroupRec()
print(gr.cfg.max_iterations_mf)
gr.sgd_factorize()

Reading training data from...
Reading testing data...
3
Doing matrix factorization...
Iteration:  0
training mse:  0.6808502897310343
test mse:  1.5011080420873044
Iteration:  1
training mse:  0.4594587417300548
test mse:  1.3508815699939087
Iteration:  2
training mse:  0.3580973471489492
test mse:  1.2975625384201053


Generate small, medium and large groups.

In [78]:
#generate groups programmatically
#disjoint means none of the groups shares any common members

small_groups = Group.generate_groups(gr.cfg, gr.ratings, gr.test_ratings, gr.num_users, 3, gr.cfg.small_grp_size, disjoint=True)
medium_groups = Group.generate_groups(gr.cfg, gr.ratings, gr.test_ratings, gr.num_users, 3, gr.cfg.medium_grp_size, disjoint=True)
large_groups = Group.generate_groups(gr.cfg, gr.ratings, gr.test_ratings, gr.num_users, 3, gr.cfg.large_grp_size, disjoint=True)

group_set = [small_groups, medium_groups, large_groups]
group_type = ['small', 'medium', 'large']

for idx, groups in enumerate(group_set):
    if groups is []:
        continue

    # generated groups
    n = 3
    print('\n******* Running for ', group_type[idx], ' groups *************')
    print('generated groups (only first %d are getting printed here): ' % n)
    for group in groups[:n]:
        print(group.members)


******* Running for  small  groups *************
generated groups (only first 3 are getting printed here): 
[1, 34, 56]
[17, 23, 65]
[7, 13, 15]

******* Running for  medium  groups *************
generated groups (only first 3 are getting printed here): 
[7, 23, 32, 69, 73]
[4, 43, 67, 68, 70]
[6, 10, 11, 28, 54]

******* Running for  large  groups *************
generated groups (only first 3 are getting printed here): 
[3, 10, 15, 32, 40, 43, 45, 48, 63, 67]
[0, 11, 23, 24, 25, 34, 35, 36, 54, 65]
[2, 4, 33, 39, 44, 46, 60, 62, 69, 70]


Run all the methods (AF, BF and WBF) for all the 3 group sizes and report the results:

In [83]:
for idx, groups in enumerate(group_set):
    if groups is []:
        continue
    print('\n******* Running for ', group_type[idx], ' groups *************')
    gr.add_groups(groups)
    for i in groups:
        print(i.ratings_per_member)
    gr.run_all_methods(groups)
    gr.remove_groups(groups)


******* Running for  small  groups *************
[22, 20, 19]
[22, 17, 21]
[20, 21, 21]

#########-------For AF-------#########

AF method: mean precision:  0.8425925925925926
AF method: mean recall:  0.2981366459627329

#########-------For BF-------#########

BF method: mean precision:  0.8425925925925926
BF method: mean recall:  0.32850241545893716

#########-------For WBF-------#########

WBF method: mean precision:  0.8333333333333334
WBF method: mean recall:  0.31262939958592134

******* Running for  medium  groups *************
[20, 17, 19, 22, 35]
[22, 23, 21, 23, 18]
[21, 20, 16, 22, 18]

#########-------For AF-------#########

AF method: mean precision:  0.7071428571428572
AF method: mean recall:  0.15642071437399577

#########-------For BF-------#########

BF method: mean precision:  0.5992063492063492
BF method: mean recall:  0.11230997404523546

#########-------For WBF-------#########

WBF method: mean precision:  0.6944444444444445
WBF method: mean recall:  0.167531825485