In [1]:
import sys
import os
import torch
import cornac
import papermill as pm
import scrapbook as sb
import pandas as pd
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking,predict
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED

# Report the interpreter and key library versions for reproducibility.
print(f"System version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"Cornac version: {cornac.__version__}")

System version: 3.8.3 (default, Jul  2 2020, 16:21:59) 
[GCC 7.3.0]
PyTorch version: 1.13.0+cu117
Cornac version: 1.14.2


In [2]:
# Number of top-ranked items to recommend per user.
TOP_K = 10

# Model hyper-parameters (consumed by the cornac.models.BiVAECF constructor).
LATENT_DIM = 50
ENCODER_DIMS = [100]
ACT_FUNC = "tanh"
LIKELIHOOD = "pois"
NUM_EPOCHS = 500
BATCH_SIZE = 128
LEARNING_RATE = 0.001

# Required columns, listed in (user, item, rating) order.
usecols = ["userID", "itemID", "rating"]
# read_csv ignores the order of `usecols` and keeps the file's own column order.
# The frame is later consumed positionally via itertuples(), so re-select with
# [usecols] to guarantee the (user, item, rating) layout regardless of the file.
data = pd.read_csv('preprocessed_train.csv', usecols=usecols)[usecols]

In [3]:
# Preview the first five rows of the loaded interactions.
data.head(n=5)

Unnamed: 0,userID,itemID,rating
0,5bdexxxxxx,5f19trnytn6f3ee24cf,100
1,5fedf9ssdddebynuiuuiuiue,5bfrtntyte0020e4b0e4dffv,100
2,5fedf958artrtgrghcfdfdrt,5fc4a3trynytya03cc0d4555,98
3,6rerer58dmtyiuocui458hje,5560ddergsfthby64ww9apoo,96
4,5fedf958afrbyc8rt456362c,5fd9b1ce0bytyuyj5ny28d5b,94


In [4]:
# Build the Cornac dataset from the rows of `data`, taken as positional tuples.
train_set = cornac.data.Dataset.from_uir(
    data.itertuples(index=False),
    seed=SEED,
)

print(f"Number of users: {train_set.num_users}")
print(f"Number of items: {train_set.num_items}")

Number of users: 59737
Number of items: 664


In [5]:
# Instantiate the BiVAE collaborative-filtering model from the constants above.
bivae = cornac.models.BiVAECF(
    # Architecture
    k=LATENT_DIM,
    encoder_structure=ENCODER_DIMS,
    act_fn=ACT_FUNC,
    likelihood=LIKELIHOOD,
    # Optimisation
    n_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    # Runtime: use the GPU only when PyTorch reports one is available.
    use_gpu=torch.cuda.is_available(),
    seed=SEED,
    verbose=True,
)

In [None]:
# Fit the model on the training set and report the elapsed wall-clock time.
with Timer() as train_timer:
    bivae.fit(train_set)
print(f"Took {train_timer} seconds for training.")

In [None]:
# Persist the model, using the current directory as the save location.
bivae.save(save_dir="./")

In [6]:
# `load` returns the restored model (this cell's output shows a BiVAECF
# instance), so bind the result; otherwise the restored model is thrown away
# and `bivae` stays whatever instance it was before the call.
# NOTE(review): assumes `load` does not populate the receiver in place —
# confirm against the installed Cornac version.
bivae = bivae.load("./BiVAECF")
bivae

<cornac.models.bivaecf.recom_bivaecf.BiVAECF at 0x7fc6780601c0>

In [7]:
import pickle

# Restore the `bivae.bivae` attribute from its standalone pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("bivae-bivae.pickle", "rb") as network_file:
    bivae.bivae = pickle.load(network_file)

In [8]:
# Re-attach the pickled training set to the model as `bivae.train_set`.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("bivae-train_set.pickle", "rb") as train_set_file:
    bivae.train_set = pickle.load(train_set_file)

In [9]:
# Run predict_ranking over `data` with the model and time the call.
with Timer() as predict_timer:
    all_predictions = predict_ranking(
        bivae,
        data,
        usercol='userID',
        itemcol='itemID',
    )
print(f"Took {predict_timer} seconds for prediction.")

Took 6.4028 seconds for prediction.


In [10]:
# End the cell with the frame itself so the notebook renders its rich
# (truncated) table view instead of a plain-text dump.
all_predictions

                            userID                    itemID  prediction
0                         starburst                   sword    0.006866
1                         starburst                   sao      0.002969
2                         starburst                   kirito   0.003349
3                         starburst                   asuna    0.001438
4                         starburst                   online   0.013209
...                            ...                       ...         ...
39665363                  starburststream             48763    0.000090
39665364                  starburststream             36784    0.000053
39665365                  starburststream             487634s  0.000051
39665366                  starburststream             beater   0.000059
39665367                  starburststream             487sss3  0.000060

[39665368 rows x 3 columns]


In [13]:
import csv
def find10class(courseID, prediction, k=10):
    """Return the IDs of the ``k`` highest-scored courses, best first.

    Args:
        courseID: Sequence of course IDs.
        prediction: Sequence of scores aligned index-by-index with
            ``courseID``. It is left unmodified.
        k: Maximum number of IDs to return (default 10).

    Returns:
        A list of at most ``k`` course IDs ordered by descending score. Ties
        keep their input order. The list is shorter than ``k`` when fewer
        scores are supplied, and empty for empty input.
    """
    # sorted() stays stable with reverse=True, so equal scores keep their
    # original relative order, the same tie-break as repeatedly picking the
    # first maximum. Ranking indices avoids overwriting scores with a sentinel,
    # which mutated the caller's list and misbehaved for scores <= -1.
    ranked = sorted(range(len(prediction)), key=prediction.__getitem__, reverse=True)
    return [courseID[idx] for idx in ranked[:max(k, 0)]]

def append_csv(userID, Best10):
    """Append one ``userID, course-IDs`` row to ``./predict10Total.csv``.

    Args:
        userID: Value written to the first column.
        Best10: Sequence of course IDs. At most the first 10 are written;
            shorter sequences are written as-is instead of raising IndexError.

    The second column holds the IDs concatenated with two spaces after each
    one (including the last). The file is opened in append mode, so calling
    this again adds further rows.
    """
    id_str = "".join(f"{course}  " for course in Best10[:10])
    with open(r'./predict10Total.csv', mode='a', newline='', encoding='utf8') as cfa:
        csv.writer(cfa).writerow([userID, id_str])



# Walk all_predictions once, collecting each consecutive run of rows that share
# a userID, and write that user's top courses when the run ends.
# Print progress only every PROGRESS_EVERY rows (100,000 = "10萬"); the original
# author noted that printing on every row ran out of memory. Adjust as needed.
PROGRESS_EVERY = 100000

currentID = None
courseID = []
prediction = []

# Iterate the columns positionally: this does not depend on the frame's index
# starting at 0 or on a hard-coded row count, and avoids a label lookup per row.
rows = zip(
    all_predictions["userID"],
    all_predictions["itemID"],
    all_predictions["prediction"],
)
for row_number, (userID, itemID, score) in enumerate(rows, start=1):
    if courseID and userID != currentID:
        # A new user starts here: flush the finished user's rows.
        append_csv(currentID, find10class(courseID, prediction))
        courseID = []
        prediction = []
    currentID = userID

    courseID.append(itemID)
    prediction.append(score)  # predicted rating

    if row_number % PROGRESS_EVERY == 0:
        print(f"time = {row_number // PROGRESS_EVERY} * 10萬")

# Flush the final user, whose run ends with the data rather than with a change
# of userID (previously this user was never written).
if courseID:
    append_csv(currentID, find10class(courseID, prediction))

time = 1 * 10萬
time = 2 * 10萬
time = 3 * 10萬
time = 4 * 10萬
time = 5 * 10萬
time = 6 * 10萬
time = 7 * 10萬
time = 8 * 10萬
time = 9 * 10萬
time = 10 * 10萬
time = 11 * 10萬
time = 12 * 10萬
time = 13 * 10萬
time = 14 * 10萬
time = 15 * 10萬
time = 16 * 10萬
time = 17 * 10萬
time = 18 * 10萬
time = 19 * 10萬
time = 20 * 10萬
time = 21 * 10萬
time = 22 * 10萬
time = 23 * 10萬
time = 24 * 10萬
time = 25 * 10萬
time = 26 * 10萬
time = 27 * 10萬
time = 28 * 10萬
time = 29 * 10萬
time = 30 * 10萬
time = 31 * 10萬
time = 32 * 10萬
time = 33 * 10萬
time = 34 * 10萬
time = 35 * 10萬
time = 36 * 10萬
time = 37 * 10萬
time = 38 * 10萬
time = 39 * 10萬
time = 40 * 10萬
time = 41 * 10萬
time = 42 * 10萬
time = 43 * 10萬
time = 44 * 10萬
time = 45 * 10萬
time = 46 * 10萬
time = 47 * 10萬
time = 48 * 10萬
time = 49 * 10萬
time = 50 * 10萬
time = 51 * 10萬
time = 52 * 10萬
time = 53 * 10萬
time = 54 * 10萬
time = 55 * 10萬
time = 56 * 10萬
time = 57 * 10萬
time = 58 * 10萬
time = 59 * 10萬
time = 60 * 10萬
time = 61 * 10萬
time = 62 * 10萬
time = 63 * 10萬
t

In [None]:
# The cell above predicts for every user in the training data; this cell picks
# out the users listed in test_seen and writes them to a CSV so the result can
# be submitted to Kaggle.

def _read_user_courses(path, encoding=None):
    """Read a two-column ``user, courses`` CSV, skipping its first row.

    Args:
        path: CSV file to read.
        encoding: Text encoding passed to ``open`` (``None`` = platform default).

    Returns:
        ``(names, courses)``. ``names`` lists the first column of every data
        row in file order (duplicates kept). ``courses`` maps each name to the
        whitespace-split tokens of its second column.
    """
    names = []
    courses = {}
    with open(path, newline='', encoding=encoding) as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # the first row is treated as a header and dropped
        for row in reader:
            if not row:
                continue  # tolerate blank lines instead of raising IndexError
            names.append(row[0])
            courses[row[0]] = row[1].split() if len(row) > 1 else []
    return names, courses


seen_name, predict_test = _read_user_courses('./data/test_seen.csv')

# NOTE(review): the first row is skipped as a header here as well, but
# append_csv only ever appends data rows. Confirm predict10Total.csv really
# starts with a header line; otherwise the first user is silently dropped.
# Read as utf8 to match the encoding append_csv writes with.
total_name, predict_total = _read_user_courses('./predict10Total.csv', encoding='utf8')

with open('./predict_test_seen.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['user_id', 'course_id'])

    for name in seen_name:
        # Dict membership is O(1); scanning the total_name list for every user
        # was quadratic overall.
        if name in predict_total:
            # Each course ID is followed by two spaces, including the last one.
            course = "".join(f"{course_id}  " for course_id in predict_total[name])
            writer.writerow([name, course])
        else:
            print("none")