In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

%load_ext cython

import itertools
import os
import os.path

import lightgbm
import matplotlib.pyplot as plt
import numpy as np
# see http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_svmlight_file.html
from sklearn.datasets import load_svmlight_file


## **CONVERT AND LOAD DATA**

In [3]:
# Mount Google Drive at /content/drive/ so that the dataset folder configured
# below (which lives under /content/drive/MyDrive/) can be read.
# NOTE(review): `google.colab` is expected to exist only inside a Colab
# runtime, so this cell presumably fails elsewhere; confirm before reuse.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [9]:
# Folder on the mounted Drive that holds train.txt, vali.txt and test.txt
# (the names retrieveFileNames() looks for). No trailing slash is required.
DATASET_FOLDER = "/content/drive/MyDrive/PROYEK STBI/folder1"
# "perms/" sub-folder of the dataset folder, kept with a trailing separator.
# os.path.join inserts the missing separator: plain string concatenation
# produced ".../folder1perms/" because DATASET_FOLDER does not end in "/".
# NOTE(review): presumably where permutation files are stored; confirm
# against the cells that use PERM_FOLDER.
PERM_FOLDER = os.path.join(DATASET_FOLDER, "perms", "")
# Metric identifier (NDCG at cutoff 10); its consumers are outside this section.
METRIC_NAME = 'ndcg@10'

In [5]:
def ensureFile(path):
    """Return ``path`` unchanged if it names an existing file.

    Args:
        path: file system path given as a string.

    Returns:
        The same ``path`` that was passed in.

    Raises:
        FileNotFoundError: if ``path`` does not exist or is not a file
            (for example, it is a directory).
    """
    # Happy path first: the path must both exist and be a file.
    if os.path.exists(path) and os.path.isfile(path):
        return path
    raise FileNotFoundError("'" + path + "': no such file")

def retrieveFileNames():
    """Locate the train, validation and test files inside ``DATASET_FOLDER``.

    Returns:
        Tuple ``(train_file, valid_file, test_file)`` with the paths of
        ``train.txt``, ``vali.txt`` and ``test.txt``.

    Raises:
        FileNotFoundError: propagated from ``ensureFile`` for the first of
            the three files that is missing.
    """
    # Accept the folder with or without a trailing slash.
    folder = DATASET_FOLDER if DATASET_FOLDER.endswith('/') else DATASET_FOLDER + '/'
    return tuple(
        ensureFile(folder + file_name)
        for file_name in ("train.txt", "vali.txt", "test.txt")
    )

def loadDataset(path):
    """Read an svmlight/libsvm ranking file, including its query ids.

    Args:
        path: location of the file, passed straight to ``load_svmlight_file``.

    Returns:
        Tuple ``(features, labels, query_ids)`` as produced by
        ``load_svmlight_file(path, query_id=True)``.
    """
    features, labels, query_ids = load_svmlight_file(path, query_id=True)
    return features, labels, query_ids

def loadLightGBM(svmlight_dataset):
    """Wrap a loaded svmlight dataset in a ``lightgbm.Dataset`` with query groups.

    Args:
        svmlight_dataset: indexable ``(features, labels, query_ids)``, the
            layout returned by ``loadDataset``.

    Returns:
        ``lightgbm.Dataset`` built from the features (``data``), the labels
        (``label``) and the per-query row counts (``group``).
    """
    # One entry per run of *consecutive* equal query ids, in row order. A
    # query id that re-appears later starts a new group, so rows of the same
    # query are expected to be contiguous in the file.
    # Requires `itertools` to be imported at module level.
    query_lens = [
        sum(1 for _ in run)
        for _, run in itertools.groupby(svmlight_dataset[2])
    ]
    return lightgbm.Dataset(
        data=svmlight_dataset[0],
        label=svmlight_dataset[1],
        group=query_lens,
    )

In [6]:
class Query:
    """Documents of one query, grouped by relevance label.

    Lifecycle: create the object with a query id, register documents with
    ``addlabel`` / ``adddoc``, then call ``finalize`` once. ``finalize``
    replaces the temporary ``labels_to_docs`` mapping with NumPy arrays and
    deletes the mapping, so documents cannot be added afterwards.

    Attributes set by ``finalize``:
        labels: int array of the distinct labels, highest first.
        docs: object array; ``docs[i]`` is the int array of the documents
            whose label is ``labels[i]``.
        alldocs: int array with all documents, concatenated in that order.
        flatlabels: double array holding the label of each ``alldocs`` entry.
        idealdcg: result of ``dcg_k`` on ``alldocs`` (documents ordered by
            decreasing label).

    Attributes set on demand: ``perms`` (``setperms``) and ``ndcgs``
    (``setndcgs``).
    """

    def __init__(self, qid):
        self.qid = qid
        # label -> list of documents carrying that label; removed by finalize().
        self.labels_to_docs = {}

    def addlabel(self, label):
        """Register ``label`` with an empty document list unless already known."""
        self.labels_to_docs.setdefault(label, [])

    def adddoc(self, label, doc):
        """Append ``doc`` to the documents of ``label``.

        The label is registered on the fly, so calling ``addlabel`` first is
        optional.
        """
        self.labels_to_docs.setdefault(label, []).append(doc)

    def finalize(self, alllabels, k=10):
        """Freeze the collected documents into arrays and compute the ideal DCG.

        Args:
            alllabels: label array forwarded to ``dcg_k`` together with the
                document values. NOTE(review): presumably indexed by those
                document values; confirm against ``dcg_k``.
            k: DCG cutoff; it is capped at the number of documents of this
                query. Defaults to 10.

        Relies on ``dcg_k(docs, labels, k)``, which is defined outside this
        class and must be available when this method runs.
        """
        # Highest label first, so the concatenated documents are in ideal order.
        by_label = sorted(self.labels_to_docs.items(), reverse=True)

        self.labels = np.zeros(len(by_label), dtype=int)
        # Object array because each label may own a different number of docs.
        self.docs = np.empty(len(by_label), dtype=object)
        for i, (label, docs) in enumerate(by_label):
            self.labels[i] = label
            self.docs[i] = np.array(docs, dtype=int)

        self.alldocs = np.concatenate(self.docs)
        # Built from the original label values (not the int-cast `labels`)
        # so that non-integer labels are kept as they are.
        self.flatlabels = np.repeat(
            np.array([label for label, _ in by_label], dtype=np.double),
            [len(docs) for _, docs in by_label],
        )

        cutoff = min(k, len(self.alldocs))
        self.idealdcg = dcg_k(self.alldocs, alllabels, cutoff)
        del self.labels_to_docs

    def setperms(self, perms):
        """Store the permutations computed for this query."""
        self.perms = perms

    def setndcgs(self, ndcgs):
        """Store one score per stored permutation (same order as ``perms``)."""
        self.ndcgs = ndcgs

    def __repr__(self):
        return str(self)

    def __str__(self):
        """Readable dump of the query; safe to call in any lifecycle state."""
        if not hasattr(self, 'idealdcg'):
            # finalize() has not completed yet: the arrays do not exist.
            return "Query " + str(self.qid) + "[not finalized]"

        lines = ["Query " + str(self.qid) + "[", "ideal dcg: " + str(self.idealdcg)]
        lines.extend(
            str(label) + " -> " + str(docs)
            for label, docs in zip(self.labels, self.docs)
        )
        res = "\n".join(lines) + "]"

        perms = getattr(self, 'perms', None)
        if perms is None:
            return res + "\nNo permutations computed yet"

        # Permutations may be stored before their scores are.
        ndcgs = getattr(self, 'ndcgs', None)
        for i in range(len(perms)):
            score = str(ndcgs[i]) if ndcgs is not None else "not computed"
            res += "\n[" + str(perms[i]) + "] -> dcg: " + score
        return res

In [8]:
def mapQueryToDocuments(dataset):
    """Group the rows of a loaded dataset into one finalized ``Query`` per query id.

    Args:
        dataset: indexable whose item 1 holds the per-row labels and item 2
            the per-row query ids (the layout returned by ``loadDataset``).

    Returns:
        Tuple ``(queries, alllabels)``: ``queries`` maps each query id to its
        ``Query`` (in order of first appearance), with the row index used as
        the document value; ``alllabels`` is a double array with the label of
        every row.
    """
    row_qids = dataset[2]
    n_rows = len(row_qids)
    # Starts at -1 everywhere; every slot is overwritten with its row label below.
    alllabels = np.full(n_rows, -1.0, dtype=np.double)

    queries = {}
    for row in range(n_rows):
        qid = row_qids[row]
        if qid not in queries:
            queries[qid] = Query(qid)
        current = queries[qid]
        label = dataset[1][row]
        current.addlabel(label)
        current.adddoc(label, row)
        alllabels[row] = label

    # Finalize only after the loop, once alllabels is completely filled.
    for current in queries.values():
        current.finalize(alllabels)

    return queries, alllabels