In [3]:
from collections import defaultdict
import pandas as pd
from tqdm.auto import tqdm
from tqdm.contrib import tzip
from numba import jit
import multiprocessing as mp
import numpy as np

In [4]:
class DiGraph:
    """Minimal directed graph: a set of nodes plus an ordered list of edges.

    Each edge is stored as ``[u, v, attrs]`` where ``attrs`` is the dict of
    keyword arguments given to :meth:`add_edge`. Node ids are used directly
    as row/column indices by the ``to_*array`` methods.
    """

    def __init__(self):
        self.nodes = set()
        self.edges = []

    def add_edge(self, u, v, **kwargs):
        """Append the edge ``u -> v`` with its attributes and register both endpoints."""
        self.edges.append([u, v, kwargs])
        self.nodes.update((u, v))

    def size(self):
        """Return the number of distinct nodes seen so far."""
        return len(self.nodes)

    def to_bin_array(self):
        """Return a float32 adjacency matrix with 1 at ``[u, v]`` for every edge.

        The matrix side equals the number of nodes.
        # NOTE(review): assumes node ids are exactly 0..n-1 - confirm against the data.
        """
        dim = len(self.nodes)
        adjacency = np.zeros((dim, dim), np.float32)
        for src, dst, _attrs in self.edges:
            adjacency[src, dst] = 1
        return adjacency

    def to_array(self):
        """Return a float32 matrix of edge weights.

        For an edge with attributes ``x1, x2, x3, t`` the stored weight is
        ``log(1 + x1 + 0.5 * x2 + 0.5 * x3) + 1 / log10(max(0, t) + 10)``.
        If the same ``(u, v)`` pair occurs more than once, the last edge wins.
        """
        dim = len(self.nodes)
        weights = np.zeros((dim, dim), np.float32)
        for src, dst, attrs in self.edges:
            x1, x2, x3, t = attrs['x1'], attrs['x2'], attrs['x3'], attrs['t']
            # Negative t is clamped to 0 before the log10.
            p = np.log10(max(0, t) + 10)
            weights[src, dst] = np.log(1 + x1 + x2 * 0.5 + x3 * 0.5) + (1 / p)
        return weights

def fast_to_array(ego_net):
    """Build a float32 weight matrix from an ego network's edges.

    ``ego_net`` must expose ``size()`` (number of nodes) and ``edges``
    (an iterable of ``[u, v, attrs]`` entries, as stored by ``DiGraph``).
    The weight written at ``[u, v]`` is ``log(1 + x1 + 0.5 * x2 + 0.5 * x3)``.

    The previous version iterated ``ego_net.ego_net``, an attribute the graph
    class does not have, and was wrapped in ``@jit(nopython=True)``. Numba's
    nopython mode cannot type an arbitrary Python object or a dict of keyword
    arguments, so that version could not run; this is plain NumPy instead.
    """
    sh = ego_net.size()
    g = np.zeros((sh, sh), np.float32)
    for u, v, attrs in ego_net.edges:
        g[u, v] = np.log(1 + attrs['x1'] + attrs['x2'] * 0.5 + attrs['x3'] * 0.5)
    return g

In [5]:
def read_label(label_path):
    """Yield ``(ego_id, [(u, v), ...])`` groups from a label CSV.

    The first line is skipped as a header. Every following line must hold
    three comma-separated integers ``ego_id,u,v``. Consecutive lines sharing
    an ``ego_id`` are collected into one list, which is emitted when the id
    changes or the file ends. ``-1`` serves as the "no group yet" marker, so
    a group whose id is ``-1`` is never emitted.
    """
    with open(label_path, 'r') as label_f:
        label_f.readline()  # discard the header row
        active_id = -1
        pairs = None
        for row in label_f:
            ego_id, u, v = (int(token) for token in row.split(","))
            if ego_id != active_id:
                # The id changed: flush the finished group before starting a new one.
                if active_id != -1:
                    yield active_id, pairs
                active_id = ego_id
                pairs = []
            pairs.append((u, v))
        if active_id != -1 and pairs:
            yield active_id, pairs

In [6]:
def read_ego_net(ego_net_path):
    """Yield ``(ego_id, DiGraph)`` pairs from an ego-network CSV.

    The first line is skipped as a header. Every data line must contain at
    least seven comma-separated fields ``ego_id,u,v,t,x1,x2,x3`` (the first
    four parsed as int, the last three as float). Consecutive lines with the
    same ``ego_id`` are added as edges to one graph, which is emitted when the
    id changes or the file ends.

    Blank lines are ignored, and a file without data rows yields nothing
    (previously this raised ``AttributeError`` on the ``None`` graph).

    Raises:
        AssertionError: if ego ids are not in non-decreasing order.
    """
    cur_ego_id = -1
    cur_ego_net = None  # None means "no group started yet"
    with open(ego_net_path, 'r') as ego_net_f:
        ego_net_f.readline()  # discard the header row
        for ego_line in ego_net_f:
            if not ego_line.strip():
                continue
            fields = ego_line.split(',')
            ego_id, u, v, t = int(fields[0]), int(fields[1]), int(fields[2]), int(fields[3])
            x1, x2, x3 = float(fields[4]), float(fields[5]), float(fields[6])
            if cur_ego_net is None or ego_id != cur_ego_id:
                if cur_ego_net is not None:
                    yield cur_ego_id, cur_ego_net
                assert cur_ego_id <= ego_id, 'ego ids must be sorted in ascending order'
                cur_ego_id = ego_id
                cur_ego_net = DiGraph()
            cur_ego_net.add_edge(u, v, t=t, x1=x1, x2=x2, x3=x3)
        # Flush the last group, if any data row was read at all.
        if cur_ego_net is not None and cur_ego_net.size() > 0:
            yield cur_ego_id, cur_ego_net

In [7]:
def recommend(chunk):
    """Return up to five recommended node pairs for one ego network.

    NOTE: a later cell in this notebook defines another ``recommend``; once
    that cell runs, it replaces this definition.

    Args:
        chunk: ``(ego_id, ego_net)`` where ``ego_net`` provides ``to_array()``
            and ``to_bin_array()``.

    Returns:
        ``(recs, ego_id)`` where ``recs`` is a list of at most five distinct
        ``(u, v)`` pairs with ``0 < u < v``, ordered from highest score down.
    """
    ego_id, ego_net = chunk
    weighted = ego_net.to_array()
    binary = ego_net.to_bin_array()

    # Remove every edge touching node 0, then symmetrise each matrix in place.
    for matrix in (weighted, binary):
        matrix[0, :] = 0
        matrix[:, 0] = 0
        matrix += matrix.T

    # Pairwise score from shared weighted neighbours; the two factors multiply
    # the diagonal and already-connected pairs by a negative constant.
    scores = weighted.dot(weighted.T) * (1 - 200 * np.eye(len(binary))) * (1 - 200 * binary)

    side = len(weighted)
    picked = []
    for flat_index in scores.flatten().argsort()[::-1]:
        row, col = flat_index // side, flat_index % side
        u, v = min(row, col), max(row, col)
        if u < v and u != 0 and (u, v) not in picked:
            picked.append((u, v))
            if len(picked) == 5:
                break
    return picked, ego_id

In [8]:
def recommend(chunk):
    """Return up to five recommended node pairs for one ego network.

    This definition replaces the ``recommend`` defined in the earlier cell.

    Args:
        chunk: ``(ego_id, ego_net)`` where ``ego_net`` provides ``edges``
            (entries ``[u, v, attrs]`` with keys ``t, x1, x2, x3``) and
            ``to_bin_array()``.

    Returns:
        ``(recs, ego_id)`` where ``recs`` is a list of at most five distinct
        ``(u, v)`` pairs with ``0 < u < v``, ordered from highest score down.
    """
    ego_id, ego_net = chunk

    adjacency = ego_net.to_bin_array()
    # Drop every edge touching node 0 before building the weighted matrices.
    adjacency[0, :] = 0
    adjacency[:, 0] = 0

    # Two symmetrised copies; they differ only in the time scale (25 vs 18)
    # applied per edge in the loop below.
    w25 = (adjacency + adjacency.T) / 2
    w18 = (adjacency + adjacency.T) / 2
    feat_x1 = np.zeros_like(w25)
    feat_x2 = np.zeros_like(w25)
    feat_x3 = np.zeros_like(w25)

    for edge in ego_net.edges:
        src, dst, attrs = edge[0], edge[1], edge[2]
        x1, x2, x3 = attrs["x1"], attrs["x2"], attrs["x3"]
        feat_x1[src, dst] = x1
        feat_x2[src, dst] = x2
        feat_x3[src, dst] = x3
        # t == -1 is swapped for a very large value, which makes the 1/log term tiny.
        # NOTE(review): presumably -1 marks an unknown timestamp - confirm.
        t = 10000000 if attrs["t"] == -1 else attrs["t"]
        w25[src, dst] *= (
            1 / np.log(((t / 25) ** 2) + 2)
            + np.log(x1 + 1) * 0.2
            + np.log(x2 + 1) * 0.2
            + np.log(x3 + 1) * 0.5
        )
        w18[src, dst] *= (
            1 / np.log(((t / 18) ** 2) + 2)
            + np.log(x1 + 1) * 0.2
            + np.log(x2 + 1) * 0.2
            + np.log(x3 + 1) * 0.5
        )

    out_degree = adjacency.sum(axis=1).reshape((-1, 1))

    # Factors that multiply the diagonal and already-connected pairs by a
    # negative constant. Both are applied left-to-right, as in every product below.
    eye_penalty = 1 - 100 * np.eye(len(w18))
    edge_penalty = 1 - 100 * w25

    feature_scores = [
        feat.T.dot(w25) * eye_penalty * edge_penalty
        for feat in (feat_x1, feat_x2, feat_x3)
    ]

    # Row-wise damping by 1 + log(1 + out-degree).
    damping = 1 + np.log(1 + out_degree)
    w25_norm = w25 / damping
    w18_norm = w18 / damping

    aa = w25_norm.T.dot(w25_norm) * eye_penalty * edge_penalty
    aa += w18_norm.T.dot(w18_norm) * eye_penalty * edge_penalty

    # Add the symmetrised per-feature scores with their mixing coefficients.
    for feature_score, coefficient in zip(feature_scores, (0.05, 0.01, 0.05)):
        aa += np.mean([feature_score, feature_score.T], axis=0) * coefficient

    aa = np.mean([aa, aa.T], axis=0)

    side = len(adjacency)
    picked = []
    for flat_index in aa.flatten().argsort()[::-1]:
        row, col = flat_index // side, flat_index % side
        u, v = min(row, col), max(row, col)
        if u < v and u != 0 and (u, v) not in picked:
            picked.append((u, v))
            if len(picked) == 5:
                break
    return picked, ego_id

In [None]:
def main(ego_net_path='/kaggle/input/vkcup-final/final/data/ego_net_te.csv',
         out_path='./submissionVC_kuz02.csv',
         total=20572):
    """Compute recommendations for every ego network and write a submission CSV.

    Args:
        ego_net_path: CSV file read by ``read_ego_net``.
        out_path: destination CSV; written with the header ``ego_id,u,v``.
        total: expected number of ego networks, used only to size the
            progress bar.

    The worker pool is used as a context manager so its processes are
    released even if a worker raises. The output file is opened only after
    all recommendations have been computed, so a failed run does not truncate
    an existing submission.
    """
    iterator = read_ego_net(ego_net_path)
    with mp.Pool(mp.cpu_count()) as pool:
        # pool.map blocks until every result is available, so leaving the
        # ``with`` block (which terminates the pool) is safe afterwards.
        recs_batch = pool.map(recommend, tqdm(iterator, total=total))

    with open(out_path, 'w') as out:
        out.write('ego_id,u,v\n')
        for recs, ego_id in tqdm(recs_batch):
            for (u, v) in recs:
                out.write('{},{},{}\n'.format(ego_id, u, v))

# Entry point: run the full pipeline when this code is executed directly.
if __name__ == '__main__':
    main()

  0%|          | 0/20572 [00:00<?, ?it/s]