In [111]:
import numpy as np
import pandas as pd
import torch
import re
from sentence_transformers import SentenceTransformer

In [112]:
# Read the exported CSV in one pass (low_memory=False).
data_path = './datasets/recommendation.csv'
raw_data = pd.read_csv(data_path, low_memory=False)

# Rows whose "_start" is empty become vertex rows; all other rows become edge
# rows. Columns that are entirely empty within each subset are dropped.
is_vertex = raw_data["_start"].isna()
vertex_data = raw_data[is_vertex].dropna(axis=1, how="all")
edge_data = raw_data[~is_vertex].dropna(axis=1, how="all")

# Preview how "countries" splits on runs of non-word characters.
vertex_data["countries"].str.split(r'\W+')

0        [, USA, ]
1              NaN
2              NaN
3              NaN
4              NaN
           ...    
28858          NaN
28859          NaN
28860          NaN
28861          NaN
28862          NaN
Name: countries, Length: 28863, dtype: object

In [161]:
from enum import Enum
class ColType(Enum):
    """Tag that selects which encoder a column is routed to."""

    NUMERICAL = 0      # used with NumericalEncoder
    LARGE_NUMBER = 1   # used with LargeNumberEncoder
    NUMBER_TO_BIN = 2  # used with BinEncoder
    INDEX = 3          # used with IndexEncoder
    CATEGORY = 4       # used with CategoryEncoder
    STRING = 5         # used with StringEncoder
    
class StringEncoder(object):
    """Embed a text column with a SentenceTransformer model.

    Args:
        model_name: name passed to ``SentenceTransformer``.
        device: device passed both to the model constructor and to ``encode``.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    @torch.no_grad()
    def __call__(self, df):
        """Encode every non-null entry of ``df`` and keep row alignment.

        Args:
            df: pandas Series of text; null entries are allowed.

        Returns:
            CPU tensor with ``len(df)`` rows. Rows of null entries are all
            zeros, the other rows hold the model's embedding for that entry.
        """
        present = df.notnull().values
        if not present.any():
            # Nothing to encode: ask the model for its width instead of
            # reading it from an (empty) encode result.
            dim = self.model.get_sentence_embedding_dimension()
            return torch.zeros(len(df), dim)

        x = self.model.encode(df.dropna().values, show_progress_bar=True,
                              convert_to_tensor=True, device=self.device)
        x = x.cpu()

        # Scatter the embeddings back to their original row positions so the
        # result lines up with the other per-row feature tensors.
        res = torch.zeros(len(df), x.size(dim=1), dtype=x.dtype)
        res[torch.tensor(present, dtype=torch.bool)] = x
        return res

class NumericalEncoder(object):
    """Turn a numeric column into a tensor, replacing missing values."""

    def __init__(self):
        pass

    def __call__(self, df, na_val=0):
        """Return ``df`` as a tensor with nulls replaced by ``na_val``."""
        filled = df.fillna(value=na_val)
        return torch.tensor(filled.values)

class LargeNumberEncoder(object):
    """Min-max scale a numeric column into the range [0, 1]."""

    def __init__(self):
        return

    def __call__(self, df, na_val=0):
        """Scale ``df`` to [0, 1] after replacing nulls with ``na_val``.

        Nulls are filled before scaling, so ``na_val`` takes part in the
        min/max computation.

        Args:
            df: pandas Series of numbers; null entries are allowed.
            na_val: replacement for null entries.

        Returns:
            Tensor with one value per row. A column whose filled values are
            all equal maps to all zeros; an empty column gives an empty tensor.
        """
        x = torch.tensor(df.fillna(value=na_val).values)
        # Integer input is promoted by the division below; use the same
        # floating dtype for the special-case results.
        out_dtype = x.dtype if x.is_floating_point() else torch.get_default_dtype()

        if x.numel() == 0:
            # torch.min / torch.max raise on empty tensors.
            return x.to(out_dtype)

        lo = torch.min(x)
        span = torch.max(x) - lo
        if span == 0:
            # Constant column: avoid 0/0, which would fill the output with NaN.
            return torch.zeros_like(x, dtype=out_dtype)

        return (x - lo) / span

class BinEncoder(object):
    """One-hot encode a numeric column by quantile bin."""

    def __init__(self):
        return

    def __call__(self, df, num_bins = 10):
        """Bucket ``df`` with ``pd.qcut`` and one-hot encode the bucket.

        Args:
            df: pandas Series of numbers; null entries are allowed.
            num_bins: number of quantile bins requested from ``pd.qcut``.
                Fewer bins are produced when quantile edges coincide.

        Returns:
            Float tensor with ``len(df)`` rows and exactly one 1 per row.
            Columns follow the intervals in ascending order; when the column
            has null entries, one extra trailing column marks them.
        """
        # duplicates="drop" merges identical quantile edges instead of raising
        # on heavily repeated values.
        cut = pd.Series(pd.qcut(df, num_bins, duplicates="drop"))

        # Categorical codes give a stable interval -> column assignment;
        # missing values carry code -1.
        codes = torch.tensor(cut.cat.codes.to_numpy().astype("int64"))
        n_intervals = len(cut.cat.categories)

        # keep NaN as a bin (the last column), only when NaN actually occurs
        missing = codes < 0
        width = n_intervals + int(missing.any())
        codes[missing] = n_intervals

        x = torch.zeros(len(df), width)
        x[torch.arange(len(df)), codes] = 1
        return x

class IndexEncoder(object):
    """Build a lookup from the ids in a column to their row positions."""

    def __init__(self):
        pass

    def __call__(self, df):
        """Return ``{id: position}`` for the int64-cast values of ``df``.

        When an id occurs more than once, its last position is kept.
        """
        ids = df.astype('int64').values
        return dict(zip(ids, range(len(ids))))
    
class CategoryEncoder(object):
    """Multi-hot encode a column whose cells hold delimiter-separated labels."""

    def __init__(self):
        return

    @staticmethod
    def _tokens(cell):
        """Split one cell on non-alphanumeric, non-space characters.

        Tokens are stripped and blanks are dropped, so '"USA", "UK"' yields
        ['USA', 'UK'] rather than also producing ' ' as a label.
        """
        pieces = (piece.strip() for piece in re.split('[^a-zA-Z0-9 ]', cell))
        return [piece for piece in pieces if piece]

    def __call__(self, df):
        """Encode ``df`` as a multi-hot matrix aligned with its rows.

        Args:
            df: pandas Series of strings; null entries are allowed.

        Returns:
            Float tensor of shape ``(len(df), n_labels)``. Columns follow the
            labels in sorted order. Rows of null entries are all zeros.
        """
        # Tokenise every row once, keeping an empty list for null rows so the
        # list index is the row position in ``df``.
        rows = [
            [] if is_null else self._tokens(cell)
            for cell, is_null in zip(df.tolist(), df.isna().tolist())
        ]

        # Sorted vocabulary gives the same column order on every run.
        genres = sorted({genre for tokens in rows for genre in tokens})
        genres_mapping = {genre: i for i, genre in enumerate(genres)}

        x = torch.zeros(len(df), len(genres))
        for i, tokens in enumerate(rows):
            for genre in tokens:
                x[i, genres_mapping[genre]] = 1
        return x

# One encoder instance per ColType member, listed in the same order as the
# ColType values (0..5).
encoders = [
    encoder_cls()
    for encoder_cls in (
        NumericalEncoder,
        LargeNumberEncoder,
        BinEncoder,
        IndexEncoder,
        CategoryEncoder,
        StringEncoder,
    )
]

In [160]:
# Scratch cell: the string literal below is an inert expression that parks an
# earlier experiment; only the final line produces output.
'''
genres = set(g for col in vertex_data["countries"].dropna().values for g in re.split('[^a-zA-Z0-9 ]', col))
genres.discard('')
genres.discard('')
print(genres)
'''
#pd.qcut(vertex_data["imdbRating"], 10).unique()
# 1 where imdbRating is missing, 0 where it is present.
imdb_rating = vertex_data["imdbRating"]
imdb_rating.isna().astype(int).to_numpy()

array([0, 1, 1, ..., 1, 1, 1])

In [131]:
# Inspect the vertex ids as a plain int64 array.
vertex_data["_id"].astype("int64").to_numpy()

array([    0,     1,     2, ..., 33877, 33878, 33879], dtype=int64)

In [165]:
# How each vertex column is encoded; columns not listed here are dropped.
v_columns_info = {
    "_id": ColType.INDEX,
    "_labels": ColType.CATEGORY,
    "budget": ColType.LARGE_NUMBER,
    "countries": ColType.CATEGORY,
    "imdbRating": ColType.NUMERICAL,
    "imdbVotes": ColType.NUMBER_TO_BIN,
    "name": ColType.STRING,
    "plot": ColType.STRING,
    "revenue": ColType.LARGE_NUMBER,
    "runtime": ColType.NUMBER_TO_BIN,
    "title": ColType.STRING,
    "year": ColType.NUMBER_TO_BIN,
}

# Encoders whose output is appended to v_features as-is.
feature_encoder_classes = {
    ColType.CATEGORY: CategoryEncoder,
    ColType.NUMERICAL: NumericalEncoder,
    ColType.LARGE_NUMBER: LargeNumberEncoder,
    ColType.NUMBER_TO_BIN: BinEncoder,
}

# Select with an explicit list of column names rather than a dict view.
vertex_data = vertex_data[list(v_columns_info)]

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Constructing a StringEncoder builds a SentenceTransformer; create it once on
# first use and share it across every STRING column.
string_encoder = None

v_features = []
for col in vertex_data.keys():
    col_type = v_columns_info[col]
    if col_type == ColType.INDEX:
        # The index column yields an id -> row-position mapping, not features.
        encoder = IndexEncoder()
        mapping = encoder(vertex_data[col])
    elif col_type == ColType.STRING:
        if string_encoder is None:
            string_encoder = StringEncoder(device=device)
        encoder = string_encoder
        # `tmp` keeps the most recent text embedding; a later cell reads it.
        tmp = encoder(vertex_data[col])
        v_features.append(tmp)
    elif col_type in feature_encoder_classes:
        encoder = feature_encoder_classes[col_type]()
        v_features.append(encoder(vertex_data[col]))







        
        

Batches:   0%|          | 0/617 [00:00<?, ?it/s]

Batches:   0%|          | 0/284 [00:00<?, ?it/s]

Batches:   0%|          | 0/286 [00:00<?, ?it/s]

In [169]:
# Smallest value in the most recently computed text embedding.
tmp.min()

tensor(-0.2967)

In [18]:

class DataLib(object):
    """Registry of per-dataset metadata.

    Args:
        dataset_name: dataset this instance describes. Defaults to
            "recommendation", the only name ``get_data_info`` recognises.
    """

    def __init__(self, dataset_name="recommendation"):
        self.dataset_name = dataset_name

    def get_data_info(self, name=None):
        """Return the metadata dict for a dataset.

        Args:
            name: dataset to look up; when omitted, the instance's
                ``dataset_name`` is used.

        Returns:
            The metadata dict (currently empty for "recommendation").

        Raises:
            NotImplementedError: for any other dataset name.
        """
        dataset = self.dataset_name if name is None else name
        if dataset == "recommendation":
            return {}
        raise NotImplementedError(f"no data info for dataset {dataset!r}")


{}