In [75]:
import pandas as pd
from os.path import join
import json
from collections import Counter
KG_PATH = 'data/kg_data'


In [76]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [77]:
pd.read_csv(join(KG_PATH, 'triples', 'kg_triples_withjob_course_links_scaled.csv'))

Unnamed: 0,head,rel,tail,time_amin,time_amax,proba
0,coursera_--626KkxEeywagovoAKHOQ,_provides,hard_skills_31065,2005-01-01,2005-01-01,0.095001
1,coursera_-0BI9jXyEeWa2g6sjqf03Q,_provides,hard_skills_32073,2005-01-01,2005-01-01,0.000138
2,coursera_-0wI4W8lEeys9RJMWW48Yw,_provides,hard_skills_32603,2005-01-01,2005-01-01,0.001563
3,coursera_-1YwAnTLEeSjmyIAC0aXFg,_provides,hard_skills_32024,2005-01-01,2005-01-01,0.026412
4,coursera_-1cp1vgjEeyxiRKaoDccyw,_provides,hard_skills_31278,2005-01-01,2005-01-01,0.053806
...,...,...,...,...,...,...
1296369,coursera_FVG4FkylEeWnWw63bhv00w,_favors,job_titles_998,2007-06-07,2007-06-07,0.006341
1296370,coursera_HhtExGVdEeyQuAqyaULE4w,_favors,job_titles_998,2007-06-07,2007-06-07,0.006341
1296371,coursera_SqdC-eNsEeq9MQ5Dfss9mw,_favors,job_titles_998,2007-06-07,2007-06-07,0.006341
1296372,coursera_a19TzyN0Eeu-rg7jvs9-1w,_favors,job_titles_998,2007-06-07,2007-06-07,0.006341


In [78]:
def get_format_data(df_u):
    all_ents = list(set(list(df_u['head'])+list(df_u['tail'])))
    all_ents = {ent:idx for idx, ent in enumerate(all_ents)}
    all_rels = {rel:idx for idx, rel in enumerate(list(set(df_u['rel'])))}
    df_u['head'] = df_u['head'].map(all_ents)
    df_u['tail'] = df_u['tail'].map(all_ents)
    df_u['rel'] = df_u['rel'].map(all_rels)
    return df_u, all_ents, all_rels

def save_json(data, save_path):
    with open(save_path, 'w') as f:
        json.dump(data, f)

def read_json(path_):
    with open(path_, 'r') as f:
        data = json.load(f)
    return data

In [79]:
df_ = pd.read_csv(join(KG_PATH, 'triples', 'kg_triples_withjob_course_links_scaled.csv'))

In [80]:
df_all, all_ents, all_rels = get_format_data(df_u = df_)

In [81]:
df_all.to_csv(join(KG_PATH, 'split', 'full_kg_triples_train_format.csv'), index = False)

In [82]:
save_json(all_ents, join(KG_PATH, 'split', 'all_ents.json'))

In [83]:
save_json(all_ents, join(KG_PATH, 'split', 'all_rels.json'))

In [84]:
df_all.head()

Unnamed: 0,head,rel,tail,time_amin,time_amax,proba
0,13860,4,3414,2005-01-01,2005-01-01,0.095001
1,16516,4,17001,2005-01-01,2005-01-01,0.000138
2,12284,4,3076,2005-01-01,2005-01-01,0.001563
3,9218,4,8913,2005-01-01,2005-01-01,0.026412
4,19346,4,19485,2005-01-01,2005-01-01,0.053806


In [85]:
df_all['proba'].max()

1.0

In [86]:
df_all['proba'].mean()

0.012590191799520511

In [87]:
def clean_triples(df_):
    # remove triples with entities that appear less than 10 times
    all_ents = Counter(list(df_['head'])+list(df_['tail']))
    unwanted_ents = [i for i in all_ents if all_ents[i]<10]
    print('unwanted entities: ',len(unwanted_ents))
    return df_[~(df_['head'].isin(unwanted_ents) | df_['tail'].isin(unwanted_ents))]

def get_stratified_score(df_):
    all_ents = Counter(list(df_['head'])+list(df_['tail']))
    df_['scores'] = df_['head'].map(all_ents)+df_['tail'].map(all_ents)
    return df_

def get_split(df_):
    df_ = clean_triples(df_ = df_)
    df_ = get_stratified_score(df_ = df_)
    train_df = group_sampling(df_ = df_, sample_frac = 0.7)
    # train_df = df_.groupby(['head','tail']).sample(frac=0.7, weights=list(df_['scores']), random_state=1)
    val_test = df_[~df_.index.isin(train_df.index)]
    val_df = val_test.sample(frac=0.5, weights='scores', random_state=1)
    test_df = val_test[~val_test.index.isin(val_df.index)]
    return train_df, val_df, test_df
    
def group_sampling(df_):
    train_df = df_.groupby(['head', 'rel']).sample(frac = 0.8)
    test_val = df_[~df_.index.isin(train_df.index)]
    test_df = test_val.sample(frac = 0.5)
    val_df = test_val[~test_val.index.isin(test_df.index)]
    return train_df, test_df, val_df

def save_split(train_df, val_df, test_df):
    train_ents = set(list(train_df['head'])+list(train_df['tail']))
    train_rels = set(list(train_df['rel']))
    val_ents = set(list(val_df['head'])+list(val_df['tail']))
    val_rels = set(list(val_df['rel']))
    test_ents = set(list(test_df['head'])+list(test_df['tail']))
    test_rels = set(list(test_df['rel']))
    ent_map = list(set(list(train_ents)+list(val_ents)+list(test_ents)))
    rel_map = list(set(list(train_rels)+list(val_rels)+list(test_rels)))
    ent_map = {i:idx for idx, i in enumerate(ent_map)}
    rel_map = {i:idx for idx, i in enumerate(rel_map)}
    train_df['head'] = train_df['head'].map(ent_map)
    train_df['tail'] = train_df['tail'].map(ent_map)
    train_df['rel'] = train_df['rel'].map(rel_map)

    val_df['head'] = val_df['head'].map(ent_map)
    val_df['tail'] = val_df['tail'].map(ent_map)
    val_df['rel'] = val_df['rel'].map(rel_map)

    test_df['head'] = test_df['head'].map(ent_map)
    test_df['tail'] = test_df['tail'].map(ent_map)
    test_df['rel'] = test_df['rel'].map(rel_map)

    
    train_df = train_df[['head','rel','tail','proba']]
    val_df = val_df[['head','rel','tail','proba']]
    test_df = test_df[['head','rel','tail','proba']]
    # train_df = train_df[['head','rel','tail','proba']].rename(ren, axis='columns')
    # val_df = val_df[['head','rel','tail','proba']].rename(ren, axis='columns')
    # test_df = test_df[['head','rel','tail','proba']].rename(ren, axis='columns')
    # train_df = train_df[['head','rel','tail','proba']].rename(ren, axis='columns').to_csv(join(KG_PATH, 'split', 'train.csv'), index = False)
    # val_df = val_df[['head','rel','tail','proba']].rename(ren, axis='columns').to_csv(join(KG_PATH, 'split', 'val.csv'), index = False)
    # test_df = test_df[['head','rel','tail','proba']].rename(ren, axis='columns').to_csv(join(KG_PATH, 'split', 'test.csv'), index = False)
    return train_df, val_df, test_df

def save_train_test_val(train_df, val_df, test_df, save_path):
    ren = {'head':'h', 'rel':'r', 'tail':'t', 'proba':'p'}
    train_df = train_df[['head','rel','tail','proba']].rename(ren, axis='columns').to_csv(join(save_path, 'train.csv'), index = False)
    val_df = val_df[['head','rel','tail','proba']].rename(ren, axis='columns').to_csv(join(save_path, 'val.csv'), index = False)
    test_df = test_df[['head','rel','tail','proba']].rename(ren, axis='columns').to_csv(join(save_path, 'test.csv'), index = False)

def test_split(train_df, val_df, test_df):
    train_ents = set(list(train_df['head'])+list(train_df['tail']))
    train_rels = set(list(train_df['rel']))
    val_ents = set(list(val_df['head'])+list(val_df['tail']))
    val_rels = set(list(val_df['rel']))
    test_ents = set(list(test_df['head'])+list(test_df['tail']))
    test_rels = set(list(test_df['rel']))
    print('ents in val not in train', len(val_ents.difference(train_ents)))
    print('ents in test not in train', len(test_ents.difference(train_ents)))
    print('ents in train not in val', len(train_ents.difference(val_ents)))
    print('ents in train not in test', len(train_ents.difference(test_ents)))

    print('rels in val not in train', len(val_rels.difference(train_rels)))
    print('rels in test not in train', len(test_rels.difference(train_rels)))
    print('rels in train not in val', len(train_rels.difference(val_rels)))
    print('rels in train not in test', len(train_rels.difference(test_rels)))
    print('train size: ', train_df.shape[0])
    print('val size: ', val_df.shape[0])
    print('test size: ', test_df.shape[0])
    sum_num = train_df.shape[0]+val_df.shape[0]+test_df.shape[0]
    print('train %: ', train_df.shape[0]/sum_num)
    print('val %: ', val_df.shape[0]/sum_num)
    print('test %: ', test_df.shape[0]/sum_num)
    print('VOCAB_SIZE: ', len(set(list(train_ents)+list(val_ents)+list(test_ents))))
    print('REL_VOCAB_SIZE: ', len(set(list(train_rels)+list(val_rels)+list(test_rels))))
    train_ents = max(list(train_df['head'])+list(train_df['tail']))
    print(train_ents)
    train_rels = max(list(train_df['rel']))
    print(train_rels)
    val_ents = max(list(val_df['head'])+list(val_df['tail']))
    print(val_ents)
    val_rels = max(list(val_df['rel']))
    print(val_rels)
    test_ents = max(list(test_df['head'])+list(test_df['tail']))
    print(test_ents)
    test_rels = max(list(test_df['rel']))
    print(test_rels)

In [88]:
# group_sampling(df_ = df_all, sample_frac = 0.7)

In [89]:
train_df, test_df, val_df = group_sampling(df_ = df_all)

In [90]:
train_df.head()

Unnamed: 0,head,rel,tail,time_amin,time_amax,proba
5698,0,4,18679,2005-01-01,2005-01-01,0.003932
110016,1,8,8770,2013-03-18,2013-03-18,0.30637
110015,1,8,10190,2012-02-05,2012-02-05,0.30637
110012,1,8,16803,2011-10-21,2011-10-21,0.28435
110013,1,8,10934,2011-11-17,2012-11-26,0.469577


In [91]:
test_split(train_df, val_df, test_df)

ents in val not in train 0
ents in test not in train 0
ents in train not in val 8025
ents in train not in test 7970
rels in val not in train 0
rels in test not in train 0
rels in train not in val 1
rels in train not in test 1
train size:  1043912
val size:  126231
test size:  126231
train %:  0.8052552735553166
val %:  0.0973723632223417
test %:  0.0973723632223417
VOCAB_SIZE:  21189
REL_VOCAB_SIZE:  12
21188
11
21188
11
21188
11


In [92]:
# no need for this since we didnt remove the nodes that apper less than 10 times
# train_df, val_df, test_df = save_split(train_df, val_df, test_df)

In [93]:
train_df.head()

Unnamed: 0,head,rel,tail,time_amin,time_amax,proba
5698,0,4,18679,2005-01-01,2005-01-01,0.003932
110016,1,8,8770,2013-03-18,2013-03-18,0.30637
110015,1,8,10190,2012-02-05,2012-02-05,0.30637
110012,1,8,16803,2011-10-21,2011-10-21,0.28435
110013,1,8,10934,2011-11-17,2012-11-26,0.469577


In [94]:
# test_split(train_df, val_df, test_df)

In [95]:
save_train_test_val(train_df = train_df, val_df = val_df, test_df = test_df, save_path = join(KG_PATH, 'split'))