In [93]:
import pandas as pd
import networkx as nx
from os.path import join
import numpy as np
from networkx.generators import intersection

KG_PATH = 'data/kg_data'

In [94]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [95]:
def get_entity_contrib(df_):
    # head_ = df_['head'].value_counts().to_dict()
    head_ = df_.groupby(['head'])['count_col_sum'].agg('sum').to_dict()
    # return head_
    # tail_ = df_['tail'].value_counts().to_dict()
    tail_ = df_.groupby(['tail'])['count_col_sum'].agg('sum').to_dict()
    return {k: tail_.get(k, 0) + tail_.get(k, 0) for k in set(head_) | set(tail_)}

def get_triples_probability_scores(df_):
    ent_rep = get_entity_contrib(df_)
    df_['head_c'] = df_['head'].map(ent_rep)
    df_['tail_c'] = df_['tail'].map(ent_rep)
    # df_['proba'] =  df_['head_c'] + df_['tail_c']
    df_['proba'] =  df_['tail_c']
    df_['proba'] = df_['count_col_sum']/df_['proba']
    return df_

def get_good_triples(triples_df):
    triples_df['count_col'] = 1
    triples_df['time'] = pd.to_datetime(triples_df['time'])
    grouped = triples_df.groupby(['head','tail','rel']).agg({'count_col' : np.sum, 'time' : [np.min, np.max]})
    grouped.columns = grouped.columns.map('_'.join).str.strip('_')
    grouped = grouped.reset_index()
    triples_ = get_triples_probability_scores(df_ = grouped)
    return triples_[['head','tail','rel','time_amin','time_amax','proba']]
    
# def get_jobs_courses_links(triples_df):

def get_graph(df_triples):
    G = nx.Graph()  # or DiGraph, MultiGraph, MultiDiGraph, etc
    G.add_weighted_edges_from(df_triples[['head','tail', 'proba']].values)
    return G

# def get_mutual_clustering_coefficiens(G):
def coefs_calc(G):
    for node_i in G.nodes():
        for node_j in G.nodes():
            if node_i != node_j:
                node_i_neigh = set(nx.all_neighbors(G, node_i))
                node_j_neigh = set(nx.all_neighbors(G, node_j))
                print(node_i_neigh, node_j_neigh)
                ji = JI(node_i_neigh, node_j_neigh)
                mm = MeetMin(node_i_neigh, node_j_neigh)
                geo = Geometric(node_i_neigh, node_j_neigh)
                hgeo = Hypergeometric(node_i_neigh, node_j_neigh, len(G.nodes()))
                return ji, mm, geo, hgeo
def JI(set_1, set_2):
    inter = set_1.intersection(set_2)
    uni = set_1.union(set_2)
    return len(inter)/len(uni)

def MeetMin(set_1, set_2):
    inter = set_1.intersection(set_2)
    uni = min([len(set_1), len(set_2)])
    return len(inter)/uni

def Geometric(set_1, set_2):
    inter = len(set_1.intersection(set_2))**2
    uni = len(set_1)* len(set_2)
    return inter/uni

def factorial(n):
  if n ==1: return 1
  else: return n * factorial(n-1)

def combinations_without_repitition(n, r):
  return (factorial(n) / (factorial(r) * factorial(n -r )))

def Hypergeometric(set_1, set_2, total):
  inter = len(set_1.intersection(set_2))
  uni = min([len(set_1), len(set_2)])
  sum = 0
  for i in range(uni, inter):
    c1 = combinations_without_repitition(len(set_1), i)
    c2 = combinations_without_repitition((total - len(set_1)), (len(set_2) - i))
    c3 = combinations_without_repitition(total, len(set_2))
    sum += ((c1*c2)/c3)
  return -np.log(sum)

def  proba_scaling(df_):
  df_['proba'] = (df_['proba'] - df_['proba'].min()) / (df_['proba'].max() - df_['proba'].min())
  return df_

In [96]:
def get_job_course_links(triples_df):
    all_triples = []
    jo_hs = triples_df[triples_df['head'].str.startswith("job_titles") & triples_df['tail'].str.startswith("hard_skills")]
    co_hs = triples_df[triples_df['head'].str.startswith("coursera") & triples_df['tail'].str.startswith("hard_skills")]
    triples_df_xo = jo_hs.set_index('tail').join(co_hs.set_index('tail'), how='inner', lsuffix='_jo_hs', rsuffix='_co_hs')
    triples_df_xo = triples_df_xo.reset_index().groupby(['head_jo_hs', 'head_co_hs']).agg(
        {'proba_jo_hs' : np.mean, 'proba_co_hs' : np.mean, 'time_amin_jo_hs' : np.min, 'time_amax_jo_hs' : np.max})
    triples_df_xo['proba'] = (triples_df_xo['proba_jo_hs']+triples_df_xo['proba_co_hs'])/2
    triples_df_xo = triples_df_xo.reset_index()
    del triples_df_xo['proba_jo_hs']
    del triples_df_xo['proba_co_hs']
    triples_df_xo = triples_df_xo.rename({'head_jo_hs': 'head', 'head_co_hs':'tail', 'time_amin_jo_hs':'time_amin', 	'time_amax_jo_hs':'time_amax', 'proba':'proba'}, axis = 'columns')
    triples_df_xo['rel'] = '_requires'
    all_triples.append(triples_df_xo.copy())
    # triples_df_xo
    # triples_df_xo.rename({'head': 'tail', 'tail':'head'}, axis = 'columns')
    # head_data = triples_df_xo['head'].copy()
    triples_df_xo['tail'], triples_df_xo['head'] = triples_df_xo['head'].copy(), triples_df_xo['tail'].copy()
    triples_df_xo['rel'] = '_favors'
    all_triples.append(triples_df_xo)
    triples_df_xo = pd.concat(all_triples)
    return triples_df_xo
 	

## Calculate Simple Proba

In [97]:
# triples_df = pd.read_csv(join(KG_PATH,'triples', 'raw_triples.csv'))
triples_df_ = pd.read_csv(join(KG_PATH,'triples', 'kg_triples.csv'))
triples_df_ = triples_df_.rename({'total_c':'proba'}, axis = 1)
triples_df_ = triples_df_[['head','tail','rel','time_amin','time_amax','proba']]

In [98]:
triples_df_.head()

Unnamed: 0,head,tail,rel,time_amin,time_amax,proba
0,coursera_--626KkxEeywagovoAKHOQ,hard_skills_31065,_provides,2005-01-01,2005-01-01,0.02381
1,coursera_-0BI9jXyEeWa2g6sjqf03Q,hard_skills_32073,_provides,2005-01-01,2005-01-01,0.0001
2,coursera_-0wI4W8lEeys9RJMWW48Yw,hard_skills_32603,_provides,2005-01-01,2005-01-01,0.000456
3,coursera_-1YwAnTLEeSjmyIAC0aXFg,hard_skills_32024,_provides,2005-01-01,2005-01-01,0.006667
4,coursera_-1cp1vgjEeyxiRKaoDccyw,hard_skills_31278,_provides,2005-01-01,2005-01-01,0.013514


In [99]:
triples_df_['proba'].min()

1.4030755415871593e-05

In [100]:
triples_df_['proba'].max()

0.25

In [101]:
triples_df_.shape

(138802, 6)

In [102]:
# triples_df_ = get_good_triples(triples_df_)

In [103]:
# triples_df_.head()

In [104]:
triples_df_xo_ = get_job_course_links(triples_df = triples_df_.copy())

In [105]:
triples_df_xo_

Unnamed: 0,head,tail,time_amin,time_amax,proba,rel
0,job_titles_0,coursera_Hfv9gzTZEea_2AoR0P8XnQ,2017-05-06,2017-05-06,0.007038,_requires
1,job_titles_1,coursera_5sELsjzeEey2HhKRvGWgpQ,2006-02-02,2006-02-02,0.000098,_requires
2,job_titles_1,coursera_7g9jQd_lEeeJCBKWimXVMA,2006-02-02,2006-02-02,0.000098,_requires
3,job_titles_1,coursera_LYEfxbbNEeuTjg6-_zYGCQ,2006-02-02,2006-02-02,0.000098,_requires
4,job_titles_1,coursera_MgA0oBGMEeuwRg7bdee3_w,2006-02-02,2006-02-02,0.000098,_requires
...,...,...,...,...,...,...
578781,coursera_FVG4FkylEeWnWw63bhv00w,job_titles_998,2007-06-07,2007-06-07,0.000513,_favors
578782,coursera_HhtExGVdEeyQuAqyaULE4w,job_titles_998,2007-06-07,2007-06-07,0.000513,_favors
578783,coursera_SqdC-eNsEeq9MQ5Dfss9mw,job_titles_998,2007-06-07,2007-06-07,0.000513,_favors
578784,coursera_a19TzyN0Eeu-rg7jvs9-1w,job_titles_998,2007-06-07,2007-06-07,0.000513,_favors


In [106]:
all_triples_df = pd.concat([triples_df_, triples_df_xo_])

In [107]:
all_triples_df.to_csv(join(KG_PATH, 'triples', 'kg_triples_withjob_course_links.csv'), index = False)

In [108]:
all_triples_df.shape

(1296374, 6)

## Probability Scaling

In [109]:
triples_df_ = pd.read_csv(join(KG_PATH,'triples', 'kg_triples_withjob_course_links.csv'))

In [110]:
triples_df_

Unnamed: 0,head,tail,rel,time_amin,time_amax,proba
0,coursera_--626KkxEeywagovoAKHOQ,hard_skills_31065,_provides,2005-01-01,2005-01-01,0.023810
1,coursera_-0BI9jXyEeWa2g6sjqf03Q,hard_skills_32073,_provides,2005-01-01,2005-01-01,0.000100
2,coursera_-0wI4W8lEeys9RJMWW48Yw,hard_skills_32603,_provides,2005-01-01,2005-01-01,0.000456
3,coursera_-1YwAnTLEeSjmyIAC0aXFg,hard_skills_32024,_provides,2005-01-01,2005-01-01,0.006667
4,coursera_-1cp1vgjEeyxiRKaoDccyw,hard_skills_31278,_provides,2005-01-01,2005-01-01,0.013514
...,...,...,...,...,...,...
1296369,coursera_FVG4FkylEeWnWw63bhv00w,job_titles_998,_favors,2007-06-07,2007-06-07,0.000513
1296370,coursera_HhtExGVdEeyQuAqyaULE4w,job_titles_998,_favors,2007-06-07,2007-06-07,0.000513
1296371,coursera_SqdC-eNsEeq9MQ5Dfss9mw,job_titles_998,_favors,2007-06-07,2007-06-07,0.000513
1296372,coursera_a19TzyN0Eeu-rg7jvs9-1w,job_titles_998,_favors,2007-06-07,2007-06-07,0.000513


In [111]:
def add_type(txt):
  types = ['hard_skills', 'coursera', 'job_titles', 'recruiters', 'soft_skills', 'locations', 'function', 'sector']
  return [i for i in types if i in txt][0]

def group_scaling(df_):
    df_['head_type'] = df_['head'].apply(add_type)
    df_['tail_type'] = df_['tail'].apply(add_type)
    groups = df_.groupby(['head_type', 'rel', 'tail_type']).agg({'proba': [ 'min', 'max']}).reset_index()
    groups.columns = ['_'.join(tup).rstrip('_') for tup in groups.columns.values]
    new_df = pd.merge(df_, groups,  how='inner', left_on=['head_type','rel','tail_type'], right_on = ['head_type','rel','tail_type'])
    new_df['proba'] = (new_df['proba']-new_df['proba_min'])/(new_df['proba_max']-new_df['proba_min'])
    groups = new_df.groupby(['head_type', 'rel', 'tail_type']).agg({'proba': [ 'min', 'max']}).reset_index()
    print(groups)
    return new_df[['head','rel','tail','time_amin','time_amax','proba']]

In [112]:
triples_df_ = group_scaling(df_ = triples_df_.copy())

      head_type              rel    tail_type proba     
                                                min  max
0      coursera          _favors   job_titles   0.0  1.0
1      coursera        _provides  hard_skills   0.0  1.0
2      function              _by   job_titles   0.0  1.0
3      function  _co_occurs_with       sector   0.0  1.0
4   hard_skills     _acquired_by     coursera   0.0  1.0
5   hard_skills  _co_occurs_with  hard_skills   0.0  1.0
6   hard_skills  _co_occurs_with  soft_skills   0.0  1.0
7   hard_skills      _needed_for   job_titles   0.0  1.0
8    job_titles      _belongs_to   recruiters   0.0  1.0
9    job_titles      _belongs_to       sector   0.0  1.0
10   job_titles             _has     function   0.0  1.0
11   job_titles      _is_located    locations   0.0  1.0
12   job_titles        _requires     coursera   0.0  1.0
13   job_titles        _requires  hard_skills   0.0  1.0
14   job_titles        _requires  soft_skills   0.0  1.0
15    locations         _locate

In [113]:
# triples_df_ = proba_scaling(df_ = triples_df_)

In [114]:
triples_df_.to_csv(join(KG_PATH, 'triples', 'kg_triples_withjob_course_links_scaled.csv'), index = False)

## Tests

In [115]:
triples_df_[triples_df_['head'].str.startswith("job_titles") & triples_df_['tail'].str.startswith("hard_skills")]

Unnamed: 0,head,rel,tail,time_amin,time_amax,proba
42317,job_titles_0,_requires,hard_skills_31447,2017-05-06,2017-05-06,0.109408
42318,job_titles_1,_requires,hard_skills_32202,2006-02-02,2006-02-02,0.000804
42319,job_titles_10,_requires,hard_skills_103,2008-09-19,2008-09-19,0.012414
42320,job_titles_10,_requires,hard_skills_1090,2007-06-26,2007-06-26,0.006263
42321,job_titles_10,_requires,hard_skills_11243,2008-12-22,2008-12-22,0.006288
...,...,...,...,...,...,...
60028,job_titles_997,_requires,hard_skills_33201,2012-05-16,2012-05-16,0.070715
60029,job_titles_997,_requires,hard_skills_463,2009-01-12,2009-01-12,0.159355
60030,job_titles_997,_requires,hard_skills_6117,2010-10-18,2010-10-18,0.086255
60031,job_titles_998,_requires,hard_skills_31229,2007-06-07,2007-06-07,0.007440


In [None]:
triples_df_[triples_df_['head'].str.startswith("coursera") & triples_df_['tail'].str.startswith("hard_skills")]

Unnamed: 0,head,tail,rel,time_amin,time_amax,proba
0,coursera_--626KkxEeywagovoAKHOQ,hard_skills_7231,_provides,2005-01-01,2005-01-01,0.026316
1,coursera_-0BI9jXyEeWa2g6sjqf03Q,hard_skills_8402,_provides,2005-01-01,2005-01-01,0.001786
2,coursera_-0wI4W8lEeys9RJMWW48Yw,hard_skills_7085,_provides,2005-01-01,2005-01-01,0.013514
3,coursera_-1YwAnTLEeSjmyIAC0aXFg,hard_skills_8466,_provides,2005-01-01,2005-01-01,0.010638
4,coursera_-1cp1vgjEeyxiRKaoDccyw,hard_skills_7256,_provides,2005-01-01,2005-01-01,0.021739
...,...,...,...,...,...,...
7632,coursera_zwVdMU1HEeWxCwowHhzTpQ,hard_skills_7742,_provides,2005-01-01,2005-01-01,0.000204
7633,coursera_zwqURwiiEeahpQ5cgS1w4w,hard_skills_7742,_provides,2005-01-01,2005-01-01,0.000204
7634,coursera_zxPc1BgjEeyf4Ar-aygVCw,hard_skills_7233,_provides,2005-01-01,2005-01-01,0.000067
7635,coursera_zyGo9ph2EeSyKiIAC0EFDA,hard_skills_7742,_provides,2005-01-01,2005-01-01,0.000204


In [None]:
triples_g = get_graph(triples_df_)

In [None]:
triples_g.edges()

EdgeView([('coursera_--626KkxEeywagovoAKHOQ', 'hard_skills_7231'), ('hard_skills_7231', 'coursera_0mUWawwqEei4Vw5pGASqjA'), ('hard_skills_7231', 'coursera_0xaPTakzEeywagovoAKHOQ'), ('hard_skills_7231', 'coursera_CDDg56kzEeyjPxJkzIUqCw'), ('hard_skills_7231', 'coursera_DCszoPViEeuEcw6azz3PZQ'), ('hard_skills_7231', 'coursera_DHjhLqkyEeyxDg4ukgkVlw'), ('hard_skills_7231', 'coursera_FsaCo6kyEeywagovoAKHOQ'), ('hard_skills_7231', 'coursera_IsitRw2fEeu7DA5RF89taw'), ('hard_skills_7231', 'coursera_JmnOo6kyEeyjPxJkzIUqCw'), ('hard_skills_7231', 'coursera_MibxOgwrEeiVnRKrXPCsoA'), ('hard_skills_7231', 'coursera_SsqDWjm8EeetnwoV3WhjXA'), ('hard_skills_7231', 'coursera_X14bwAwrEeihtBJn6CjaCg'), ('hard_skills_7231', 'coursera_bDITwakzEeyxDg4ukgkVlw'), ('hard_skills_7231', 'coursera_baumlTm8Eeeahg4SvlLy9A'), ('hard_skills_7231', 'coursera_l6TdtQ2eEeuJnRIpMpc7BQ'), ('hard_skills_7231', 'coursera_m9Rh-EPaEeqEIAr9EHdN-Q'), ('hard_skills_7231', 'coursera_xp-FcDm7EeeN-xLrqutC-A'), ('hard_skills_7231', 

In [None]:
g_neigh = nx.all_neighbors(triples_g)

TypeError: ignored

In [None]:
coefs_calc(G = triples_g)

{'hard_skills_7231'} {'coursera_m9Rh-EPaEeqEIAr9EHdN-Q', 'coursera_DCszoPViEeuEcw6azz3PZQ', 'coursera_X14bwAwrEeihtBJn6CjaCg', 'coursera_l6TdtQ2eEeuJnRIpMpc7BQ', 'coursera_MibxOgwrEeiVnRKrXPCsoA', 'coursera_--626KkxEeywagovoAKHOQ', 'coursera_CDDg56kzEeyjPxJkzIUqCw', 'coursera_bDITwakzEeyxDg4ukgkVlw', 'coursera_baumlTm8Eeeahg4SvlLy9A', 'coursera_xp-FcDm7EeeN-xLrqutC-A', 'coursera_zbLyb0L5Ee2Yjw7oxhTzHw', 'coursera_0xaPTakzEeywagovoAKHOQ', 'coursera_JmnOo6kyEeyjPxJkzIUqCw', 'coursera_yPClEg2eEeu9ZBLzQTJEhw', 'coursera_IsitRw2fEeu7DA5RF89taw', 'coursera_DHjhLqkyEeyxDg4ukgkVlw', 'coursera_FsaCo6kyEeywagovoAKHOQ', 'coursera_0mUWawwqEei4Vw5pGASqjA', 'coursera_SsqDWjm8EeetnwoV3WhjXA'}




(0.0, 0.0, 0.0, inf)