# ConceptNet

version 5.7

### Setting imports, constants, and paths

In [1]:
import pandas as pd
import networkx as nx
import json
from collections import defaultdict
import copy
import os

import conceptnet_uri as cn

import config

In [2]:
VERSION=config.VERSION

NODE_COLS=config.nodes_cols
EDGE_COLS=config.edges_cols

MOWGLI_NS=config.mowgli_ns

POS_MAPPING=config.pos_mapping

POS_REL=config.has_pos
POS_FORM_REL=config.has_pos_form
IS_POS_FORM_OF_REL=config.is_pos_form_of
WORDNET_SENSE_REL=config.wordnet_sense

CUSTOM_DATASET=config.custom_dataset

data_source=config.cn_ds

In [3]:
print_every=500000

In [4]:
cn_path='../input/conceptnet/conceptnet-en.csv'
# OUTPUT FILES
output_dir='../output_v%s/conceptnet' % VERSION
nodes_file='%s/nodes_v%s.csv' % (output_dir, VERSION)
edges_file='%s/edges_raw_v%s.csv' % (output_dir, VERSION)
edges_enriched_file='%s/edges_enriched_v%s.csv' % (output_dir, VERSION)
edges_full_file='%s/edges_v%s.csv' % (output_dir, VERSION)

In [5]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

### Load the data in pandas

In [6]:
df=pd.read_csv(cn_path, sep='\t', header=None, converters={4: json.loads})

In [7]:
df.columns=['assertion','rel','subj','obj','metadata']

In [8]:
df.drop(columns=['assertion'])

Unnamed: 0,rel,subj,obj,metadata
0,/r/Antonym,/c/en/0/n,/c/en/1,"{'dataset': '/d/wiktionary/fr', 'license': 'cc..."
1,/r/Antonym,/c/en/12_hour_clock/n,/c/en/24_hour_clock,"{'dataset': '/d/wiktionary/en', 'license': 'cc..."
2,/r/Antonym,/c/en/24_hour_clock/n,/c/en/12_hour_clock,"{'dataset': '/d/wiktionary/en', 'license': 'cc..."
3,/r/Antonym,/c/en/5/n,/c/en/3,"{'dataset': '/d/wiktionary/en', 'license': 'cc..."
4,/r/Antonym,/c/en/a.c/n,/c/en/d.c,"{'dataset': '/d/wiktionary/fr', 'license': 'cc..."
...,...,...,...,...
3410694,/r/UsedFor,/c/en/zoom_lens,/c/en/procure_better_shot,"{'dataset': '/d/conceptnet/4/en', 'license': '..."
3410695,/r/UsedFor,/c/en/zoom_lens,/c/en/see_things_bigger,"{'dataset': '/d/conceptnet/4/en', 'license': '..."
3410696,/r/UsedFor,/c/en/zoom_lens,/c/en/seeing_distant_object_more_closely,"{'dataset': '/d/conceptnet/4/en', 'license': '..."
3410697,/r/UsedFor,/c/en/zoom_lens,/c/en/take_pictures,"{'dataset': '/d/conceptnet/4/en', 'license': '..."


In [9]:
df['metadata'][0]['dataset']

'/d/wiktionary/fr'

### Create nodes.csv and edges.csv

Let's first extract the main data into temporary structures.

In [10]:
node_datasets=defaultdict(set)
all_edges=[]

for i, row in df.iterrows():
    
    subj=row['subj']
    obj=row['obj']
    rel=row['rel']
    dataset=row['metadata']['dataset']
    weight=row['metadata']['weight']
    sentence=''
    
    node_datasets[subj].add(dataset)
    node_datasets[obj].add(dataset)
    
    other={'dataset': dataset}
    edge_data=[subj, rel, obj, data_source, weight, other]
    all_edges.append(edge_data)
    
    if (i%print_every==0): print('processed row', i)

processed row 0
processed row 500000
processed row 1000000
processed row 1500000
processed row 2000000
processed row 2500000
processed row 3000000


In [11]:
print(len(node_datasets))
print(len(all_edges))

1787272
3410699


In [12]:
def create_relation(ns, rel):
    return '%s:%s' % (ns, rel)

In [13]:
def get_pos_tag(uri):
    components=cn.split_uri(uri)
    if len(components)<4:
        return '', ''
    else:
        raw_pos=components[3]
        mapped_pos=create_relation(MOWGLI_NS, POS_MAPPING[raw_pos])
        return mapped_pos, raw_pos

#### a. Prepare and store nodes

In [14]:
all_nodes=[]
for n, datasets in node_datasets.items():
    label=cn.uri_to_label(n)
    aliases_list=[]
    aliases=','.join(aliases_list)
    mapped_pos, raw_pos=get_pos_tag(n)
    other={'datasets': list(datasets)}
    col=[n, label, aliases, raw_pos, data_source, other]
    all_nodes.append(col)
    
for raw_pos, mapped_pos in POS_MAPPING.items():
    mowgli_pos=create_relation(MOWGLI_NS, mapped_pos)
    col=[mowgli_pos, raw_pos, mapped_pos, '', '', {"datasets": [CUSTOM_DATASET]}]
    all_nodes.append(col)

In [15]:
len(all_nodes)

1787276

In [16]:
nodes_df = pd.DataFrame(all_nodes, columns = NODE_COLS)

In [17]:
nodes_df['pos'].unique()

array(['n', '', 'r', 'a', 'v'], dtype=object)

In [18]:
nodes_df.sort_values('id').to_csv(nodes_file, index=False, sep='\t')

#### b. Enrich and store edges

In [19]:
edges_df = pd.DataFrame(all_edges, columns = EDGE_COLS)
edges_df.sort_values(by=['subject', 'predicate','object']).to_csv(edges_file, index=False, sep='\t')

In [20]:
all_edges_enriched=copy.deepcopy(all_edges)
other={'dataset': CUSTOM_DATASET}
for i, row in nodes_df.iterrows():
    
    node_id=row['id']
    components=cn.split_uri(node_id)
    
    if len(components)==4:
        # add POS relations
        mapped_pos, raw_pos = get_pos_tag(node_id)
        edge=[node_id, create_relation(MOWGLI_NS, POS_REL), mapped_pos, data_source, "1.0", other]
        all_edges_enriched.append(edge)
        
        le_node='/%s' % '/'.join(components[:3])
        if le_node in node_datasets.keys():
            # add pos-form relations (both-ways)
            edge=[le_node, create_relation(MOWGLI_NS, POS_FORM_REL), node_id, data_source, "1.0", other]
            all_edges_enriched.append(edge)

            edge=[node_id, create_relation(MOWGLI_NS, IS_POS_FORM_OF_REL), le_node, data_source, "1.0", other]
            all_edges_enriched.append(edge)
        
    elif len(components)>=5 and components[4]=='wn':
        # add OMW relations
        pos_node='/%s' % '/'.join(components[:4])
        if pos_node in node_datasets.keys():
            edge=[pos_node, create_relation(MOWGLI_NS, WORDNET_SENSE_REL), node_id, data_source, "1.0", other]
            all_edges_enriched.append(edge)
    
    if (i%print_every==0): print('processed row', i)

processed row 0
processed row 500000
processed row 1000000
processed row 1500000


In [21]:
edges_enriched_df = pd.DataFrame(all_edges_enriched, columns = EDGE_COLS)
edges_enriched_df.sort_values(by=['subject', 'predicate','object']).to_csv(edges_enriched_file, 
                                                                           index=False, 
                                                                           sep='\t')

In [22]:
len(all_edges)

3410699

In [23]:
len(all_edges_enriched)

5282888

#### c. Complement missing symmetric data

In [34]:
all_difs=[edges_enriched_df]
for sym_rel in config.symmetric_rels:
    #if sym_rel!='/r/LocatedNear': continue
        
    sub_df=edges_enriched_df[edges_enriched_df.predicate==sym_rel]
    sub_df['other']=""
    print(sym_rel, len(sub_df))
    
    so_df=sub_df[EDGE_COLS]
    
    os_df=sub_df[['object', 'predicate', 'subject', 'datasource', 'weight', 'other']]
    os_df.columns=EDGE_COLS
    
    the_diff=os_df.merge(so_df,indicator = True, 
                         how='left').loc[lambda x : x['_merge']!='both']
    
    the_diff['other']=json.dumps({'dataset': CUSTOM_DATASET})
    the_diff['other']=the_diff['other'].apply(json.loads)
    
    print(the_diff.columns)
    
    print(len(the_diff))
    print()
    all_difs.append(the_diff)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


/r/Antonym 19066
Index(['subject', 'predicate', 'object', 'datasource', 'weight', 'other',
       '_merge'],
      dtype='object')
18788

/r/DistinctFrom 3315
Index(['subject', 'predicate', 'object', 'datasource', 'weight', 'other',
       '_merge'],
      dtype='object')
3251

/r/EtymologicallyRelatedTo 32075
Index(['subject', 'predicate', 'object', 'datasource', 'weight', 'other',
       '_merge'],
      dtype='object')
29999

/r/LocatedNear 49
Index(['subject', 'predicate', 'object', 'datasource', 'weight', 'other',
       '_merge'],
      dtype='object')
49

/r/RelatedTo 1703582
Index(['subject', 'predicate', 'object', 'datasource', 'weight', 'other',
       '_merge'],
      dtype='object')
1690482

/r/SimilarTo 30280
Index(['subject', 'predicate', 'object', 'datasource', 'weight', 'other',
       '_merge'],
      dtype='object')
8830

/r/Synonym 222156
Index(['subject', 'predicate', 'object', 'datasource', 'weight', 'other',
       '_merge'],
      dtype='object')
177035



In [35]:
all_data=pd.concat(all_difs)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [36]:
all_data=all_data[EDGE_COLS]
all_data.columns

Index(['subject', 'predicate', 'object', 'datasource', 'weight', 'other'], dtype='object')

In [37]:
all_data.sort_values(by=['subject', 'predicate','object']).to_csv(edges_full_file, 
                                                                  index=False, 
                                                                  sep='\t')

In [38]:
len(all_data)

7211322