# ConceptNet

version 5.7

### Setting imports and paths

In [3]:
import conceptnet_uri as cn
import pandas as pd
import networkx as nx

import sys
sys.path.insert(0,'..')
import config

In [4]:
cn_path='data/conceptnet-en.csv'

### Load the data in pandas

In [5]:
df=pd.read_csv(cn_path, sep='\t', header=None)

In [6]:
df.columns=['assertion','rel','subj','obj','metadata']

In [7]:
df.drop(columns=['assertion'])

Unnamed: 0,rel,subj,obj,metadata
0,/r/Antonym,/c/en/0/n,/c/en/1,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc..."
1,/r/Antonym,/c/en/12_hour_clock/n,/c/en/24_hour_clock,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."
2,/r/Antonym,/c/en/24_hour_clock/n,/c/en/12_hour_clock,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."
3,/r/Antonym,/c/en/5/n,/c/en/3,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."
4,/r/Antonym,/c/en/a.c/n,/c/en/d.c,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc..."
...,...,...,...,...
3410694,/r/UsedFor,/c/en/zoom_lens,/c/en/procure_better_shot,"{""dataset"": ""/d/conceptnet/4/en"", ""license"": ""..."
3410695,/r/UsedFor,/c/en/zoom_lens,/c/en/see_things_bigger,"{""dataset"": ""/d/conceptnet/4/en"", ""license"": ""..."
3410696,/r/UsedFor,/c/en/zoom_lens,/c/en/seeing_distant_object_more_closely,"{""dataset"": ""/d/conceptnet/4/en"", ""license"": ""..."
3410697,/r/UsedFor,/c/en/zoom_lens,/c/en/take_pictures,"{""dataset"": ""/d/conceptnet/4/en"", ""license"": ""..."


In [8]:
df.head()

Unnamed: 0,assertion,rel,subj,obj,metadata
0,"/a/[/r/Antonym/,/c/en/0/n/,/c/en/1/]",/r/Antonym,/c/en/0/n,/c/en/1,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc..."
1,"/a/[/r/Antonym/,/c/en/12_hour_clock/n/,/c/en/2...",/r/Antonym,/c/en/12_hour_clock/n,/c/en/24_hour_clock,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."
2,"/a/[/r/Antonym/,/c/en/24_hour_clock/n/,/c/en/1...",/r/Antonym,/c/en/24_hour_clock/n,/c/en/12_hour_clock,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."
3,"/a/[/r/Antonym/,/c/en/5/n/,/c/en/3/]",/r/Antonym,/c/en/5/n,/c/en/3,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."
4,"/a/[/r/Antonym/,/c/en/a.c/n/,/c/en/d.c/]",/r/Antonym,/c/en/a.c/n,/c/en/d.c,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc..."


In [9]:
len(df)

3410699

In [10]:
df.describe()

Unnamed: 0,assertion,rel,subj,obj,metadata
count,3410699,3410699,3410699,3410699,3410699
unique,3410699,37,1410294,712125,604487
top,"/a/[/r/RelatedTo/,/c/en/postface/n/,/c/en/back/]",/r/RelatedTo,/c/en/person,/c/en/slang,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."
freq,1,1703582,6339,10999,2443411


**Strange: 48 relations**. Let's inspect:

In [11]:
df['rel'].unique()

array(['/r/Antonym', '/r/AtLocation', '/r/CapableOf', '/r/Causes',
       '/r/CausesDesire', '/r/CreatedBy', '/r/DefinedAs',
       '/r/DerivedFrom', '/r/Desires', '/r/DistinctFrom', '/r/Entails',
       '/r/EtymologicallyDerivedFrom', '/r/EtymologicallyRelatedTo',
       '/r/FormOf', '/r/HasA', '/r/HasContext', '/r/HasFirstSubevent',
       '/r/HasLastSubevent', '/r/HasPrerequisite', '/r/HasProperty',
       '/r/HasSubevent', '/r/InstanceOf', '/r/IsA', '/r/LocatedNear',
       '/r/MadeOf', '/r/MannerOf', '/r/MotivatedByGoal',
       '/r/NotCapableOf', '/r/NotDesires', '/r/NotHasProperty',
       '/r/PartOf', '/r/ReceivesAction', '/r/RelatedTo', '/r/SimilarTo',
       '/r/SymbolOf', '/r/Synonym', '/r/UsedFor'], dtype=object)

### SQL-like queries

In [36]:
all_difs=[]
for sym_rel in config.symmetric_rels:
    #if sym_rel!='/r/LocatedNear': continue
    
        
    sub_df=df[df.rel==sym_rel]
    print(sym_rel, len(sub_df))
    
    so_df=sub_df[['subj', 'rel', 'obj', 'metadata']]

#    print(so_df)
    
    os_df=sub_df[['obj', 'rel', 'subj', 'metadata']]
    os_df.columns=['subj', 'rel', 'obj', 'metadata']
    
    the_diff=os_df.merge(so_df,indicator = True, 
                         how='left').loc[lambda x : x['_merge']!='both']

    
    
    print(len(the_diff))
    print()
    all_difs.append(the_diff)

/r/Antonym 19066
18794

/r/DistinctFrom 3315
3263

/r/EtymologicallyRelatedTo 32075
29999

/r/LocatedNear 49
49

/r/RelatedTo 1703582
1692996

/r/SimilarTo 30280
30066

/r/Synonym 222156
177621



In [37]:
additional=pd.concat(all_difs)

In [38]:
len(additional)

1952788

### Load as a graph

In [None]:
G=nx.from_pandas_edgelist(df, 'subj', 'obj', ['rel'], create_using=nx.DiGraph)

### Average degree

In [None]:
sum(dict(G.degree()).values())/float(len(G))

### Neighbors of some node

In [None]:
str_node='/c/en/man'
con_node='/c/en/man/n'
#con_node='/c/en/batman/n/wikt/en_2'

Inspect the string or the concept node?

In [None]:
the_node=con_node

In [None]:
G.has_edge(str_node, con_node)

In [None]:
G.has_edge(con_node, str_node)

In [None]:
G.succ[the_node] # successors of this node

In [None]:
G.pred[the_node]# predecessors of this node

### Save to nodes and edges output files

1. save edges (=basically, the entire pandas dataframe):

In [None]:
edges_file='output/edges.csv'

In [None]:
df.sort_values(by=['subj', 'rel', 'obj', 'metadata']).to_csv(edges_file, index=False, sep='\t', columns=['subj', 'rel', 'obj', 'metadata'])

2. save nodes (=basically the union of the unique values of columns subject and object

In [None]:
nodes_file='output/nodes.csv'

In [None]:
uniq_s=df['subj'].drop_duplicates()
uniq_o=df['obj'].drop_duplicates()
uniq_nodes = pd.concat([uniq_s,uniq_o]).drop_duplicates()

In [None]:
len(uniq_nodes)

In [None]:
uniq_nodes.head()

In [None]:
uniq_nodes.sort_values().to_csv(nodes_file, index=False, sep='\t')