Loading libraries

In [None]:
!pip install spacy
!python -m spacy download ru_core_news_sm
!pip install neo4j

In [None]:
import os
import shutil

import pandas as pd
import spacy
from neo4j import GraphDatabase

Loading the file with the extracted terms

In [None]:
# Load the table of previously extracted terms.
df = pd.read_csv('extracted_terms_with_links.csv')
# Preview the first rows only instead of rendering the whole frame.
df.head()

The extracted terms serve as the keyword vocabulary for Klink

In [None]:
# Klink keyword vocabulary: the 'term' column of the extracted-terms table.
vocab = df.loc[:, 'term']

Preprocessing news data via spacy

In [None]:
# Read the news table and replace every missing value with an empty string,
# as one chained expression rather than an in-place mutation.
ten_years = pd.read_csv('ten_years_with_good_format.csv').fillna('')

In [None]:
# spaCy pipeline used by the lemmatisation cell below.
SPACY_MODEL_NAME = 'ru_core_news_sm'
nlp = spacy.load(SPACY_MODEL_NAME)

In [None]:
# Lemmatise every article: each text becomes its token lemmas joined by spaces.
# `nlp.pipe` streams the texts through the pipeline in batches, which avoids the
# overhead of `DataFrame.iterrows()` plus one separate `nlp(...)` call per row.
news_texts = (str(text) for text in ten_years['full_news'])

lemmatized_full_news = [
    ' '.join(token.lemma_ for token in doc)
    for doc in nlp.pipe(news_texts)
]

Additionally saving the preprocessed data

In [None]:
# Put the lemmatised texts next to the metadata columns carried over from the
# source table, then persist the result.
full_lemmatized = pd.DataFrame({'lemm_full_news': lemmatized_full_news})

for column in ('date', 'organizations', 'persons'):
    full_lemmatized[column] = ten_years[column]

full_lemmatized.to_csv('all_news_lemmatized.csv', index=False)

Importing modules and creating the ontology

In [None]:
from ontology_module import create_ontology
from ontology_module import visualise_ontology

In [None]:
# Build the ontology from the lemmatised corpus and the term vocabulary, then
# hand it to the visualisation step; `written` is what gets exported below.
# NOTE(review): the meaning of the `20` and `True` arguments is defined in
# ontology_module and is not visible here — confirm before changing them.
lemmatized_corpus = full_lemmatized['lemm_full_news']
created = create_ontology(lemmatized_corpus, vocab, 20)
written = visualise_ontology(created, True, 20)

Function to save the ontology into two CSV files: one for nodes and one for edges

In [None]:
def graph_to_csv(vocab, node_file, edge_file):
    """Write an ontology mapping to a node CSV and an edge CSV.

    Parameters
    ----------
    vocab : mapping
        Maps each word to an iterable of entries. For every entry, ``entry[0]``
        is the related word and ``entry[1]['weight']`` is the edge weight; it
        produces the edge row ``[entry[0], word, weight]``.
    node_file : str or path-like
        Destination of the node list (one ``Word`` column, no duplicates).
    edge_file : str or path-like
        Destination of the edge list (``Word``, ``Subtopic_of``, ``Weight``).

    Returns
    -------
    None
        The function only writes the two files.
    """
    all_edges = []
    # A dict is used as an insertion-ordered set: iterating a plain `set` would
    # make the row order of the node file vary from run to run.
    all_nodes = {}

    for word, entries in vocab.items():
        for entry in entries:
            all_edges.append([entry[0], word, entry[1]['weight']])
            all_nodes[entry[0]] = None
        all_nodes[word] = None

    nodes = pd.DataFrame(list(all_nodes), columns=['Word'])
    edges = pd.DataFrame(all_edges, columns=['Word', 'Subtopic_of', 'Weight'])

    nodes.to_csv(node_file, index=False)
    edges.to_csv(edge_file, index=False)

Saving the ontology

In [None]:
# Both output files share one base name; the suffix tells nodes from edges.
ontology_prefix = 'klink_lenta'
node_name = f'{ontology_prefix}_nodes.csv'
edge_name = f'{ontology_prefix}_edges.csv'

graph_to_csv(written, node_name, edge_name)

Uploading the ontology into Neo4j

In [None]:
# Copy both ontology files into the `import` directory of the Neo4j DBMS so the
# LOAD CSV statements further down can read them.
#
# The files are picked up under the names they were just written with (relative
# to the notebook's working directory) rather than through a second hard-coded
# absolute source path, so the copy always takes the freshly written files.
# The import directory is machine-specific: set the NEO4J_IMPORT_DIR environment
# variable, or edit the default below, to match your installation.
neo4j_import_dir = os.environ.get(
    "NEO4J_IMPORT_DIR",
    r"C:\Users\anutk\.Neo4jDesktop\relate-data\dbmss\dbms-f12510d2-d551-4266-b729-ac22ffeb04b4\import",
)

for file_name in (edge_name, node_name):
    src = os.path.abspath(file_name)
    dst = os.path.join(neo4j_import_dir, file_name)
    shutil.copyfile(src, dst)

Class for establishing a connection with Neo4j server

[Source code](https://habr.com/ru/articles/650623/)

In [None]:
class Neo4jConnection:
    """Thin wrapper around a Neo4j driver that runs each query in its own session.

    The object can be used as a context manager so the driver is closed
    automatically::

        with Neo4jConnection(uri, user, password) as conn:
            conn.query("RETURN 1")
    """

    def __init__(self, uri, user, password):
        """Create the driver for ``uri``, authenticating as ``user``/``password``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the `with` body.
        return False

    def close(self):
        """Close the underlying driver, if there is one."""
        if self.driver is not None:
            self.driver.close()

    def query(self, query, db=None, parameters=None):
        """Run ``query`` and return its records as a list.

        Parameters
        ----------
        query : str
            Cypher statement to execute.
        db : str, optional
            Database to run against; when omitted the session is opened
            without a ``database`` argument.
        parameters : dict, optional
            Query parameters handed to ``session.run`` so that values do not
            have to be spliced into the Cypher text.

        Returns
        -------
        list or None
            The result records, or ``None`` if opening the session or running
            the query raised; the error is printed, not re-raised.
        """
        assert self.driver is not None, "Driver not initialized!"
        response = None
        session_kwargs = {} if db is None else {"database": db}
        try:
            # The `with` block closes the session on success and on failure.
            with self.driver.session(**session_kwargs) as session:
                response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        return response

Creating the database and uploading the ontology

In [None]:
# Connection settings are read from the environment when present (NEO4J_URI,
# NEO4J_USER, NEO4J_PASSWORD) so real credentials do not have to be stored in the
# notebook. The fallbacks are the local values this notebook was written with.
conn = Neo4jConnection(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "12345678"),
)
# (Re)create the target database; an existing `testDb` is replaced.
conn.query("CREATE OR REPLACE DATABASE testDb")

In [None]:
# Create one `Word` node per row of the nodes file, storing the CSV `Word`
# column in the node's `word` property.
# NOTE(review): the statement uses CREATE, so running this cell again without
# recreating the database first adds every node a second time.
query_string = '''
LOAD CSV WITH HEADERS FROM 'file:///klink_lenta_nodes.csv' AS line FIELDTERMINATOR ','
CREATE (word:Word {word: line.Word});
'''
conn.query(query_string, db='testDb')

In [None]:
# For every row of the edges file, look up the two `Word` nodes named in the
# `Word` and `Subtopic_of` columns and link them with a SUBTOPIC relationship
# pointing from the `Subtopic_of` node to the `Word` node. MERGE does not
# create the relationship again if it already exists.
# NOTE(review): the CSV's `Weight` column is not referenced by this statement,
# so edge weights are not stored in the graph — confirm this is intended.
query_string = '''
LOAD CSV WITH HEADERS FROM 'file:///klink_lenta_edges.csv' AS line FIELDTERMINATOR ','
MATCH (word1:Word {word: line.Word}), (word2:Word {word: line.Subtopic_of})
MERGE (word1)<-[:SUBTOPIC]-(word2);
'''
conn.query(query_string, db='testDb')