In [2]:
!pip install scipy
!pip install matplotlib


Defaulting to user installation because normal site-packages is not writeable


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\AAKAS\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip



Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\AAKAS\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import json
from collections import defaultdict
from itertools import product
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix
from py2neo import Graph, Node, Relationship
import matplotlib.pyplot as plt



# Setting up Connection to NEO4J

In [4]:
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password123"

# File path

In [5]:
file_path = "train.json"


# Creating citation graph...

In [None]:
def create_neo4j_graph(file_path, uri="bolt://localhost:7687", user="neo4j", password="password"):
    """
    Create citation graph in Neo4j from JSON data
    Direction: paper -> CITES -> reference
    """
    # Connect to Neo4j
    graph = Graph(uri, auth=(user, password))
    
    # Clear existing graph
    graph.run("MATCH (n) DETACH DELETE n")
    
    # Create constraint for uniqueness
    graph.run("CREATE CONSTRAINT paper_id IF NOT EXISTS FOR (p:Paper) REQUIRE p.id IS UNIQUE")
    
    # Read and process JSON file
    papers_processed = 0
    with open(file_path, 'r') as file:
        tx = graph.begin()
        batch_size = 1000
        current_batch = 0
        
        for line in file:
            data = json.loads(line)
            paper_id = data['paper']
            references = data.get('reference', [])  # Using get() with default empty list
            
            # Create citing paper node (always create this, even with no references)
            citing_paper = Node("Paper", id=paper_id)
            tx.merge(citing_paper, "Paper", "id")
            
            # Create reference nodes and relationships only if references exist
            if references:
                for ref in references:
                    cited_paper = Node("Paper", id=ref)
                    tx.merge(cited_paper, "Paper", "id")
                    # Create directed CITES relationship: citing_paper -> cited_paper
                    cites = Relationship(citing_paper, "CITES", cited_paper)
                    tx.create(cites)
            
            current_batch += 1
            papers_processed += 1
            
            if current_batch >= batch_size:
                tx.commit()
                tx = graph.begin()
                current_batch = 0
                print(f"Processed {papers_processed} papers...")
        
        if current_batch > 0:
            tx.commit()
            
    print(f"Total papers processed: {papers_processed}")
    
    # Verify graph creation
    result = graph.run("MATCH (p:Paper) RETURN count(p) as paper_count").data()[0]
    print(f"Total nodes in graph: {result['paper_count']}")
    result = graph.run("MATCH ()-[r:CITES]->() RETURN count(r) as cite_count").data()[0]
    print(f"Total citation relationships: {result['cite_count']}")
create_neo4j_graph(file_path, NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)


# Fetching Graph from Database

In [None]:

def get_graph_data_from_neo4j(uri="bolt://localhost:7687", user="neo4j", password="password123"):
    """
    Retrieve graph structure from Neo4j for SimRank calculation
    Returns dictionary mapping node -> list of predecessors (papers that cite this node)
    """
    graph = Graph(uri, auth=(user, password))
    
    # Get all citation relationships (note the direction: cited<-citing)         
    query = """
    MATCH (cited:Paper)<-[r:CITES]-(citing:Paper)
    RETURN cited.id as cited, collect(citing.id) as citing_papers
    """
    
    # Initialize predecessor dictionary
    pred_dict = {}
    
    # Get all nodes (including those without citations)
    all_nodes_query = "MATCH (p:Paper) RETURN p.id as id"
    for record in graph.run(all_nodes_query):
        pred_dict[record['id']] = []
    
    # Add citation relationships
    for record in graph.run(query):
        cited_id = record['cited']
        citing_papers = record['citing_papers']
        pred_dict[cited_id] = citing_papers
    
    return pred_dict
 


In [7]:
get_graph_data_from_neo4j(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)

{'2963981420': ['2897268228',
  '2963163009',
  '2627042741',
  '2995428361',
  '2554302513',
  '2963363373',
  '2963576971',
  '2963545688',
  '2995281597',
  '2928560789',
  '2924515500',
  '2995629810',
  '2945335799',
  '2984140583',
  '2968558981',
  '2798603777',
  '2942998388',
  '2933543903',
  '2990170043',
  '2945047231',
  '2904607988',
  '2996874060',
  '2736953746',
  '2963521187',
  '2965862350',
  '2981609437',
  '2976540144',
  '2965100203',
  '2963910036',
  '2905272567',
  '2936927208',
  '2995463996',
  '2996663285',
  '2984618279',
  '3009921999',
  '2948135617',
  '2996019573'],
 '2963446712': ['2989808579',
  '2963155035',
  '2964081807',
  '2897268228',
  '2995428361',
  '2883320311',
  '2963782415',
  '2963420686',
  '2963996760',
  '2891438060',
  '2963857746',
  '2970778145',
  '2955488837',
  '2981383995',
  '2782417188',
  '2963073398',
  '2921569601',
  '2963585656',
  '2799034895',
  '2964067226',
  '2965166097',
  '2970828071',
  '2964125708',
  '28020949

# Checking SimRank Similarity

In [9]:
import networkx as nx

def compute_partial_simrank(graph, query_nodes, importance_factor, max_iterations=10, tolerance=1e-4):
    simrank = {node: {} for node in query_nodes}
    for q1 in query_nodes:
        for q2 in query_nodes:
            if q1 == q2:
                simrank[q1][q2] = 1.0  # Self-similarity is always 1
            else:
                simrank[q1][q2] = 0.0  # Initialize similarity to 0

    for _ in range(max_iterations):
        prev_simrank = {q1: simrank[q1].copy() for q1 in query_nodes}
        for q1 in query_nodes:
            for q2 in query_nodes:
                if q1 != q2:
                    q1_neighbors = set(graph.neighbors(q1)) & query_nodes
                    q2_neighbors = set(graph.neighbors(q2)) & query_nodes
                    if q1_neighbors and q2_neighbors:
                        simrank[q1][q2] = (
                            importance_factor
                            * __builtins__.sum(
                                prev_simrank[n1][n2]
                                for n1 in q1_neighbors
                                for n2 in q2_neighbors
                                if n1 in prev_simrank and n2 in prev_simrank[n1]
                            )
                            / (len(q1_neighbors) * len(q2_neighbors))
                        )

        # Check for convergence
        max_diff = __builtins__.max(
            __builtins__.abs(simrank[q1][q2] - prev_simrank[q1][q2])
            for q1 in query_nodes
            for q2 in query_nodes
        )
        if max_diff < tolerance:
            break

    return simrank



graph = nx.DiGraph(get_graph_data_from_neo4j(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)) 

# Define query nodes and importance factor
query_nodes = {'2963981420', '2897268228'}
importance_factor = 0.8

# Compute SimRank for the query nodes
partial_simrank_results = compute_partial_simrank(graph, query_nodes, importance_factor)

# Display the results
print(partial_simrank_results)


{'2897268228': {'2897268228': 1.0, '2963981420': 0.0}, '2963981420': {'2897268228': 0.0, '2963981420': 1.0}}


In [30]:
# Define query nodes and importance factor
query_nodes = {'2963981420', '2897268228'}
importance_factor = 0.9

# Compute SimRank for the query nodes
partial_simrank_results = compute_partial_simrank(graph, query_nodes, importance_factor)

# Display the results
print(partial_simrank_results)

{'2897268228': {'2897268228': 1.0, '2963981420': 0.0}, '2963981420': {'2897268228': 0.0, '2963981420': 1.0}}


In [31]:
# Define query nodes and importance factor
query_nodes = {'2963981420', '2897268228'}
importance_factor = 0.7

# Compute SimRank for the query nodes
partial_simrank_results = compute_partial_simrank(graph, query_nodes, importance_factor)

# Display the results
print(partial_simrank_results)

{'2897268228': {'2897268228': 1.0, '2963981420': 0.0}, '2963981420': {'2897268228': 0.0, '2963981420': 1.0}}
