# Organization Relationships Analysis

In [1]:
# %pip install "numpy<2"  # pin below 2.x: compiled packages in this environment were built against NumPy 1.x


In [2]:
# %pip install pymupdf==1.25.2

Collecting pymupdf
  Downloading pymupdf-1.25.2-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.2-cp39-abi3-win_amd64.whl (16.5 MB)
   ---------------------------------------- 0.0/16.5 MB ? eta -:--:--
   ---- ----------------------------------- 1.8/16.5 MB 12.6 MB/s eta 0:00:02
   -------- ------------------------------- 3.7/16.5 MB 11.5 MB/s eta 0:00:02
   ---------------- ----------------------- 6.8/16.5 MB 12.7 MB/s eta 0:00:01
   -------------------------- ------------- 11.0/16.5 MB 14.0 MB/s eta 0:00:01
   ---------------------------------------  16.5/16.5 MB 17.1 MB/s eta 0:00:01
   ---------------------------------------- 16.5/16.5 MB 16.5 MB/s eta 0:00:00
Installing collected packages: pymupdf
Successfully installed pymupdf-1.25.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import fitz  # PyMuPDF
import spacy
import pandas as pd
import os
import json
from tqdm import tqdm
import networkx as nx
from pyvis.network import Network
from transformers import pipeline


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\Davina\anaconda3\envs\smubia_datathon\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\Davina\anaconda3\envs\smubia_datathon\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\Davina\anaconda3\envs\smubia_datathon\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    s

In [3]:
# Load spaCy model for named entity recognition (NER)
nlp = spacy.load("en_core_web_sm")

# Load RoBERTa sentiment analysis model
sentiment_pipeline = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")


# Configuration
# Destination of the relationships table. The file is written as CSV.
OUTPUT_CSV_PATH = "./processed/organizations-relationships.csv"

CONFIG = {
    "paths": {
        "pdf_dir": "./pdfs",  # Directory containing PDFs
        # Key looked up by the pipeline cell when saving the results.
        "output_csv": OUTPUT_CSV_PATH,
        # Legacy name for the same CSV path, kept so existing lookups keep working.
        "output_json": OUTPUT_CSV_PATH,
    }
}

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [4]:
# Function to extract text from PDFs
def extract_text_from_pdfs(pdf_dir):
    """Read every PDF directly inside ``pdf_dir`` and return its plain text.

    Args:
        pdf_dir: Path of the directory to scan. Sub-directories are not
            descended into.

    Returns:
        A list of ``(text, filename)`` tuples, one per PDF, ordered by
        filename. ``text`` is the text of all pages joined with newlines.

    Raises:
        OSError: If ``pdf_dir`` cannot be listed.
    """
    texts = []
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs process (and therefore emit) documents in the same order.
    for filename in sorted(os.listdir(pdf_dir)):
        # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        filepath = os.path.join(pdf_dir, filename)
        with fitz.open(filepath) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        texts.append((text, filename))  # Store text with filename
    return texts

# Function to extract organization names and relationships
def extract_relationships(text, filename):
    """Collect organization-to-organization relationship records from ``text``.

    The text is run through the module-level ``nlp`` pipeline and inspected
    sentence by sentence. A sentence produces one record when it contains at
    least two entities labelled ``ORG`` and at least one token tagged ``VERB``.

    Args:
        text: Raw document text.
        filename: Name of the file the text came from; copied into each record.

    Returns:
        A list of records shaped
        ``[source, target, verb, sentence, sentiment, filename]`` where
        ``source``/``target`` are the first two ORG entities of the sentence,
        ``verb`` is the lemma of its first verb, and ``sentiment`` is one of
        ``"negative"``, ``"positive"`` or ``"neutral"``.
    """
    polar_labels = ("negative", "positive")
    records = []

    for sentence in nlp(text).sents:
        org_names = [entity.text for entity in sentence.ents if entity.label_ == "ORG"]
        verb_lemmas = [tok.lemma_ for tok in sentence if tok.pos_ == "VERB"]

        # A relationship needs two parties and an action linking them.
        if len(org_names) < 2 or not verb_lemmas:
            continue

        # Only the first 512 characters of the sentence are sent to the classifier.
        predicted = sentiment_pipeline(sentence.text[:512])[0]["label"]
        # Any label other than the two polar ones is reported as neutral.
        sentiment = predicted if predicted in polar_labels else "neutral"

        records.append([
            org_names[0],           # source organization
            org_names[1],           # target organization
            verb_lemmas[0],         # first verb found
            sentence.text.strip(),  # full sentence
            sentiment,              # sentiment
            filename,               # source file
        ])

    return records

# Function to save relationships as CSV
def save_relationships_to_csv(relationships, output_path):
    """Write relationship records to ``output_path`` as a CSV file.

    Args:
        relationships: Iterable of 6-item records in the order
            ``source, target, verb, sentence, sentiment, source_file``.
            An empty list produces a header-only file.
        output_path: Destination of the CSV. When it is a filesystem path,
            missing parent directories are created first.

    Returns:
        None. The CSV is written without the DataFrame index.
    """
    df = pd.DataFrame(relationships, columns=["source", "target", "verb", "sentence", "sentiment", "source_file"])
    # to_csv does not create directories; make sure the target folder exists.
    # Non-path destinations (e.g. file-like objects) are passed through untouched.
    if isinstance(output_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(output_path, index=False)

# Function to visualize organization relationships
def visualize_relationships(relationships, output_path="organization_network.html"):
    """Render the organization graph as an interactive HTML page.

    Args:
        relationships: Iterable of records whose first two items are the
            source and target organization. Both plain ``(org1, org2)`` pairs
            and the longer rows returned by ``extract_relationships`` are
            accepted; any items after the first two are ignored.
        output_path: File name handed to ``Network.show``.

    Returns:
        None.
    """
    G = nx.Graph()
    for row in relationships:
        # Index instead of tuple-unpacking: rows from extract_relationships
        # carry six fields, so "org1, org2 = row" would raise ValueError.
        org1, org2 = row[0], row[1]
        G.add_edge(org1, org2)
    net = Network(notebook=True, directed=False)
    net.from_nx(G)
    net.show(output_path)

In [5]:
# Run the pipeline: PDF text -> relationship records -> CSV on disk.
texts_with_sources = extract_text_from_pdfs(CONFIG["paths"]["pdf_dir"])

all_relationships = []
# NER plus sentiment classification is slow per document, so show progress.
for text, filename in tqdm(texts_with_sources, desc="Extracting relationships"):
    all_relationships.extend(extract_relationships(text, filename))

# Prefer an "output_csv" entry when the config has one; otherwise fall back to
# "output_json", the key under which the CSV path has been stored so far.
paths = CONFIG["paths"]
output_csv_path = paths.get("output_csv") or paths["output_json"]
save_relationships_to_csv(all_relationships, output_csv_path)

RuntimeError: Numpy is not available