# Organization Relationships Analysis

In [1]:
#!pip install numpy


In [14]:
#!pip install pymupdf

In [17]:
import fitz  # PyMuPDF
import spacy
import pandas as pd
import os
import json
from tqdm import tqdm
import networkx as nx
from pyvis.network import Network
from transformers import pipeline

In [19]:
# spaCy English pipeline: provides the sentence splitting and ORG entity
# recognition used by extract_relationships.
nlp = spacy.load("en_core_web_sm")

# Hugging Face text-classification pipeline backed by a RoBERTa sentiment
# checkpoint; each prediction is a dict carrying a "label" entry.
sentiment_pipeline = pipeline(
    task="text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
)


# Configuration: every filesystem location the notebook reads or writes.
CONFIG = {
    "paths": {
        "pdf_dir": "./pdfs",  # Directory containing the input PDFs
        # CSV file that receives the detailed relationship rows.
        "output_csv": "./processed/organizations-relationships.csv",
        # Alias of "output_csv": the key name says JSON but the path is a
        # .csv file. Kept so any existing lookup of this key keeps working.
        "output_json": "./processed/organizations-relationships.csv",
    }
}

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use mps:0


In [49]:
# Function to extract text from PDFs
def extract_text_from_pdfs(pdf_dir):
    """Read every PDF directly inside ``pdf_dir`` and return its text.

    Args:
        pdf_dir: Path of the directory to scan (sub-directories are not
            visited).

    Returns:
        A list of ``(text, filename)`` tuples, one per PDF, ordered by
        filename. ``text`` is the text of all pages joined with newlines;
        ``filename`` is the bare file name, not the full path.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory cannot
            be listed (for example, it does not exist).
    """
    texts = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # result (and everything derived from it) reproducible across runs.
    for filename in sorted(os.listdir(pdf_dir)):
        # Compare the extension case-insensitively so "REPORT.PDF" is kept.
        if not filename.lower().endswith(".pdf"):
            continue
        filepath = os.path.join(pdf_dir, filename)
        with fitz.open(filepath) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        texts.append((text, filename))  # Store text with filename
    return texts

# Function to extract organization names and relationships
def extract_relationships(text, filename):
    """Find sentences that mention two organizations and label their sentiment.

    The text is parsed with the module-level ``nlp`` pipeline. For each
    sentence containing at least two *distinct* ORG entities, the first two
    (in order of first mention) are recorded as a relationship and the
    sentence is classified with the module-level ``sentiment_pipeline``.

    Args:
        text: Raw document text.
        filename: Name of the file the text came from; copied into every
            detailed row.

    Returns:
        A ``(relationships, full_relationships)`` tuple where
        ``relationships`` is a list of ``(org1, org2)`` tuples and
        ``full_relationships`` is a parallel list of
        ``[org1, org2, sentence, sentiment_label, filename]`` rows.
    """
    doc = nlp(text)
    relationships = []
    full_relationships = []  # Store extra details separately

    for sent in doc.sents:
        # dict.fromkeys drops repeated mentions while keeping first-mention
        # order, so a sentence naming the same organization twice no longer
        # produces a meaningless (org, org) self-pair.
        orgs = list(dict.fromkeys(
            ent.text for ent in sent.ents if ent.label_ == "ORG"
        ))
        if len(orgs) < 2:
            continue

        org1, org2 = orgs[0], orgs[1]

        # The 512-character slice bounds the input size but not the token
        # count, so also ask the tokenizer to truncate explicitly.
        prediction = sentiment_pipeline(
            sent.text[:512], truncation=True, max_length=512
        )[0]
        sentiment_label = prediction["label"]

        # Store only (org1, org2) for visualization
        relationships.append((org1, org2))

        # Store full details separately
        full_relationships.append(
            [org1, org2, sent.text.strip(), sentiment_label, filename]
        )

    return relationships, full_relationships


# Function to save relationships as CSV
def save_relationships_to_csv(relationships, output_path):
    """Write detailed relationship rows to a CSV file.

    Args:
        relationships: Iterable of 5-item rows in the order
            ``source, target, sentence, sentiment, source_file``.
        output_path: Destination of the CSV file. Missing parent
            directories are created when this is a filesystem path.

    Returns:
        None. The file at ``output_path`` is written without the index
        column.
    """
    df = pd.DataFrame(
        relationships,
        columns=["source", "target", "sentence", "sentiment", "source_file"],
    )
    # to_csv does not create directories, so make sure the parent exists.
    # Anything that is not a path (e.g. an open file object) is passed
    # through untouched.
    if isinstance(output_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(output_path)
        if parent_dir:  # empty when output_path is a bare file name
            os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(output_path, index=False)

# Function to visualize organization relationships
def visualize_relationships(relationships, output_html="organization_network.html"):
    """Render organization pairs as an undirected network in an HTML file.

    Args:
        relationships: Iterable of 2-item lists/tuples ``(org1, org2)``.
            Anything else is reported on stdout and skipped.
        output_html: Name of the HTML file to write. Defaults to the file
            name this function has always used.

    Returns:
        Whatever ``Network.show`` returns, so a notebook cell ending with
        this call can display it.
        NOTE(review): presumably an inline frame in notebook mode; confirm
        against the installed pyvis version.
    """
    G = nx.Graph()
    for rel in relationships:
        if isinstance(rel, (list, tuple)) and len(rel) == 2:
            org1, org2 = rel
            G.add_edge(org1, org2)
        else:
            print(f"Skipping invalid relationship: {rel}")  # Debugging statement
    net = Network(notebook=True, directed=False)
    net.from_nx(G)
    return net.show(output_html)


In [55]:
# Build the input here so this cell runs on a fresh kernel; no other cell in
# the notebook assigns `texts_with_sources`.
texts_with_sources = extract_text_from_pdfs(CONFIG["paths"]["pdf_dir"])

all_relationships = []
detailed_relationships = []

for text, filename in texts_with_sources:
    simple_rels, full_rels = extract_relationships(text, filename)

    # Keep only well-formed (org1, org2) pairs; report and drop anything else
    # so the graph never receives a malformed entry.
    for rel in simple_rels:
        if isinstance(rel, (list, tuple)) and len(rel) == 2:
            all_relationships.append(rel)
        else:
            print(f"Invalid relationship found and skipped: {rel}")

    detailed_relationships.extend(full_rels)  # Store full details separately

# Save full relationships to CSV if needed. "output_json" is the key CONFIG
# defines for this path (it points at a .csv file despite its name).
## save_relationships_to_csv(detailed_relationships, CONFIG["paths"]["output_json"])

visualize_relationships(all_relationships)



organization_network.html


In [56]:
###all_relationships