# Organization Relationships Analysis

In [None]:
#!pip install numpy
#!pip install openai spacy requests pandas networkx pyvis fitz

In [None]:
#!pip install pymupdf

In [1]:
import os
import fitz  # PyMuPDF
import openai
import spacy
import pandas as pd
import networkx as nx
from pyvis.network import Network
import requests

In [None]:
# Load spaCy model for named entity recognition (NER)
#nlp = spacy.load("en_core_web_sm")

# Load RoBERTa sentiment analysis model
#sentiment_pipeline = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")


# Configuration
#CONFIG = {
#    "paths": {
#        "pdf_dir": "./pdfs",  # Directory containing PDFs
#        "output_json": "./processed/organizations-relationships.csv",  # Output JSON file
#    }
#}

In [3]:
import os
import fitz  # PyMuPDF
import openai
import spacy
import pandas as pd
import networkx as nx
from pyvis.network import Network
import requests
from transformers import pipeline

# Load spaCy model for Named Entity Recognition (NER)
nlp = spacy.load("en_core_web_sm")

# Load Hugging Face sentiment analysis model
sentiment_pipeline = pipeline("sentiment-analysis")

# OpenAI API Key (Set this securely in your environment)
client = openai.OpenAI(api_key="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") 

# Function to summarize text using OpenAI GPT-4o-mini
def summarize_text(text):
    prompt = """Identify the main points in the article provided.
    Given these main points, find relationships involving entities of type Organization.
    \n\nArticle:\n""" + text[:4000]  # Truncate to avoid API limits

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            store=True,
            messages=[{"role": "user", "content": prompt}]
        )
        summary = completion.choices[0].message.content
        return summary.strip()
    except Exception as e:
        print(f"Error during summarization: {e}")
        return text  # Fallback to original text if API fails

# Function to normalize organization names using DBpedia Spotlight
def normalize_org_name(org_name):
    url = "https://api.dbpedia-spotlight.org/en/annotate"
    headers = {"Accept": "application/json"}
    params = {"text": org_name, "confidence": 0.5}
    
    try:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            data = response.json()
            if "Resources" in data:
                return data["Resources"][0]["@URI"].split("/")[-1]  # Extract DBpedia title
    except Exception as e:
        print(f"Error normalizing {org_name}: {e}")

    return org_name  # Return original if not found

# Function to analyze sentiment of a sentence
def get_sentiment(sentence):
    """Analyze sentiment of a sentence using Transformers."""
    result = sentiment_pipeline(sentence)
    sentiment_label = result[0]['label']  # 'POSITIVE' or 'NEGATIVE'
    confidence = result[0]['score']

    # If confidence is low, classify as NEUTRAL
    if confidence < 0.7:
        return 'NEUTRAL'
    return sentiment_label

# Function to extract text from PDFs
def extract_text_from_pdfs(pdf_dir):
    texts = []
    for filename in os.listdir(pdf_dir):
        if filename.endswith(".pdf"):
            filepath = os.path.join(pdf_dir, filename)
            with fitz.open(filepath) as doc:
                text = "\n".join([page.get_text("text") for page in doc])
                
                # Summarize the extracted text before further processing
                summarized_text = summarize_text(text)
                
                texts.append((summarized_text, filename))  # Store summarized text with filename
    return texts

# Function to extract organization relationships with sentiment
def extract_organization_relationships(text, filename):
    """Extract organization entities and relationships from text."""
    doc = nlp(text)
    relationships = []

    # Create relationships between organizations in the same sentence
    for sent in doc.sents:
        sent_doc = nlp(sent.text)
        entities_in_sent = [normalize_org_name(e.text) for e in sent_doc.ents if e.label_ == "ORG"]
        entities_in_sent = [e for e in entities_in_sent if e]  # Remove None values

        if len(entities_in_sent) >= 2:
            sentiment = get_sentiment(sent.text)  # Get sentiment of the sentence
            for i in range(len(entities_in_sent) - 1):
                relationships.append({
                    'source': entities_in_sent[i],
                    'target': entities_in_sent[i + 1],
                    'sentence': sent.text,
                    'sentiment': sentiment,
                    'source_file': filename
                })

    return relationships

# Function to save relationships as CSV
def save_relationships_to_csv(relationships, output_path):
    df = pd.DataFrame(relationships, columns=["source", "target", "sentence", "sentiment", "source_file"])
    df.to_csv(output_path, index=False)

# Function to visualize organization relationships with sentiment-based colors
def visualize_relationships(relationships):
    """Visualize organization relationships using Pyvis with sentiment-based edge colors."""
    net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", notebook=True, cdn_resources='in_line')

    # Add nodes (organizations)
    organizations = set()
    for relation in relationships:
        organizations.add(relation['source'])
        organizations.add(relation['target'])

    for org in organizations:
        net.add_node(org, label=org, color="blue", size=15)

    # Add edges with sentiment-based color
    for relation in relationships:
        sentiment = relation['sentiment']
        if sentiment == "POSITIVE":
            edge_color = "green"
        elif sentiment == "NEGATIVE":
            edge_color = "red"
        else:
            edge_color = "gray"

        net.add_edge(relation['source'], relation['target'], width=2, color=edge_color, title=relation['sentence'])

    # Save and show visualization
    net.show("organization_network.html")

# Main script to process PDFs
pdf_dir = "/Users/benitaleonardi/Downloads/Datathon pdfs"
texts_with_sources = extract_text_from_pdfs(pdf_dir)

all_relationships = []

for text, filename in texts_with_sources:
    relationships = extract_organization_relationships(text, filename)
    all_relationships.extend(relationships)

# Save relationships to CSV
output_csv_path = "output_relationships.csv"
save_relationships_to_csv(all_relationships, output_csv_path)

# Visualize the relationships with sentiment-based edges
visualize_relationships(all_relationships)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


organization_network.html
