# Organization Relationships Analysis

In [None]:
#!pip install numpy
#!pip install openai spacy requests pandas networkx pyvis fitz

In [None]:
#!pip install pymupdf

In [14]:
import os
import fitz  # PyMuPDF
import spacy
import pandas as pd
import requests
import csv
import math
from collections import defaultdict
from transformers import pipeline
from pyvis.network import Network
import pycountry
import pycountry_convert as pc  # For mapping countries to continents
import openai

In [11]:
# Load spaCy model for Named Entity Recognition (NER) ~ Detects organization names in text.
nlp = spacy.load("en_core_web_sm") 

# Load Hugging Face sentiment analysis model ~ Analyzes the emotional tone of text.
sentiment_pipeline = pipeline("sentiment-analysis")

# OpenAI API Key (Set this securely in your environment)
client = openai.OpenAI(api_key="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") 

# Function to summarize text using OpenAI GPT-4o-mini
def summarize_text(text):
    prompt = """Identify the main points in the article provided.
    Given these main points, find relationships involving entities of type Organization.
    \n\nArticle:\n""" + text[:4000]  # Truncate to avoid API limits

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            store=True,
            messages=[{"role": "user", "content": prompt}]
        )
        summary = completion.choices[0].message.content
        return summary.strip()
    except Exception as e:
        print(f"Error during summarization: {e}")
        return text  # Fallback to original text if API fails

# Function to normalize organization names using DBpedia Spotlight
def normalize_org_name(org_name):
    url = "https://api.dbpedia-spotlight.org/en/annotate"
    headers = {"Accept": "application/json"}
    params = {"text": org_name, "confidence": 0.5}
    
    try:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            data = response.json()
            if "Resources" in data:
                return data["Resources"][0]["@URI"].split("/")[-1]  # Extract DBpedia title
    except Exception as e:
        print(f"Error normalizing {org_name}: {e}")

    return org_name  # Return original if not found

# Function to analyze sentiment of a sentence
def get_sentiment(sentence):
    """Analyze sentiment of a sentence using Transformers."""
    result = sentiment_pipeline(sentence)
    sentiment_label = result[0]['label']  # 'POSITIVE' or 'NEGATIVE'
    confidence = result[0]['score']

    # If confidence is low, classify as NEUTRAL
    if confidence < 0.7:
        return 'NEUTRAL'
    return sentiment_label

# Function to extract and save summarized text from PDFs
def extract_text_from_pdfs(pdf_dir, output_csv="summarized_texts.csv"):
    texts = []
    
    with open(output_csv, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["filename", "summarized_text"])  # CSV Header

        for filename in os.listdir(pdf_dir):
            if filename.endswith(".pdf"):
                filepath = os.path.join(pdf_dir, filename)
                with fitz.open(filepath) as doc:
                    text = "\n".join([page.get_text("text") for page in doc])
                    
                    # Summarize the extracted text before further processing
                    summarized_text = summarize_text(text)
                    
                    # Save to CSV
                    writer.writerow([filename, summarized_text])
                    
                    # Store in list for further processing
                    texts.append((summarized_text, filename))  

    return texts  # Return summarized texts if needed


# Function to extract organization relationships with sentiment
def extract_organization_relationships(text, filename):
    """Extract organization entities and relationships from text."""
    doc = nlp(text)
    relationships = []

    # Create relationships between organizations in the same sentence
    for sent in doc.sents:
        sent_doc = nlp(sent.text)
        entities_in_sent = [normalize_org_name(e.text) for e in sent_doc.ents if e.label_ == "ORG"]
        entities_in_sent = [e for e in entities_in_sent if e]  # Remove None values

        if len(entities_in_sent) >= 2:
            sentiment = get_sentiment(sent.text)  # Get sentiment of the sentence
            for i in range(len(entities_in_sent) - 1):
                relationships.append({
                    'source': entities_in_sent[i],
                    'target': entities_in_sent[i + 1],
                    'sentence': sent.text,
                    'sentiment': sentiment,
                    'source_file': filename
                })

    return relationships

# Function to save relationships as CSV
def save_relationships_to_csv(relationships, output_path):
    df = pd.DataFrame(relationships, columns=["source", "target", "sentence", "sentiment", "source_file"])
    df.to_csv(output_path, index=False)

# Function to visualize organization relationships with sentiment-based colors
def visualize_relationships(relationships):
    """Visualize organization relationships using Pyvis with sentiment-based edge colors."""
    net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", notebook=True, cdn_resources='in_line')

    # Add nodes (organizations)
    organizations = set()
    for relation in relationships:
        organizations.add(relation['source'])
        organizations.add(relation['target'])

    for org in organizations:
        net.add_node(org, label=org, color="blue", size=15)

    # Add edges with sentiment-based color
    for relation in relationships:
        sentiment = relation['sentiment']
        if sentiment == "POSITIVE":
            edge_color = "green"
        elif sentiment == "NEGATIVE":
            edge_color = "red"
        else:
            edge_color = "gray"

        net.add_edge(relation['source'], relation['target'], width=2, color=edge_color, title=relation['sentence'])

    # Save and show visualization
    net.show("organization_network.html")

# Main script to process PDFs
pdf_dir = "/Users/benitaleonardi/Downloads/Datathon pdfs"
texts_with_sources = extract_text_from_pdfs(pdf_dir)

all_relationships = []

for text, filename in texts_with_sources:
    relationships = extract_organization_relationships(text, filename)
    all_relationships.extend(relationships)

# Save relationships to CSV
output_csv_path = "output_relationships.csv"
save_relationships_to_csv(all_relationships, output_csv_path)

# Visualize the relationships with sentiment-based edges
visualize_relationships(all_relationships)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


KeyboardInterrupt: 

### USE THIS INSTEAD

In [17]:
# Load spaCy NLP model
nlp = spacy.load("en_core_web_sm")

# Load Hugging Face sentiment analysis model
sentiment_pipeline = pipeline("sentiment-analysis")

# OpenAI API Key (Set this securely in your environment)
client = openai.OpenAI(api_key="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") 

# Function to summarize text using OpenAI GPT-4o-mini
def summarize_text(text):
    """Summarize text and extract key organization relationships using OpenAI GPT-4o-mini."""
    prompt = """Identify the main points in the article provided.
    Given these main points, find relationships involving entities of type Organization.
    \n\nArticle:\n""" + text[:4000]  # Truncate to avoid API limits

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            store=True,
            messages=[{"role": "user", "content": prompt}]
        )
        summary = completion.choices[0].message.content
        return summary.strip()
    except Exception as e:
        print(f"Error during summarization: {e}")
        return text  # Fallback to original text if API fails

# Function to normalize organization names using DBpedia Spotlight
def normalize_org_name(org_name):
    """Normalize organization names using DBpedia Spotlight API."""
    url = "https://api.dbpedia-spotlight.org/en/annotate"
    headers = {"Accept": "application/json"}
    params = {"text": org_name, "confidence": 0.5}
    
    try:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            data = response.json()
            if "Resources" in data:
                return data["Resources"][0]["@URI"].split("/")[-1]  # Extract DBpedia title
    except Exception as e:
        print(f"Error normalizing {org_name}: {e}")

    return org_name  # Return original if not found

# Function to analyze sentiment of a sentence
def get_sentiment(sentence):
    """Analyze sentiment of a sentence using Transformers."""
    result = sentiment_pipeline(sentence)
    sentiment_label = result[0]['label']  # 'POSITIVE' or 'NEGATIVE'
    confidence = result[0]['score']

    # If confidence is low, classify as NEUTRAL
    if confidence < 0.7:
        return 'NEUTRAL'
    return sentiment_label

# Function to extract and save summarized text from PDFs
def extract_text_from_pdfs(pdf_dir, output_csv="summarized_texts.csv"):
    """Extract text from PDFs, summarize using GPT, and save to CSV."""
    texts = []
    
    with open(output_csv, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["filename", "summarized_text"])  # CSV Header

        for filename in os.listdir(pdf_dir):
            if filename.endswith(".pdf"):
                filepath = os.path.join(pdf_dir, filename)
                with fitz.open(filepath) as doc:
                    text = "\n".join([page.get_text("text") for page in doc])
                    
                    # Summarize the extracted text before further processing
                    summarized_text = summarize_text(text)
                    
                    # Save to CSV
                    writer.writerow([filename, summarized_text])
                    
                    # Store in list for further processing
                    texts.append((summarized_text, filename))  

    return texts  # Return summarized texts if needed

# Function to extract relationships from text
def extract_relationships(text, filename, entity_label):
    """Extract relationships for a specific entity type (Country or Organization)."""
    doc = nlp(text)
    relationships = []

    for sent in doc.sents:
        sent_doc = nlp(sent.text)
        entities_in_sent = [e.text for e in sent_doc.ents if e.label_ == entity_label]
        entities_in_sent = list(set(entities_in_sent))  # Remove duplicates

        if len(entities_in_sent) >= 2:
            sentiment = get_sentiment(sent.text)
            for i in range(len(entities_in_sent) - 1):
                relationships.append({
                    'source': entities_in_sent[i],
                    'target': entities_in_sent[i + 1],
                    'sentence': sent.text,
                    'sentiment': sentiment,
                    'source_file': filename
                })
    
    return relationships

# Function to save relationships as CSV
def save_relationships_to_csv(relationships, output_path):
    df = pd.DataFrame(relationships, columns=["source", "target", "sentence", "sentiment", "source_file"])
    df.to_csv(output_path, index=False)

# Function to visualize relationships with filtering
def visualize_filtered_relationships(relationships, entity_type, output_file="network.html"):
    """Generate an interactive Pyvis graph with filtering for top entities."""
    
    net = Network(height="900px", width="100%", bgcolor="#222222", font_color="white", notebook=True, cdn_resources='in_line')
    net.force_atlas_2based(gravity=-30, central_gravity=0.02, spring_length=250, spring_strength=0.1)
    
    # Step 1: Count entity mentions
    entity_mentions = defaultdict(int)
    for rel in relationships:
        entity_mentions[rel["source"]] += 1
        entity_mentions[rel["target"]] += 1
    
    # Step 2: Keep only the top 30 most mentioned entities
    N = 30
    top_entities = sorted(entity_mentions.items(), key=lambda x: x[1], reverse=True)[:N]
    top_entities = {entity for entity, _ in top_entities}

    # Step 3: Filter relationships (keep only strong ones)
    strong_relationships = defaultdict(int)
    for r in relationships:
        strong_relationships[(r["source"], r["target"])] += 1
    
    filtered_relationships = [
        r for r in relationships if strong_relationships[(r["source"], r["target"])] >= 4
        and r["source"] in top_entities and r["target"] in top_entities
    ]

    # Step 4: Add nodes
    for entity in top_entities:
        color = "blue" if entity_type == "country" else "cyan"
        net.add_node(entity, label=entity, color=color, size=15)

    # Step 5: Add edges
    for relation in filtered_relationships:
        edge_color = "green" if relation["sentiment"] == "POSITIVE" else "red" if relation["sentiment"] == "NEGATIVE" else "gray"
        net.add_edge(relation["source"], relation["target"], width=2, color=edge_color, title=relation["sentence"])

    # Save and show the graph
    net.show(output_file)
    print(f"✅ Graph saved as {output_file}")

# Main script to process PDFs
pdf_dir = "/Users/benitaleonardi/Downloads/Datathon pdfs"
texts_with_sources = extract_text_from_pdfs(pdf_dir)

country_relationships = []
organization_relationships = []

for text, filename in texts_with_sources:
    country_relationships.extend(extract_relationships(text, filename, "GPE"))
    organization_relationships.extend(extract_relationships(text, filename, "ORG"))

# Save relationships to separate CSV files
save_relationships_to_csv(country_relationships, "country_relationships.csv")
save_relationships_to_csv(organization_relationships, "organization_relationships.csv")

# Visualize
visualize_filtered_relationships(country_relationships, "country", "country_network.html")
visualize_filtered_relationships(organization_relationships, "organization", "organization_network.html")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


country_network.html
✅ Graph saved as country_network.html
organization_network.html
✅ Graph saved as organization_network.html
