### INSTALL REQUIRED PACKAGE 

In [27]:
#!pip install textblob
#!pip install pycountry #this is to standardize all country names
#!pip install pandas openpyxl requests beautifulsoup4 spacy transformers pyvis
#!pip install pycountry-convert
#!pip install PyPDF2 pandas networkx spacy nltk tensorflow transformers textblob pyvis pycountry numpy

### IMPORT REQUIRED PACKAGE

In [1]:
import os
import PyPDF2
import pandas as pd
import networkx as nx
from collections import defaultdict
from pathlib import Path
import spacy
import nltk
# import tensorflow as tf
from transformers import pipeline
from textblob import TextBlob
from pyvis.network import Network
import pycountry
net = Network(notebook=True)
import numpy as np

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"



In [4]:
#!python -m spacy download en_core_web_sm

In [2]:
#pip install pycountry_convert

In [5]:
# VERSION 2

import requests
import spacy
import pycountry
import re
from collections import defaultdict
from transformers import pipeline, AutoTokenizer
from pyvis.network import Network
import pandas as pd
import math
import pycountry_convert as pc 
import csv


# Load SpaCy NLP model
nlp = spacy.load("en_core_web_sm")

# Load Hugging Face Transformers Sentiment Model
sentiment_pipeline = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

# Load tokenizer for accurate splitting
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

class CountrySentimentAnalyzer:
    def __init__(self):                           
        """Initialize the text analyzer with NLP and data structures."""
        self.nlp = nlp  # Use loaded SpaCy model
        self.entities = defaultdict(set)
        self.relationships = []
        self.country_mentions = defaultdict(int)  # Track how often a country appears

    def clean_text(self, text):
        """Remove special characters, boilerplate content, and extra whitespace."""
        text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters & numbers
        return text.strip()    

    def standardize_country_name(self, country):
        """Standardize country names, including aliases for better recognition."""
        
        country_aliases = {
            "uk": "United Kingdom",
            "u.k.": "United Kingdom",
            "united kingdom": "United Kingdom",
            "the united kingdom": "United Kingdom",
            "great britain": "United Kingdom",
            "gb": "United Kingdom",
            "russia": "Russian Federation",
            "ussr": "Russian Federation",
            "iran": "Islamic Republic of Iran",
            "north korea": "Democratic People's Republic of Korea",
            "south korea": "Republic of Korea",
            "kosovo": "Kosovo",
            "pristina": "Kosovo",
            "us": "United States",
            "u.s.": "United States",
            "u.s.a.": "United States",
            "america": "United States",
        }
        
        country_lower = country.lower().strip()  # Normalize input

        # ✅ First, check in the alias dictionary
        if country_lower in country_aliases:
            return country_aliases[country_lower]

        # ✅ Try PyCountry as fallback
        try:
            return pycountry.countries.lookup(country).name
        except LookupError:
            print(f"❌ Unrecognized country: {country}")
            return None  # Ignore unrecognized countries


    def get_sentiment(self, sentence):
        """Analyze sentiment while handling long sentences (Max: 512 tokens)."""
        max_length = 512  # Model token limit
        
        # Tokenize the sentence
        tokens = tokenizer.tokenize(sentence)
        
        # If the tokenized input is too long, split into chunks
        if len(tokens) > max_length:
            chunks = [" ".join(tokenizer.convert_tokens_to_string(tokens[i:i + max_length])) 
                      for i in range(0, len(tokens), max_length)]
        else:
            chunks = [sentence]  # Use original if within limit
    
        # Analyze sentiment for each chunk separately
        sentiments = []
        for chunk in chunks:
            try:
                result = sentiment_pipeline(chunk)
                sentiments.append(result[0]['label'])
            except RuntimeError as e:
                print(f"Error processing chunk: {e}")
                return "NEUTRAL"  # Default to NEUTRAL if there's an error
    
        # Aggregate sentiment from all chunks
        positive_count = sentiments.count("POSITIVE")
        negative_count = sentiments.count("NEGATIVE")
    
        if positive_count > negative_count:
            return "POSITIVE"
        elif negative_count > positive_count:
            return "NEGATIVE"
        else:
            return "NEUTRAL"

    def extract_entities_and_relationships(self, text, source=None):
        """Extract country entities (GPE) and relationships from text."""
        cleaned_text = self.clean_text(text)
        doc = self.nlp(cleaned_text)
        country_entities = set()

        for ent in doc.ents:
            if ent.label_ == "GPE":
                country_name = self.standardize_country_name(ent.text)
                if country_name:  # Only add valid countries
                    country_entities.add(country_name)
                    self.country_mentions[country_name] += 1  # Track country frequency

        # Store valid countries
        for country in country_entities:
            self.entities['GPE'].add((country, source))

        # Create relationships between countries in the same sentence
        for sent in doc.sents:
            sent_doc = self.nlp(sent.text)
            entities_in_sent = [self.standardize_country_name(e.text) for e in sent_doc.ents if e.label_ == "GPE"]
            entities_in_sent = [e for e in entities_in_sent if e]  # Remove None values

            if len(entities_in_sent) >= 2:
                sentiment = self.get_sentiment(sent.text)  # Get sentiment of the sentence
                for i in range(len(entities_in_sent) - 1):
                    relationship = {
                        'source': entities_in_sent[i],
                        'target': entities_in_sent[i + 1],
                        'sentence': sent.text,
                        'sentiment': sentiment
                    }
                    # print(f"Extracted Relationship: {relationship}")  # Debugging print
                    self.relationships.append(relationship)

    def process_text_files(self, text_folder):
        """Process text files extracted from PDFs and analyze them."""
        text_files = [f for f in os.listdir(text_folder) if f.endswith('.txt')]
        
        for text_file in text_files:
            file_path = os.path.join(text_folder, text_file)
            
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
                    
                print(f"Processing: {text_file}")
                self.extract_entities_and_relationships(text, source=text_file)
            
            except Exception as e:
                print(f"Failed to process {text_file}: {e}")
    

    def save_relationships_to_csv(self, filename="extracted_relationships.csv"):
        """Save extracted country relationships to a CSV file."""
        
        # ✅ Debugging: Check if relationships exist before saving
        if not self.relationships:
            print("⚠️ No relationships were extracted! Check text processing.")
            return  # Exit early if no relationships exist
        
        print(f"✅ Preparing to save {len(self.relationships)} relationships to {filename}...")  # Debugging

        # ✅ Open the file for writing
        with open(filename, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=["source", "target", "sentence", "sentiment"])
            writer.writeheader()

            for relationship in self.relationships:
                print(f"📝 Writing to CSV: {relationship}")  # Debugging
                writer.writerow(relationship)  # ✅ Writing the data

        print(f"✅ Relationships successfully saved to {filename}")


    def get_continent(self, country_name):
        """Map country name to a continent."""
        try:
            country_code = pycountry.countries.lookup(country_name).alpha_2
            continent_code = pc.country_alpha2_to_continent_code(country_code)
            continent_map = {
                "NA": "North America", "SA": "South America", "EU": "Europe",
                "AF": "Africa", "AS": "Asia", "OC": "Oceania"
            }
            return continent_map.get(continent_code, "Unknown")
        except:
            return "Unknown"

    def generate_filtered_pyvis_graph(country="All", sentiment="All", region="All", output_file="country_network_filtered.html"):
        """Generate a Pyvis graph with applied filters."""
        
        relationships, country_mentions = load_data()
    
        net = Network(height="900px", width="100%", bgcolor="#222222", font_color="white", notebook=True, cdn_resources='in_line')
        net.force_atlas_2based(gravity=-30, central_gravity=0.02, spring_length=250, spring_strength=0.1)
    
        # Apply Filters
        filtered_relationships = [
            r for r in relationships
            if (country == "All" or r['source'] == country or r['target'] == country) and
               (sentiment == "All" or r['sentiment'].lower() == sentiment.lower()) and
               (region == "All" or get_continent(r['source']) == region or get_continent(r['target']) == region)
        ]
    
        # Add Nodes
        added_nodes = set()
        for relation in filtered_relationships:
            added_nodes.add(relation['source'])
            added_nodes.add(relation['target'])
    
        for country in added_nodes:
            continent = get_continent(country)
            color = {
                "North America": "red", "South America": "green", "Europe": "blue",
                "Africa": "yellow", "Asia": "purple", "Oceania": "orange"
            }.get(continent, "gray")
    
            size = max(15, min(50, 10 * math.log1p(country_mentions.get(country, 1))))
            net.add_node(country, label=country, color=color, size=size)
    
        # Add Edges
        for relation in filtered_relationships:
            source, target, sentiment = relation["source"], relation["target"], relation["sentiment"]
            edge_color = "green" if sentiment == "POSITIVE" else "red" if sentiment == "NEGATIVE" else "gray"
            net.add_edge(source, target, width=2, color=edge_color, title=relation["sentence"])
    
        net.show(output_file)
        print(f"✅ Graph updated: {output_file}")


if __name__ == "__main__":
    analyzer = CountrySentimentAnalyzer()
    text_folder = "../processed/text"  # Replace with your actual path
    analyzer.process_text_files(text_folder)
    analyzer.save_relationships_to_csv("extracted_relationships.csv")
    analyzer.visualize_relationships("country_network_pdf_v6.html")


Device set to use mps:0


Processing: 45.pdf.txt
Processing: 27.pdf.txt
Processing: 107.pdf.txt
Processing: 11.pdf.txt
Processing: 63.pdf.txt
❌ Unrecognized country: Monrovia


Token indices sequence length is longer than the specified maximum sequence length for this model (551 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2346 > 512). Running this sequence through the model will result in indexing errors


❌ Unrecognized country: Monrovia
Error processing chunk: The size of tensor a (2346) must match the size of tensor b (512) at non-singleton dimension 1
Processing: 2.pdf.txt
❌ Unrecognized country: ATCS  Examined ATCS bank
❌ Unrecognized country: Employee
❌ Unrecognized country: Airport Prishtina
❌ Unrecognized country: London
❌ Unrecognized country: Prishtina
❌ Unrecognized country: Prishtina
❌ Unrecognized country: ATCS  Examined ATCS bank
❌ Unrecognized country: Employee
❌ Unrecognized country: Airport Prishtina
Error processing chunk: The size of tensor a (513) must match the size of tensor b (512) at non-singleton dimension 1
❌ Unrecognized country: London
❌ Unrecognized country: Prishtina
❌ Unrecognized country: Prishtina
Error processing chunk: The size of tensor a (2344) must match the size of tensor b (512) at non-singleton dimension 1
Processing: 73.pdf.txt
❌ Unrecognized country: New York
❌ Unrecognized country: Nairobi
❌ Unrecognized country: New York
❌ Unrecognized country

In [22]:
analyzer.visualize_relationships("country_network_pdf_v1.html")

country_network_pdf_v1.html


In [None]:

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Load Hugging Face Transformers Sentiment Model
sentiment_pipeline = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

class PDFTextAnalyzer:
    def __init__(self):
        """Initialize the text analyzer with NLP and data structures."""
        self.nlp = nlp  # Use loaded SpaCy model
        self.entities = defaultdict(set)
        self.relationships = []

    def standardize_country_name(self, country):
        """Attempt to standardize country names using pycountry."""
        try:
            return pycountry.countries.lookup(country).name
        except LookupError:
            return None  # Ignore non-country entities

    def get_sentiment(self, sentence):
        """Analyze sentiment of a sentence using Transformers."""
        result = sentiment_pipeline(sentence)
        sentiment_label = result[0]['label']  # 'POSITIVE' or 'NEGATIVE'
        confidence = result[0]['score']

        # If confidence is low, classify as NEUTRAL
        if confidence < 0.7:
            return 'NEUTRAL'
        return sentiment_label

    def extract_entities_and_relationships(self, text, source=None):
        """Extract country entities (GPE) and relationships from text."""
        doc = self.nlp(text)
        country_entities = set()

        for ent in doc.ents:
            if ent.label_ == "GPE":
                country_name = self.standardize_country_name(ent.text)
                if country_name:  # Only add valid countries
                    country_entities.add((country_name, source))

        # Store valid countries
        for country in country_entities:
            self.entities['GPE'].add(country)

        # Create relationships between countries in the same sentence
        for sent in doc.sents:
            sent_doc = self.nlp(sent.text)
            entities_in_sent = [self.standardize_country_name(e.text) for e in sent_doc.ents if e.label_ == "GPE"]
            entities_in_sent = [e for e in entities_in_sent if e]  # Remove None values

            if len(entities_in_sent) >= 2:
                sentiment = self.get_sentiment(sent.text)  # Get sentiment of the sentence
                for i in range(len(entities_in_sent) - 1):
                    self.relationships.append({
                        'source': entities_in_sent[i],
                        'target': entities_in_sent[i + 1],
                        'sentence': sent.text,
                        'sentiment': sentiment
                    })

    def extract_text_from_pdf(self, pdf_file):
        """Extract text from a PDF file using PyPDF2."""
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            extracted_text = page.extract_text()
            if extracted_text:
                text += extracted_text
        return text

    def process_directory(self, directory):
        """Process all PDFs in a directory."""
        for filename in os.listdir(directory):
            if filename.endswith(".pdf"):
                pdf_path = os.path.join(directory, filename)
                with open(pdf_path, "rb") as pdf_file:
                    text = self.extract_text_from_pdf(pdf_file)
                    self.extract_entities_and_relationships(text, source=filename)

    
    def visualize_relationships(self, output_file="country_network.html"):
        """Visualize country relationships using Pyvis with sentiment-based colors & filtering."""
        
        # Initialize network graph
        net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", notebook=True, cdn_resources='in_line')
    
        # Improve graph spacing
        net.set_options('''
        var options = {
          "physics": {
            "barnesHut": {
              "gravitationalConstant": -2500,
              "springLength": 300,
              "damping": 0.7
            }
          }
        }
        ''')
    
        # Keep only relationships that appear at least 3 times
        strong_relationships = defaultdict(int)
        for r in self.relationships:
            strong_relationships[(r['source'], r['target'])] += 1
        
        filtered_relationships = [
            r for r in self.relationships if strong_relationships[(r['source'], r['target'])] >= 1
        ]

        # Collect unique country nodes from filtered relationships
        countries = set()
        for relation in filtered_relationships:
            countries.add(relation['source'])
            countries.add(relation['target'])
    
        # Add nodes with size based on mentions
        for country in countries:
            size = max(15, min(50, 8 * math.log1p(self.country_mentions[country])))  # Log-scaled size
            net.add_node(country, label=country, color="blue", size=size)
    
        # Add filtered edges with sentiment-based color
        for relation in filtered_relationships:
            sentiment = relation['sentiment']
            edge_color = {"POSITIVE": "green", "NEGATIVE": "red", "NEUTRAL": "gray"}.get(sentiment, "gray")
            net.add_edge(relation['source'], relation['target'], width=2, color=edge_color, title=relation['sentence'])

if __name__ == "__main__":
    pdf_directory = "/Users/benitaleonardi/Downloads/Datathon pdfs"
    #analyzer = PDFTextAnalyzer()
    #analyzer.process_directory(pdf_directory)  # Process all PDFs in the specified directory
    analyzer.visualize_relationships("country_network.html")  # Visualize the relationships between countries


Device set to use mps:0


country_network.html


### BENITA TESTING

In [26]:
# Load the uploaded CSV file
file_path = "/Users/benitaleonardi/Documents/GitHub/pythoncharmers/summarized_texts.csv"

# Read CSV while handling possible formatting issues
df = pd.read_csv(file_path, on_bad_lines="skip", quoting=csv.QUOTE_NONE, encoding="utf-8")

# first few rows of the datas
df.head()

Unnamed: 0,filename,summarized_text
0,49.pdf,"""Here are the main points of the article:"
1,2. **Context of Allegation**: The envelope was...,attended by multiple parties including a KTA ...
2,3. **Legal Framework**: The investigation invo...,
3,4. **Investigation Methodology**: The Investig...,including the involvement of the KTA Procurem...
4,5. **Reporting and Documentation**: The Divisi...,but records were missing. Relevant documents ...


In [None]:
# VERSION 2

import requests
import spacy
import pycountry
import re
from collections import defaultdict
from transformers import pipeline, AutoTokenizer
from pyvis.network import Network
import pandas as pd
import math
import pycountry_convert as pc 
import csv


# Load SpaCy NLP model
nlp = spacy.load("en_core_web_sm")

# Load Hugging Face Transformers Sentiment Model
sentiment_pipeline = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

# Load tokenizer for accurate splitting
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

class CountrySentimentAnalyzer:
    def __init__(self):                           
        """Initialize the text analyzer with NLP and data structures."""
        self.nlp = nlp  # Use loaded SpaCy model
        self.entities = defaultdict(set)
        self.relationships = []
        self.country_mentions = defaultdict(int)  # Track how often a country appears

    def clean_text(self, text):
        """Remove special characters, boilerplate content, and extra whitespace."""
        text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters & numbers
        return text.strip()    

    def standardize_country_name(self, country):
        """Standardize country names, including aliases for better recognition."""
        
        country_aliases = {
            "uk": "United Kingdom",
            "u.k.": "United Kingdom",
            "united kingdom": "United Kingdom",
            "the united kingdom": "United Kingdom",
            "great britain": "United Kingdom",
            "gb": "United Kingdom",
            "russia": "Russian Federation",
            "ussr": "Russian Federation",
            "iran": "Islamic Republic of Iran",
            "north korea": "Democratic People's Republic of Korea",
            "south korea": "Republic of Korea",
            "kosovo": "Kosovo",
            "pristina": "Kosovo",
            "us": "United States",
            "u.s.": "United States",
            "u.s.a.": "United States",
            "america": "United States",
        }
        
        country_lower = country.lower().strip()  # Normalize input

        # ✅ First, check in the alias dictionary
        if country_lower in country_aliases:
            return country_aliases[country_lower]

        # ✅ Try PyCountry as fallback
        try:
            return pycountry.countries.lookup(country).name
        except LookupError:
            print(f"❌ Unrecognized country: {country}")
            return None  # Ignore unrecognized countries


    def get_sentiment(self, sentence):
        """Analyze sentiment while handling long sentences (Max: 512 tokens)."""
        max_length = 512  # Model token limit
        
        # Tokenize the sentence
        tokens = tokenizer.tokenize(sentence)
        
        # If the tokenized input is too long, split into chunks
        if len(tokens) > max_length:
            chunks = [" ".join(tokenizer.convert_tokens_to_string(tokens[i:i + max_length])) 
                      for i in range(0, len(tokens), max_length)]
        else:
            chunks = [sentence]  # Use original if within limit
    
        # Analyze sentiment for each chunk separately
        sentiments = []
        for chunk in chunks:
            try:
                result = sentiment_pipeline(chunk)
                sentiments.append(result[0]['label'])
            except RuntimeError as e:
                print(f"Error processing chunk: {e}")
                return "NEUTRAL"  # Default to NEUTRAL if there's an error
    
        # Aggregate sentiment from all chunks
        positive_count = sentiments.count("POSITIVE")
        negative_count = sentiments.count("NEGATIVE")
    
        if positive_count > negative_count:
            return "POSITIVE"
        elif negative_count > positive_count:
            return "NEGATIVE"
        else:
            return "NEUTRAL"

    def extract_entities_and_relationships(self, text, source=None):
        """Extract country entities (GPE) and relationships from text."""
        cleaned_text = self.clean_text(text)
        doc = self.nlp(cleaned_text)
        country_entities = set()

        for ent in doc.ents:
            if ent.label_ == "GPE":
                country_name = self.standardize_country_name(ent.text)
                if country_name:  # Only add valid countries
                    country_entities.add(country_name)
                    self.country_mentions[country_name] += 1  # Track country frequency

        # Store valid countries
        for country in country_entities:
            self.entities['GPE'].add((country, source))

        # Create relationships between countries in the same sentence
        for sent in doc.sents:
            sent_doc = self.nlp(sent.text)
            entities_in_sent = [self.standardize_country_name(e.text) for e in sent_doc.ents if e.label_ == "GPE"]
            entities_in_sent = [e for e in entities_in_sent if e]  # Remove None values

            if len(entities_in_sent) >= 2:
                sentiment = self.get_sentiment(sent.text)  # Get sentiment of the sentence
                for i in range(len(entities_in_sent) - 1):
                    relationship = {
                        'source': entities_in_sent[i],
                        'target': entities_in_sent[i + 1],
                        'sentence': sent.text,
                        'sentiment': sentiment
                    }
                    # print(f"Extracted Relationship: {relationship}")  # Debugging print
                    self.relationships.append(relationship)

    def process_text_files(self, text_folder):
        """Process text files extracted from PDFs and analyze them."""
        text_files = [f for f in os.listdir(text_folder) if f.endswith('.txt')]
        
        for text_file in text_files:
            file_path = os.path.join(text_folder, text_file)
            
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
                    
                print(f"Processing: {text_file}")
                self.extract_entities_and_relationships(text, source=text_file)
            
            except Exception as e:
                print(f"Failed to process {text_file}: {e}")
    

    def save_relationships_to_csv(self, filename="extracted_relationships.csv"):
        """Save extracted country relationships to a CSV file."""
        
        # ✅ Debugging: Check if relationships exist before saving
        if not self.relationships:
            print("⚠️ No relationships were extracted! Check text processing.")
            return  # Exit early if no relationships exist
        
        print(f"✅ Preparing to save {len(self.relationships)} relationships to {filename}...")  # Debugging

        # ✅ Open the file for writing
        with open(filename, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=["source", "target", "sentence", "sentiment"])
            writer.writeheader()

            for relationship in self.relationships:
                print(f"📝 Writing to CSV: {relationship}")  # Debugging
                writer.writerow(relationship)  # ✅ Writing the data

        print(f"✅ Relationships successfully saved to {filename}")


    def get_continent(self, country_name):
        """Map country name to a continent."""
        try:
            country_code = pycountry.countries.lookup(country_name).alpha_2
            continent_code = pc.country_alpha2_to_continent_code(country_code)
            continent_map = {
                "NA": "North America", "SA": "South America", "EU": "Europe",
                "AF": "Africa", "AS": "Asia", "OC": "Oceania"
            }
            return continent_map.get(continent_code, "Unknown")
        except:
            return "Unknown"

    def visualize_relationships(self, output_file="country_network.html"):

        print(f"Total Relationships Extracted: {len(self.relationships)}")
        print("Sample Relationships:", self.relationships[:10])  # Print first 10 relationships
        """Visualize country relationships using Pyvis with continent-based color coding and improved spacing."""
        
        net = Network(height="900px", width="100%", bgcolor="#222222", font_color="white", notebook=True, cdn_resources='in_line')
        # net.force_atlas_2based(gravity=-30, central_gravity=0.02, spring_length=250, spring_strength=0.1)
        net.force_atlas_2based(
            gravity=-100,               # Increase repulsion force (default is -30)
            central_gravity=0.01,       # Keeps nodes closer to center
            spring_length=400,          # Increase edge length (default ~250)
            spring_strength=0.1,        # Adjust attraction force
            damping=0.9                 # Slow down node movement
        )


        # Step 1: Keep only the top 30 most mentioned countries
        N = 30
        top_countries = sorted(self.country_mentions.items(), key=lambda x: x[1], reverse=True)[:N]
        top_countries = {country for country, _ in top_countries}
    
        # Step 2: Track relationships and set a lower threshold (>= 2 mentions)
        strong_relationships = defaultdict(int)
        for r in self.relationships:
            strong_relationships[(r['source'], r['target'])] += 1


        filtered_relationships = [
            r for r in self.relationships 
            if strong_relationships[(r['source'], r['target'])] >= 3  # Increase minimum edge occurrences
            and r['source'] in top_countries 
            and r['target'] in top_countries
        ]

        # **Step 3: Ensure added_nodes set exists BEFORE using it**
        added_nodes = set()
    
        # Step 4: Add nodes with continent-based colors
        for country in top_countries:
            continent = self.get_continent(country)
            color = {
                "North America": "red", "South America": "green", "Europe": "blue",
                "Africa": "yellow", "Asia": "purple", "Oceania": "orange"
            }.get(continent, "gray")
    
            size = max(15, min(50, 10 * math.log1p(self.country_mentions[country])))
            net.add_node(
                country, 
                label=country, 
                color=color, 
                size=size, 
                font={'size': 15}  # Reduce font size (default is 20+)
            )
            added_nodes.add(country)  # ✅ Track that the node was added
    
        # Step 5: Add edges with sentiment-based colors

        
        for relation in filtered_relationships:
            source, target, sentiment = relation["source"], relation["target"], relation["sentiment"]
    
            # Ensure both nodes exist before adding the edge
            if source in added_nodes and target in added_nodes:
                edge_color = "green" if sentiment == "POSITIVE" else "red" if sentiment == "NEGATIVE" else "gray"
                net.add_edge(source, target, width=2, color=edge_color, title=relation["sentence"])
    
        # Save and show the graph
        net.show(output_file)

if __name__ == "__main__":
    analyzer = CountrySentimentAnalyzer()
    text_folder = "../processed/text"  # Replace with your actual path
    analyzer.process_text_files(text_folder)
    analyzer.save_relationships_to_csv("extracted_relationships.csv")
    analyzer.visualize_relationships("country_network_pdf_v1.html")
