### INSTALL REQUIRED PACKAGE 

In [27]:
#!pip install textblob
#!pip install pycountry #this is to standardize all country names
#!pip install pandas openpyxl requests beautifulsoup4 spacy transformers pyvis
#!pip install pycountry-convert
#!pip install PyPDF2 pandas networkx spacy nltk tensorflow transformers textblob pyvis pycountry numpy

### IMPORT REQUIRED PACKAGE

In [3]:
import os
import PyPDF2
import pandas as pd
import networkx as nx
from collections import defaultdict
from pathlib import Path
import spacy
import nltk
import tensorflow as tf
from transformers import pipeline
from textblob import TextBlob
from pyvis.network import Network
import pycountry
net = Network(notebook=True)
import numpy as np

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"



In [4]:
#!python -m spacy download en_core_web_sm

### LOAD the CSV file

In [9]:
# Load the uploaded CSV file
file_path = "/Users/benitaleonardi/Documents/news_excerpts_parsed.csv"
df = pd.read_csv(file_path)

# first few rows of the datas

In [32]:
df.head()

Unnamed: 0,Link,Text
0,https://edition.cnn.com/2023/09/29/business/st...,Starbucks violated federal labor law when it i...
1,https://www.channelnewsasia.com/singapore/su-w...,The first suspect to plead guilty in Singapore...
2,https://edition.cnn.com/2023/05/22/tech/meta-f...,Meta has been fined a record-breaking €1.2 bil...
3,https://www.channelnewsasia.com/singapore/bill...,SINGAPORE: A 45-year-old man linked to Singapo...
4,https://edition.cnn.com/2024/03/05/politics/li...,The Department of Education imposed a record $...


In [33]:
df.size()

TypeError: 'int' object is not callable

### LOAD the PDFs

In [9]:
# VERSION 1

import requests
from bs4 import BeautifulSoup
import spacy
import pycountry
from collections import defaultdict
from transformers import pipeline
from pyvis.network import Network
import pandas as pd

# Load SpaCy NLP model
nlp = spacy.load("en_core_web_sm")

# Load Hugging Face Transformers Sentiment Model
sentiment_pipeline = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

class CountrySentimentAnalyzer:
    def __init__(self):
        """Initialize the text analyzer with NLP and data structures."""
        self.nlp = nlp  # Use loaded SpaCy model
        self.entities = defaultdict(set)
        self.relationships = []

    def standardize_country_name(self, country):
        """Attempt to standardize country names using pycountry."""
        try:
            return pycountry.countries.lookup(country).name
        except LookupError:
            return None  # Ignore non-country entities

    def get_sentiment(self, sentence):
        """Analyze sentiment of a sentence using Transformers."""
        result = sentiment_pipeline(sentence)
        sentiment_label = result[0]['label']  # 'POSITIVE' or 'NEGATIVE'
        confidence = result[0]['score']

        # If confidence is low, classify as NEUTRAL
        if confidence < 0.7:
            return 'NEUTRAL'
        return sentiment_label

    def extract_entities_and_relationships(self, text, source=None):
        """Extract country entities (GPE) and relationships from text."""
        doc = self.nlp(text)
        country_entities = set()

        for ent in doc.ents:
            if ent.label_ == "GPE":
                country_name = self.standardize_country_name(ent.text)
                if country_name:  # Only add valid countries
                    country_entities.add((country_name, source))

        # Store valid countries
        for country in country_entities:
            self.entities['GPE'].add(country)

        # Create relationships between countries in the same sentence
        for sent in doc.sents:
            sent_doc = self.nlp(sent.text)
            entities_in_sent = [self.standardize_country_name(e.text) for e in sent_doc.ents if e.label_ == "GPE"]
            entities_in_sent = [e for e in entities_in_sent if e]  # Remove None values

            if len(entities_in_sent) >= 2:
                sentiment = self.get_sentiment(sent.text)  # Get sentiment of the sentence
                for i in range(len(entities_in_sent) - 1):
                    self.relationships.append({
                        'source': entities_in_sent[i],
                        'target': entities_in_sent[i + 1],
                        'sentence': sent.text,
                        'sentiment': sentiment
                    })

    def scrape_webpage(self, url):
        """Scrape webpage content from a given URL."""
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            paragraphs = [p.text for p in soup.find_all("p")]
            return " ".join(paragraphs)  # Return all text as a single string
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            return ""

    def process_csv(self, csv_file):
        """Process CSV file and analyze extracted text."""
        df = pd.read_csv(csv_file)

        for _, row in df.iterrows():
            link = row.get("Link", "").strip()
            text = row.get("Text", "").strip()

            if link:
                print(f"Scraping: {link}")
                web_text = self.scrape_webpage(link)
                self.extract_entities_and_relationships(web_text, source=link)
            
            if text:
                self.extract_entities_and_relationships(text, source="CSV Data")

    def visualize_relationships(self, output_file="country_network.html"):
        """Visualize country relationships using Pyvis with sentiment-based colors."""
        net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", notebook=True, cdn_resources='in_line')

        # Add nodes (countries)
        countries = set()
        for relation in self.relationships:
            countries.add(relation['source'])
            countries.add(relation['target'])

        for country in countries:
            net.add_node(country, label=country, color="blue", size=15)

        # Add edges with sentiment-based color
        for relation in self.relationships:
            sentiment = relation['sentiment']
            if sentiment == "POSITIVE":
                edge_color = "green"
            elif sentiment == "NEGATIVE":
                edge_color = "red"
            else:
                edge_color = "gray"

            net.add_edge(relation['source'], relation['target'], width=2, color=edge_color, title=relation['sentence'])

        # Save and show visualization
        net.show(output_file)

if __name__ == "__main__":
    csv_file = "/Users/benitaleonardi/Documents/news_excerpts_parsed_mini.csv"

    analyzer = CountrySentimentAnalyzer()

    # Process CSV data (scrape web content and analyze text)
    analyzer.process_csv(csv_file)

    # Visualize the combined network from CSV data
    analyzer.visualize_relationships("country_network_scrape.html")


Device set to use mps:0


Scraping: https://edition.cnn.com/2023/09/29/business/starbucks-union-wages/index.html
Scraping: https://www.channelnewsasia.com/singapore/su-wenqiang-pleads-guilty-billion-dollar-money-laundering-convicted-4234731
Scraping: https://edition.cnn.com/2023/05/22/tech/meta-facebook-data-privacy-eu-fine/index.html
Scraping: https://www.channelnewsasia.com/singapore/billion-dollar-money-laundering-case-zhang-ruijin-sentenced-15-months-jail-4302416
Scraping: https://edition.cnn.com/2024/03/05/politics/liberty-university-fined-campus-safety/index.html
Scraping: https://www.euronews.com/2024/02/23/judge-convicts-former-austrian-chancellor-sebastian-kurz
Scraping: https://edition.cnn.com/2022/07/21/economy/china-fines-didi-data-law-violation-intl-hnk/index.html
Scraping: https://www.brusselstimes.com/justice-belgium/1011990/two-alleged-leaders-of-vast-drug-trafficking-operation-looking-at-20-years-in-prison
Failed to fetch https://www.brusselstimes.com/justice-belgium/1011990/two-alleged-leaders

In [None]:
# VERSION 2

## the country_network_optimized_vr4 is >= 4

import requests
from bs4 import BeautifulSoup
import spacy
import pycountry
import re
from collections import defaultdict
from transformers import pipeline, AutoTokenizer
from pyvis.network import Network
import pandas as pd
import math
import pycountry_convert as pc 

# Load SpaCy NLP model
nlp = spacy.load("en_core_web_sm")

# Load Hugging Face Transformers Sentiment Model
sentiment_pipeline = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

# Load tokenizer for accurate splitting
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

class CountrySentimentAnalyzer:
    def __init__(self):                           
        """Initialize the text analyzer with NLP and data structures."""
        self.nlp = nlp  # Use loaded SpaCy model
        self.entities = defaultdict(set)
        self.relationships = []
        self.country_mentions = defaultdict(int)  # Track how often a country appears

    def clean_text(self, text):
        """Remove special characters, boilerplate content, and extra whitespace."""
        text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters & numbers
        return text.strip()

    def standardize_country_name(self, country):
        """Attempt to standardize country names using pycountry."""
        try:
            return pycountry.countries.lookup(country).name
        except LookupError:
            return None  # Ignore non-country entities

    def get_sentiment(self, sentence):
        """Analyze sentiment while handling long sentences (Max: 512 tokens)."""
        max_length = 512  # Model token limit
        
        # Tokenize the sentence
        tokens = tokenizer.tokenize(sentence)
        
        # If the tokenized input is too long, split into chunks
        if len(tokens) > max_length:
            chunks = [" ".join(tokenizer.convert_tokens_to_string(tokens[i:i + max_length])) 
                      for i in range(0, len(tokens), max_length)]
        else:
            chunks = [sentence]  # Use original if within limit
    
        # Analyze sentiment for each chunk separately
        sentiments = []
        for chunk in chunks:
            try:
                result = sentiment_pipeline(chunk)
                sentiments.append(result[0]['label'])
            except RuntimeError as e:
                print(f"Error processing chunk: {e}")
                return "NEUTRAL"  # Default to NEUTRAL if there's an error
    
        # Aggregate sentiment from all chunks
        positive_count = sentiments.count("POSITIVE")
        negative_count = sentiments.count("NEGATIVE")
    
        if positive_count > negative_count:
            return "POSITIVE"
        elif negative_count > positive_count:
            return "NEGATIVE"
        else:
            return "NEUTRAL"

    def extract_entities_and_relationships(self, text, source=None):
        """Extract country entities (GPE) and relationships from text."""
        cleaned_text = self.clean_text(text)
        doc = self.nlp(cleaned_text)
        country_entities = set()

        for ent in doc.ents:
            if ent.label_ == "GPE":
                country_name = self.standardize_country_name(ent.text)
                if country_name:  # Only add valid countries
                    country_entities.add(country_name)
                    self.country_mentions[country_name] += 1  # Track country frequency

        # Store valid countries
        for country in country_entities:
            self.entities['GPE'].add((country, source))

        # Create relationships between countries in the same sentence
        for sent in doc.sents:
            sent_doc = self.nlp(sent.text)
            entities_in_sent = [self.standardize_country_name(e.text) for e in sent_doc.ents if e.label_ == "GPE"]
            entities_in_sent = [e for e in entities_in_sent if e]  # Remove None values

            if len(entities_in_sent) >= 2:
                sentiment = self.get_sentiment(sent.text)  # Get sentiment of the sentence
                for i in range(len(entities_in_sent) - 1):
                    self.relationships.append({
                        'source': entities_in_sent[i],
                        'target': entities_in_sent[i + 1],
                        'sentence': sent.text,
                        'sentiment': sentiment
                    })

    def scrape_webpage(self, url):
        """Scrape webpage content from a given URL."""
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            paragraphs = [p.text for p in soup.find_all("p")]
            return " ".join(paragraphs)  # Return all text as a single string
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            return ""

    def process_csv(self, csv_file):
        """Process CSV file and analyze extracted text."""
        df = pd.read_csv(csv_file)

        for _, row in df.iterrows():
            link = row.get("Link", "").strip()
            text = row.get("Text", "").strip()

            if link:
                print(f"Scraping: {link}")
                web_text = self.scrape_webpage(link)
                self.extract_entities_and_relationships(web_text, source=link)
            
            if text:
                self.extract_entities_and_relationships(text, source="CSV Data")

    def get_continent(self, country_name):
        """Map country name to a continent."""
        try:
            country_code = pycountry.countries.lookup(country_name).alpha_2
            continent_code = pc.country_alpha2_to_continent_code(country_code)
            continent_map = {
                "NA": "North America", "SA": "South America", "EU": "Europe",
                "AF": "Africa", "AS": "Asia", "OC": "Oceania"
            }
            return continent_map.get(continent_code, "Unknown")
        except:
            return "Unknown"

    def visualize_relationships(self, output_file="country_network.html"):
        """Visualize country relationships using Pyvis with continent-based color coding and improved spacing."""
        
        net = Network(height="900px", width="100%", bgcolor="#222222", font_color="white", notebook=True, cdn_resources='in_line')
        net.force_atlas_2based(gravity=-30, central_gravity=0.02, spring_length=250, spring_strength=0.1)
    
        # Step 1: Keep only the top 30 most mentioned countries
        N = 30
        top_countries = sorted(self.country_mentions.items(), key=lambda x: x[1], reverse=True)[:N]
        top_countries = {country for country, _ in top_countries}
    
        # Step 2: Track relationships and set a lower threshold (>= 2 mentions)
        strong_relationships = defaultdict(int)
        for r in self.relationships:
            strong_relationships[(r['source'], r['target'])] += 1
    
        filtered_relationships = [
            r for r in self.relationships if strong_relationships[(r['source'], r['target'])] >= 3
            and r['source'] in top_countries and r['target'] in top_countries
        ]

        # **Step 3: Ensure added_nodes set exists BEFORE using it**
        added_nodes = set()
    
        # Step 4: Add nodes with continent-based colors
        for country in top_countries:
            continent = self.get_continent(country)
            color = {
                "North America": "red", "South America": "green", "Europe": "blue",
                "Africa": "yellow", "Asia": "purple", "Oceania": "orange"
            }.get(continent, "gray")
    
            size = max(15, min(50, 10 * math.log1p(self.country_mentions[country])))
            net.add_node(country, label=country, color=color, size=size)
            added_nodes.add(country)  # ✅ Track that the node was added
    
        # Step 5: Add edges with sentiment-based colors
        for relation in filtered_relationships:
            source, target, sentiment = relation["source"], relation["target"], relation["sentiment"]
    
            # Ensure both nodes exist before adding the edge
            if source in added_nodes and target in added_nodes:
                edge_color = "green" if sentiment == "POSITIVE" else "red" if sentiment == "NEGATIVE" else "gray"
                net.add_edge(source, target, width=2, color=edge_color, title=relation["sentence"])
    
        # Save and show the graph
        net.show(output_file)

if __name__ == "__main__":
    analyzer = CountrySentimentAnalyzer()
    csv_file = "/Users/benitaleonardi/Documents/news_excerpts_parsed.csv"
    analyzer.process_csv(csv_file)
    analyzer.visualize_relationships("country_network_optimized_vr5.html")

Device set to use mps:0


Scraping: https://edition.cnn.com/2023/09/29/business/starbucks-union-wages/index.html
Scraping: https://www.channelnewsasia.com/singapore/su-wenqiang-pleads-guilty-billion-dollar-money-laundering-convicted-4234731
Scraping: https://edition.cnn.com/2023/05/22/tech/meta-facebook-data-privacy-eu-fine/index.html
Scraping: https://www.channelnewsasia.com/singapore/billion-dollar-money-laundering-case-zhang-ruijin-sentenced-15-months-jail-4302416
Scraping: https://edition.cnn.com/2024/03/05/politics/liberty-university-fined-campus-safety/index.html
Scraping: https://www.euronews.com/2024/02/23/judge-convicts-former-austrian-chancellor-sebastian-kurz
Scraping: https://edition.cnn.com/2022/07/21/economy/china-fines-didi-data-law-violation-intl-hnk/index.html
Scraping: https://www.brusselstimes.com/justice-belgium/1011990/two-alleged-leaders-of-vast-drug-trafficking-operation-looking-at-20-years-in-prison
Scraping: https://www.expats.cz/czech-news/article/former-czech-mp-gets-three-year-jail-t

Token indices sequence length is longer than the specified maximum sequence length for this model (643 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2343 > 512). Running this sequence through the model will result in indexing errors


Error processing chunk: The size of tensor a (2343) must match the size of tensor b (512) at non-singleton dimension 1
Scraping: https://yle.fi/a/3-12659444
Scraping: https://icelandmonitor.mbl.is/news/news/2021/10/21/sentenced_to_16_years_in_prison/
Scraping: https://www.ndtv.com/world-news/renault-nissan-alliance-carlos-ghosn-france-issues-international-arrest-warrant-against-ex-nissan-head-2909763
Scraping: https://edition.cnn.com/2023/04/19/tech/china-seagate-huawei-penalty-hnk-intl/index.html
Scraping: https://www.channelnewsasia.com/asia/woman-plead-guilty-negligence-caring-2-babies-nurul-shahira-3504176
Scraping: https://edition.cnn.com/2023/05/03/business/mcdonalds-child-labor-louisville/index.html
Scraping: https://edition.cnn.com/2023/04/07/europe/russia-chinese-lgbtq-blogger-arrested-intl/index.html
Scraping: https://edition.cnn.com/2021/07/30/tech/amazon-eu-privacy-fine/index.html
Scraping: https://edition.cnn.com/2024/03/07/europe/portugal-election-intl-cmd/index.html
Scra

In [18]:

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Load Hugging Face Transformers Sentiment Model
sentiment_pipeline = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

class PDFTextAnalyzer:
    def __init__(self):
        """Initialize the text analyzer with NLP and data structures."""
        self.nlp = nlp  # Use loaded SpaCy model
        self.entities = defaultdict(set)
        self.relationships = []

    def standardize_country_name(self, country):
        """Attempt to standardize country names using pycountry."""
        try:
            return pycountry.countries.lookup(country).name
        except LookupError:
            return None  # Ignore non-country entities

    def get_sentiment(self, sentence):
        """Analyze sentiment of a sentence using Transformers."""
        result = sentiment_pipeline(sentence)
        sentiment_label = result[0]['label']  # 'POSITIVE' or 'NEGATIVE'
        confidence = result[0]['score']

        # If confidence is low, classify as NEUTRAL
        if confidence < 0.7:
            return 'NEUTRAL'
        return sentiment_label

    def extract_entities_and_relationships(self, text, source=None):
        """Extract country entities (GPE) and relationships from text."""
        doc = self.nlp(text)
        country_entities = set()

        for ent in doc.ents:
            if ent.label_ == "GPE":
                country_name = self.standardize_country_name(ent.text)
                if country_name:  # Only add valid countries
                    country_entities.add((country_name, source))

        # Store valid countries
        for country in country_entities:
            self.entities['GPE'].add(country)

        # Create relationships between countries in the same sentence
        for sent in doc.sents:
            sent_doc = self.nlp(sent.text)
            entities_in_sent = [self.standardize_country_name(e.text) for e in sent_doc.ents if e.label_ == "GPE"]
            entities_in_sent = [e for e in entities_in_sent if e]  # Remove None values

            if len(entities_in_sent) >= 2:
                sentiment = self.get_sentiment(sent.text)  # Get sentiment of the sentence
                for i in range(len(entities_in_sent) - 1):
                    self.relationships.append({
                        'source': entities_in_sent[i],
                        'target': entities_in_sent[i + 1],
                        'sentence': sent.text,
                        'sentiment': sentiment
                    })

    def extract_text_from_pdf(self, pdf_file):
        """Extract text from a PDF file using PyPDF2."""
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            extracted_text = page.extract_text()
            if extracted_text:
                text += extracted_text
        return text

    def process_directory(self, directory):
        """Process all PDFs in a directory."""
        for filename in os.listdir(directory):
            if filename.endswith(".pdf"):
                pdf_path = os.path.join(directory, filename)
                with open(pdf_path, "rb") as pdf_file:
                    text = self.extract_text_from_pdf(pdf_file)
                    self.extract_entities_and_relationships(text, source=filename)

    
    def visualize_relationships(self, output_file="country_network.html"):
        """Visualize country relationships using Pyvis with sentiment-based colors & filtering."""
        
        # Initialize network graph
        net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", notebook=True, cdn_resources='in_line')
    
        # Improve graph spacing
        net.set_options('''
        var options = {
          "physics": {
            "barnesHut": {
              "gravitationalConstant": -2500,
              "springLength": 300,
              "damping": 0.7
            }
          }
        }
        ''')
    
        # Keep only relationships that appear at least 3 times
        strong_relationships = defaultdict(int)
        for r in self.relationships:
            strong_relationships[(r['source'], r['target'])] += 1
        
        filtered_relationships = [
            r for r in self.relationships if strong_relationships[(r['source'], r['target'])] >= 3
        ]

        # Collect unique country nodes from filtered relationships
        countries = set()
        for relation in filtered_relationships:
            countries.add(relation['source'])
            countries.add(relation['target'])
    
        # Add nodes with size based on mentions
        for country in countries:
            size = max(15, min(50, 8 * math.log1p(self.country_mentions[country])))  # Log-scaled size
            net.add_node(country, label=country, color="blue", size=size)
    
        # Add filtered edges with sentiment-based color
        for relation in filtered_relationships:
            sentiment = relation['sentiment']
            edge_color = {"POSITIVE": "green", "NEGATIVE": "red", "NEUTRAL": "gray"}.get(sentiment, "gray")
            net.add_edge(relation['source'], relation['target'], width=2, color=edge_color, title=relation['sentence'])

if __name__ == "__main__":
    pdf_directory = "/Users/benitaleonardi/Downloads/Datathon pdfs"
    #analyzer = PDFTextAnalyzer()
    #analyzer.process_directory(pdf_directory)  # Process all PDFs in the specified directory
    analyzer.visualize_relationships("country_network.html")  # Visualize the relationships between countries


Device set to use mps:0


country_network.html
