#### Import Stuff

In [2]:
#Import Stuff
import feedparser
import spacy
import os
from dateutil.parser import parse
import ast
from bs4 import BeautifulSoup
import re
import sqlite3
import pandas as pd

#### Feed DB Generator

In [24]:
def get_content(path, num_entries):
    feed_db = pd.read_csv(path)
    output_data = []

    for index, row in feed_db.iterrows():
        feed_name = row['feed_name']
        feed_url = row['feed_url']
        test = feedparser.parse(feed_url)

        for i, entry in enumerate(test.entries):
            if i >= num_entries:
                break

            title = get_field_safe(entry, 'title')
            published = get_field_safe(entry, 'published')
            link = get_field_safe(entry, 'link')
            summary = get_field_safe(entry, 'summary')
            id_ = get_field_safe(entry, 'id')
            tags = get_field_safe(entry, 'tags')
            author_names = get_field_safe(entry, 'author_names')

            # Remove HTML tags from summary
            if summary:
                summary = remove_html_tags(summary)

            output_data.append({
                'Feed Name': feed_name,
                'Title': title,
                'Published': published,
                'Link': link,
                'Summary': summary,
                'ID': id_,
                'Tags': tags,
                'Author Names': author_names
            })

    output_df = pd.DataFrame(output_data)
    output_df.to_csv('output.csv', index=False)


# Function to safely retrieve field or return None
def get_field_safe(entry, field):
    try:
        if field == 'tags':
            return [tag.term for tag in entry.tags]
        elif field == 'author_names':
            return [author.name for author in entry.authors]
        else:
            return getattr(entry, field)
    except AttributeError:
        return None


# Function to remove HTML tags from a string
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


# Example usage
path = r'C:\Users\ZzTHE\Desktop\local\ThreatAtlas-Simons-10-06-2023-Branch\test_feeds.csv'
num_entries = 10  # Set the number of entries to parse
get_content(path, num_entries)


  soup = BeautifulSoup(text, "html.parser")


#### Event DB Generator

In [18]:
# Load the spaCy English model
nlp = spacy.load("en_core_web_trf")

# Set CSV path
path = "output.csv"

# Read the data
data = pd.read_csv(path)

# Select specific item data
item_data = data.loc[10, ['Title', 'Summary', 'Tags']]

# Extract countries
def extract_countries(article):
    doc = nlp(article)
    countries = []
    
    for ent in doc.ents:
        if ent.label_ == "GPE":
            countries.append(ent.text)
    
    return list(set(countries))

# Extract individuals
def extract_individuals(article):
    doc = nlp(article)
    individuals = []
    
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            individuals.append(ent.text)
    
    return list(set(individuals))

# Extract organizations
def extract_organizations(article):
    doc = nlp(article)
    organizations = []
    
    for ent in doc.ents:
        if ent.label_ == "ORG":
            organizations.append(ent.text)
    
    return list(set(organizations))

# Extract locations
def extract_locations(article):
    doc = nlp(article)
    locations = []
    
    for ent in doc.ents:
        if ent.label_ == "LOC":
            locations.append(ent.text)
    
    return list(set(locations))

# Extract individual elements from the Series
title = str(item_data['Title'])
summary = str(item_data['Summary'])
tags = str(item_data['Tags'])

# Print the item data and extracted information
print("Title:", title)
print("Summary:", summary)
print("Tags:", tags)

# Combine the text from title, summary, and tags
combined_text = title + " " + summary + " " + tags

# Extract and print the entities
print("Countries:", extract_countries(combined_text))
print("Individuals:", extract_individuals(combined_text))
print("Organizations:", extract_organizations(combined_text))
print("Locations:", extract_locations(combined_text))

Title: Trafficking in the Sahel: Muzzling the illicit arms trade
Summary: Shoppers in Mali’s Gao, Timbuktu, and Ménaka regions can snap up AK-pattern assault rifles for $750 and cartridges for 70 cents apiece, from locally handcrafted pistols to smuggled French and Turkish machine guns, as a dizzying array of illegal weaponry dots market stalls across the Sahel, a 6,000-kilometre-wide belt in the middle of Africa. Read the full story, “Trafficking in the Sahel: Muzzling the illicit arms trade”, on globalissues.org →
Tags: nan
Countries: ['Gao', 'Mali', 'Timbuktu', 'Ménaka']
Individuals: []
Organizations: ['globalissues.org']
Locations: ['Sahel', 'Africa']
