# Major script stash

In [None]:
# Import everything
import pandas as pd
import os
import urllib.request
from zipfile import ZipFile
from datetime import datetime, timedelta, date
import csv
import pandas as pd
from datetime import datetime
import spacy
from dateparser import parse
import feedparser


### Event extraction

#### Feed generation

In [None]:
def fetch_articles_from_rss(feed_url, source):
    """Parse one feed with feedparser and return its entries.

    Args:
        feed_url: Location handed straight to ``feedparser.parse``.
        source: Name of the feed. Accepted so callers can pass it along,
            but this function does not read it.

    Returns:
        A new list containing every entry of the parsed feed, in the order
        feedparser reports them.
    """
    parsed_feed = feedparser.parse(feed_url)
    # Copy into a fresh list so the caller gets its own container.
    return list(parsed_feed.entries)




path = 'test_feeds.csv'


def get_content(path):
    """Fetch the articles of every feed listed in a CSV file.

    Args:
        path: CSV file readable by ``pandas.read_csv`` that has a
            ``feed_name`` and a ``feed_url`` column.

    Returns:
        A dict mapping each feed name to the list of entries returned by
        ``fetch_articles_from_rss`` for that feed's URL. If a feed name
        occurs more than once, the entries of its last row win.
    """
    feeds = pd.read_csv(path)
    names = feeds['feed_name'].tolist()
    urls = feeds['feed_url'].tolist()

    # One fetch per row, performed in row order.
    return {
        name: fetch_articles_from_rss(url, name)
        for name, url in zip(names, urls)
    }


def flatten_list(_2d_list):
    """Flatten exactly one level of list nesting.

    Args:
        _2d_list: Iterable whose elements are either plain lists or
            arbitrary other values.

    Returns:
        A new list in which every element that is exactly of type ``list``
        has been replaced by its items; all other elements (tuples, strings
        and list subclasses included) are kept as they are.
    """
    flattened = []
    for entry in _2d_list:
        # Exact type test on purpose: only plain lists are unpacked.
        if type(entry) is list:
            flattened.extend(entry)
        else:
            flattened.append(entry)
    return flattened


        
def output_to_csv(sources, data):
    """Collect the articles of the given sources into one table and save it.

    Args:
        sources: Iterable of keys to look up in ``data``.
        data: Mapping from source key to an iterable of article records;
            each record must be convertible with ``dict()``.

    Returns:
        The DataFrame built from all records (one row per article, in
        source order). The same frame is also written to
        ``test_output.csv`` in the current working directory.
    """
    records = [dict(entry) for name in sources for entry in data[name]]

    frame = pd.DataFrame(records)
    frame.to_csv('test_output.csv')
    return frame



In [None]:
# Run the feed pipeline end to end: fetch every feed listed in `path`,
# then dump all fetched articles to CSV.
test = get_content(path)
sources = list(test)  # iterating a dict yields its keys
test_db = output_to_csv(sources, test)

#### Event detection

In [None]:
# Load the spaCy pipeline once at module level; the extractor functions in
# this file all call this shared `nlp` object.
nlp = spacy.load("en_core_web_trf")

def extract_countries(article):
    """Return the unique place-like entity texts found in *article*.

    Despite the name, this is not limited to countries: every entity that
    the pipeline labels ``GPE``, ``NORP`` or ``LOC`` is included.

    Args:
        article: Text to run through the shared ``nlp`` pipeline.

    Returns:
        A list of distinct entity strings. The order is unspecified because
        duplicates are removed through a set.
    """
    place_labels = ("GPE", "NORP", "LOC")
    unique_names = {
        ent.text for ent in nlp(article).ents if ent.label_ in place_labels
    }
    return list(unique_names)

def extract_individuals(article):
    """Return the unique ``PERSON`` entity texts found in *article*.

    Args:
        article: Text to run through the shared ``nlp`` pipeline.

    Returns:
        A list of distinct person strings. The order is unspecified because
        duplicates are removed through a set.
    """
    unique_people = {
        ent.text for ent in nlp(article).ents if ent.label_ == "PERSON"
    }
    return list(unique_people)

def extract_organizations(article):
    """Return the unique ``ORG`` entity texts found in *article*.

    Args:
        article: Text to run through the shared ``nlp`` pipeline.

    Returns:
        A list of distinct organization strings. The order is unspecified
        because duplicates are removed through a set.
    """
    unique_orgs = {
        ent.text for ent in nlp(article).ents if ent.label_ == "ORG"
    }
    return list(unique_orgs)

def key_phrases(article):
    """Return the noun chunks of *article* as key phrases.

    Args:
        article: Text to run through the shared ``nlp`` pipeline.

    Returns:
        A list of the items yielded by ``doc.noun_chunks``, in document
        order. Note that these are the chunk objects themselves, not their
        ``.text`` strings, and duplicates are kept.
    """
    parsed = nlp(article)
    return list(parsed.noun_chunks)


#### Feed Characteristics

In [None]:

def _entity_texts(doc, labels):
    """Return the distinct texts of the entities in *doc* labelled with one of *labels*."""
    return list({ent.text for ent in doc.ents if ent.label_ in labels})


def extract_event(article):
    """Extract places, people, organizations and key phrases from one article.

    The text is run through the shared ``nlp`` pipeline exactly once and all
    four fields are derived from that single parse. Calling the four
    extractor functions separately would parse the same text four times.
    The label sets below mirror ``extract_countries``,
    ``extract_individuals`` and ``extract_organizations``.

    Args:
        article: Text of the article.

    Returns:
        A dict with the keys ``countries_involved``,
        ``associated_individuals``, ``associated_organizations`` (lists of
        distinct entity strings, order unspecified) and ``key_phrases``
        (list of the document's noun chunks, in document order).
    """
    doc = nlp(article)

    return {
        "countries_involved": _entity_texts(doc, ("GPE", "NORP", "LOC")),
        "associated_individuals": _entity_texts(doc, ("PERSON",)),
        "associated_organizations": _entity_texts(doc, ("ORG",)),
        "key_phrases": list(doc.noun_chunks),
    }
# Example usage: run the extractor on one sample article and print each
# extracted field on its own line.
article = "From the Field: UN human rights officers on the frontline in Somalia Fleeing armed conflict is frightening, forcing people to escape violent clashes and leave behind their homes, schools, and daily lives in a desperate search for safety. Read the full story, â€œFrom the Field: UN human rights officers on the frontline in Somalia"
event = extract_event(article)
# print("Description:", event.get("description"))
for label, key in (
    ("Countries involved:", "countries_involved"),
    ("Associated individuals:", "associated_individuals"),
    ("Associated organizations:", "associated_organizations"),
    ("Key phrases:", "key_phrases"),
):
    print(label, event.get(key))

In [None]:
## Extract text from news articles 
articles = []
headers = ['Countries involved', 'Date', 'Associated Individuals',
           'Associated Organizations', 'Countries Involved', 'Location', 'url' ]


def eventlist(articles):
    """Build a table with one row of extracted entities per article.

    Args:
        articles: Iterable of article texts; each one is passed to
            ``extract_event``.

    Returns:
        A DataFrame whose columns are exactly ``headers``. The columns
        'Countries involved', 'Associated Individuals' and
        'Associated Organizations' are filled from the extracted event.
        The remaining header columns are left empty (NaN) because
        ``extract_event`` does not produce values for them. An empty input
        yields an empty frame that still carries all header columns.
    """
    rows = []
    for article in articles:
        event = extract_event(article)
        rows.append({
            'Countries involved': event["countries_involved"],
            'Associated Individuals': event["associated_individuals"],
            'Associated Organizations': event["associated_organizations"],
        })

    # NOTE(review): `headers` lists both 'Countries involved' and
    # 'Countries Involved'; only the first is filled here - confirm which
    # spelling is the intended column.
    return pd.DataFrame(rows, columns=headers)

### GDELT Downloader

In [None]:
# Ask which download mode to run: 1 = latest daily file, 2 = date range.
# The reply is converted with int(), so non-numeric input raises ValueError.
selection = int(input("1) get the latest file 2) get a date range"))
# Column names applied to the header-less, tab-separated GDELT event export.
# The string is split on whitespace into a list of 58 names.
# (Fixes the misspelled 'Actor1Religion1Cod' -> 'Actor1Religion1Code'.)
header = (
    "GLOBALEVENTID SQLDATE MonthYear Year FractionDate "
    "Actor1Code Actor1Name Actor1CountryCode Actor1KnownGroupCode "
    "Actor1EthnicCode Actor1Religion1Code Actor1Religion2Code "
    "Actor1Type1Code Actor1Type2Code Actor1Type3Code "
    "Actor2Code Actor2Name Actor2CountryCode Actor2KnownGroupCode "
    "Actor2EthnicCode Actor2Religion1Code Actor2Religion2Code "
    "Actor2Type1Code Actor2Type2Code Actor2Type3Code "
    "IsRootEvent EventCode EventBaseCode EventRootCode QuadClass "
    "GoldsteinScale NumMentions NumSources NumArticles AvgTone "
    "Actor1Geo_Type Actor1Geo_FullName Actor1Geo_CountryCode "
    "Actor1Geo_ADM1Code Actor1Geo_Lat Actor1Geo_Long Actor1Geo_FeatureID "
    "Actor2Geo_Type Actor2Geo_FullName Actor2Geo_CountryCode "
    "Actor2Geo_ADM1Code Actor2Geo_Lat Actor2Geo_Long Actor2Geo_FeatureID "
    "ActionGeo_Type ActionGeo_FullName ActionGeo_CountryCode "
    "ActionGeo_ADM1Code ActionGeo_Lat ActionGeo_Long ActionGeo_FeatureID "
    "DATEADDED SOURCEURL"
).split()

if selection == 2:
    # Date-range mode: download one GDELT daily export per day, attach the
    # column header, then replace the numeric event codes with the labels
    # from cameo_dict.csv.
    import glob

    print("Downloading GDELT Events 1.0 files")

    # Create the working folders up front so a first run does not fail with
    # FileNotFoundError when a download or result file is written.
    for folder in (
        "gdelt_extraction/dump",
        "gdelt_extraction/results",
        "gdelt_extraction/results_cleaned",
    ):
        os.makedirs(folder, exist_ok=True)

    start_date = datetime(2022, 11, 11)
    end_date = datetime(2023, 2, 7)

    def date_range(start, end):
        """Return every day from *start* to *end*, both ends included."""
        delta = end - start  # as timedelta
        return [start + timedelta(days=i) for i in range(delta.days + 1)]

    days = [day.strftime("%Y%m%d") for day in date_range(start_date, end_date)]

    for day in days:
        urllib.request.urlretrieve(
            "http://data.gdeltproject.org/events/" + day + ".export.CSV.zip",
            "gdelt_extraction/dump/" + "GEvents1" + day + ".zip",
        )

    print("Unzipping files")
    test_dir = os.listdir("gdelt_extraction/dump")
    for n in test_dir:
        # Only open real archives: anything else left in the folder (for
        # example CSVs from an interrupted run) would raise BadZipFile.
        if not n.lower().endswith(".zip"):
            continue
        with ZipFile("gdelt_extraction/dump/" + n, "r") as zipObj:
            zipObj.extractall("gdelt_extraction/dump")

    print("Adding headers")
    # process and save files
    test_dir = os.listdir("gdelt_extraction/dump")
    for p in test_dir:
        if ".CSV" in p:
            data_test = pd.read_csv(
                "gdelt_extraction/dump/" + p, delimiter="\t", names=header
            )
            data_test.to_csv("gdelt_extraction/results/" + p + "_processed" + ".csv")

    # Remove the downloaded archives and extracted raw files; the processed
    # copies now live in gdelt_extraction/results.
    gdelt_extraction_dir = glob.glob("gdelt_extraction/dump/*")
    for f in gdelt_extraction_dir:
        os.remove(f)

    # Lookup table: first CSV column -> third CSV column of cameo_dict.csv.
    with open("gdelt_extraction/cameo_dict.csv", mode="r", encoding="utf-8") as inp:
        reader = csv.reader(inp)
        cameo_dict = {rows[0]: rows[2] for rows in reader}

    convert_dict = {"EventCode": str}
    test_dir = os.listdir("gdelt_extraction/results")
    for f in test_dir:
        df2 = pd.read_csv("gdelt_extraction/results/" + f)
        # NOTE(review): EventCode is parsed as a number before being cast to
        # str, so leading zeros in the raw codes are gone when the lookup
        # runs - confirm the keys in cameo_dict.csv are written the same way.
        df2 = df2.astype(convert_dict)
        df2.replace({"EventCode": cameo_dict}, inplace=True)

        print(df2["EventCode"])
        df2.to_csv("gdelt_extraction/results_cleaned/" + f + "_coded" + ".csv")

    # The coded files in gdelt_extraction/results_cleaned are the final
    # output of this branch and are deliberately kept. Deleting that folder
    # here would throw away everything the loop above just produced.

if selection == 1:
    # Latest-file mode: fetch the export named after yesterday's date,
    # unpack it, and save a copy with column names attached.
    print("Downloading the latest file")

    yesterday = (date.today() - timedelta(days=1)).strftime("%Y%m%d")
    archive_path = "gdelt_extraction/dump/" + "GEvents" + yesterday + ".zip"

    urllib.request.urlretrieve(
        "http://data.gdeltproject.org/events/" + yesterday + ".export.CSV.zip",
        archive_path,
    )

    print("Unzipping files")
    with ZipFile(archive_path) as zipObj:
        zipObj.extractall("gdelt_extraction/dump")

    print("Adding headers")
    # The table is read from the .zip archive itself (not from the file
    # extracted above); the rows are tab-separated and carry no header line.
    data_test = pd.read_csv(archive_path, sep="\t", names=header)
    data_test.to_csv(
        "gdelt_extraction/results/" + "GEvents" + yesterday + "_processed" + ".csv"
    )

# Printed for every selection value, including ones matching no branch.
print("Process completed!")


### Event analysis

In [None]:
# helper functions 
import json
import plotly.graph_objs as go
import plotly.express as px
import geopandas as gpd


def flat_ents1(ents):
    """Concatenate the second element of every item in *ents* into one list.

    Built in a single pass. Re-creating the accumulator with ``+`` on every
    step copies the whole list each time and is quadratic in the total
    number of values.

    Args:
        ents: Iterable of indexable items (for example ``(index, values)``
            pairs) whose element at position 1 is a list of values.

    Returns:
        A new flat list holding all values in encounter order; empty when
        *ents* is empty.
    """
    return [value for ent in ents for value in ent[1]]

def top_n_bar_chart(data, n,var_name):
    """Show a dark-themed bar chart of the first *n* entries of *data*.

    Args:
        data: Sequence of ``(label, value)`` pairs, already ordered so that
            the entries to plot come first.
        n: Number of bars to draw. Exactly ``n`` entries are plotted (fewer
            if *data* is shorter), matching the "Top n" chart title.
        var_name: Text appended to the title after "Top {n} ".

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Slice [:n], not [:n + 1]: the latter draws one bar more than the
    # title promises.
    top = data[:n]
    labels = [pair[0] for pair in top]
    values = [pair[1] for pair in top]

    fig = go.Figure(
        data=[go.Bar(x=labels, y=values)],
        layout=go.Layout(title=f"Top {n} "+var_name, template="plotly_dark"),
    )
    fig.show()

In [None]:

from collections import Counter

# Load the extracted articles and keep a CSV copy next to the notebook.
test_set = pd.read_json('output.json')
test_set.to_csv('piss.csv')

# (row number, list of values) pairs - the shape flat_ents1 expects.
places = list(enumerate(test_set["Countries"].tolist()))
# The organizations column is treated as optional: when it is absent the
# organization tallies below are simply empty instead of failing with a
# NameError on an undefined `orgs`.
if "Associated Organizations" in test_set.columns:
    orgs = list(enumerate(test_set["Associated Organizations"].tolist()))
else:
    orgs = []
titles = test_set['headline'].tolist()
# summmaries = test_set['Summary'].tolist()

# Frequency of every place, most frequent first (ties keep first-seen order).
all_places = flat_ents1(places)
places_ = dict(Counter(all_places))
places_dict = sorted(places_.items(), key=lambda x:x[1], reverse=True)

# Same tally for organizations.
all_orgs = flat_ents1(orgs)
orgs_ = dict(Counter(all_orgs))
orgs_dict = sorted(orgs_.items(), key=lambda x:x[1], reverse=True)



def new_count(field,frame):
    """Count how often each value occurs across a column of lists.

    Args:
        field: Name of a column in *frame* whose cells are lists of
            hashable values.
        frame: DataFrame holding that column.

    Returns:
        A list of ``(value, count)`` tuples sorted by count, highest first.
        Values with equal counts keep the order in which they first appear
        in the column. An empty column yields an empty list.
    """
    from collections import Counter
    from itertools import chain

    # Flatten all cell lists and tally in one pass; calling list.count()
    # once per element is quadratic in the number of values.
    values = chain.from_iterable(frame[field].tolist())
    return Counter(values).most_common()




In [None]:
from IPython.display import display, Markdown

# 'Feed Name', 'Title', 'Date', 'Link', 'Summary', 'ID', 'Tags','Author Names', 'Countries', 'Associated Individuals','Associated Organizations', 'Key Phrases'
# top_n_bar_chart(orgs_dict, 25,'organizations')
# top_n_bar_chart(new_count('Associated Individuals',test_db), 25,'individuals')

# Names of the leading entries of the place ranking (first 26 items).
top_ent = [name for name, _count in places_dict[0:26]]

# One boolean per (article, country) pair after exploding the list column;
# True where the country is exactly 'Ukraine'.
s = test_set['Countries'].explode()=='Ukraine'
# NOTE(review): this reads the 'Title' column while the cell that loads
# test_set reads 'headline' - confirm which column output.json provides.
arts = test_set.loc[s[s].index]['Title'].tolist()

# for art in arts: 
#     display(Markdown(art))

# (A stray bare name that followed here was removed: it is not defined
# anywhere and raised NameError whenever the cell ran.)