# Major script stash

In [3]:
# Import everything
import pandas as pd
import os
import urllib.request
from zipfile import ZipFile
from datetime import datetime, timedelta, date
import csv
import pandas as pd
from datetime import datetime
import spacy
from dateparser import parse

### Event extraction

In [4]:
# Shared spaCy pipeline, loaded once here and reused by every extract_* helper below.
_SPACY_MODEL_NAME = "en_core_web_trf"
nlp = spacy.load(_SPACY_MODEL_NAME)

def extract_countries(article):
    """Return the distinct GPE entity mentions found in *article*.

    Args:
        article: Raw article text.

    Returns:
        A list of unique entity strings, ordered by first appearance in the
        text. The order is deterministic so that callers that use
        ``countries[0]`` get the same result on every run.

    NOTE(review): spaCy's GPE label is documented as covering cities and
    states as well as countries, so the result is likely broader than the
    function name suggests -- confirm whether further filtering is wanted.
    """
    doc = nlp(article)
    # Walk entity spans (not individual tokens) so that multi-word names such
    # as "Saudi Arabia" are kept whole instead of being split per token.
    mentions = (ent.text for ent in doc.ents if ent.label_ == "GPE")
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(mentions))

def extract_individuals(article):
    """Return the distinct PERSON entity mentions found in *article*.

    Args:
        article: Raw article text.

    Returns:
        A list of unique entity strings, ordered by first appearance in the
        text (a plain ``set`` would make the order vary between runs, which
        in turn would make generated event names non-reproducible).
    """
    doc = nlp(article)
    mentions = (ent.text for ent in doc.ents if ent.label_ == "PERSON")
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(mentions))

def extract_organizations(article):
    """Return the distinct ORG entity mentions found in *article*.

    Args:
        article: Raw article text.

    Returns:
        A list of unique entity strings, ordered by first appearance in the
        text (a plain ``set`` would make the order vary between runs, which
        in turn would make generated event names non-reproducible).
    """
    doc = nlp(article)
    mentions = (ent.text for ent in doc.ents if ent.label_ == "ORG")
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(mentions))

def extract_location(article):
    """Return the distinct LOC entity mentions found in *article*.

    Args:
        article: Raw article text.

    Returns:
        A list of unique entity strings, ordered by first appearance in the
        text, so the result is the same on every run.
    """
    doc = nlp(article)
    mentions = (ent.text for ent in doc.ents if ent.label_ == "LOC")
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(mentions))

def extract_date(article):
    """Return the first parseable DATE entity of *article* as ``YYYY-MM-DD``.

    Args:
        article: Raw article text.

    Returns:
        The formatted date string, or ``None`` when the text contains no DATE
        entity that the date parser can interpret.
    """
    doc = nlp(article)

    for ent in doc.ents:
        if ent.label_ != "DATE":
            continue
        # Parse the whole entity span ("May 28"), not a single token ("May").
        parsed_date = parse(ent.text)
        # An unparseable DATE entity is skipped so that a later, parseable
        # one can still be used instead of giving up immediately.
        if parsed_date:
            return parsed_date.strftime("%Y-%m-%d")

    return None

def extract_description(article, countries):
    """Return a short description: the text from the first country mention to
    the end of that sentence, capped at 400 characters.

    Args:
        article: Raw article text.
        countries: Country/GPE strings; only ``countries[0]`` is used.

    Returns:
        The extracted snippet. An empty string is returned when *countries*
        is empty or its first entry does not occur in *article*. When no "."
        follows the mention, the snippet runs to the end of the article.
    """
    if not countries:
        return ""

    # str.find returns -1 instead of raising, so missing text is handled
    # explicitly rather than surfacing as a ValueError.
    start_index = article.find(countries[0])
    if start_index == -1:
        return ""

    period_index = article.find(".", start_index)
    end_index = len(article) if period_index == -1 else period_index + 1

    return article[start_index:end_index][:400]

def generate_event_name(countries, individuals, organizations, date):
    """Compose a display name of the form ``countries - actors - date``.

    Countries are comma-joined; actors are the individuals followed by the
    organizations, comma-joined; sections are separated by " - ". Empty
    sections are left out, and an empty string is returned when every
    argument is empty.
    """

    def _extend(prefix, piece):
        # Insert the section separator only when something precedes the piece.
        return (prefix + " - " if prefix else prefix) + piece

    label = ", ".join(countries) if countries else ""

    if individuals or organizations:
        actors = ", ".join([*(individuals or ()), *(organizations or ())])
        label = _extend(label, actors)

    if date:
        label = _extend(label, date)

    return label


In [None]:

def extract_event(article):
    """Run every extractor on *article* and bundle the results in one dict.

    The returned dict has the keys ``countries_involved``,
    ``associated_individuals``, ``associated_organizations``, ``location``,
    ``date``, ``description`` and ``name``.
    """
    involved_countries = extract_countries(article)
    people = extract_individuals(article)
    orgs = extract_organizations(article)
    places = extract_location(article)
    event_date = extract_date(article)

    return dict(
        countries_involved=involved_countries,
        associated_individuals=people,
        associated_organizations=orgs,
        location=places,
        date=event_date,
        # The description is anchored on the extracted countries.
        description=extract_description(article, involved_countries),
        name=generate_event_name(involved_countries, people, orgs, event_date),
    )
# Example usage
article = "ANKARA, May 28 (Reuters) - President Tayyip Erdogan extended his two decades in power in elections on Sunday, winning a mandate to pursue increasingly authoritarian policies which have polarised Turkey and strengthened its position as a regional military power. His challenger, Kemal Kilicdaroglu, called it the most unfair election in years but did not dispute the outcome. Official results showed Kilicdaroglu won 47.9 of the votes to Erdogan's 52.1%, pointing to a deeply divided nation. The election had been seen as one of the most consequential yet for Turkey, with the opposition believing it had a strong chance of unseating Erdogan and reversing his policies after his popularity was hit by a cost-of-living crisis. Instead, victory reinforced his image of invincibility, after he had already redrawn domestic, economic, security and foreign policy in the NATO member country of 85 million people. The prospect of five more years of his rule was a major blow to opponents who accused him of undermining democracy as he amassed ever more power - a charge he denies. In a victory speech in Ankara, Erdogan pledged to leave all disputes behind and unite behind national values and dreams but then switched gears, lashing out at the opposition and accusing Kilicdaroglu of siding with terrorists without providing evidence. He said releasing former pro-Kurdish party leader Selahattin Demirtas, whom he branded a terrorist, would not be possible under his governance. Erdogan said inflation was Turkey's most urgent issue. Kilicdaroglu's defeat will likely be mourned by Turkey's NATO allies which have been alarmed by Erdogan's ties to Russian President Vladimir Putin, who congratulated his dear friend on his victory. U.S. President Joe Biden wrote on Twitter: I look forward to continuing to work together as NATO Allies on bilateral issues and shared global challenges. U.S. 
relations with Turkey have been impeded by Erdogan's objection to Sweden joining NATO as well as Ankara's close relationship with Moscow and differences over Syria. 'THE ONLY WINNER TODAY IS TURKEY,' ERDOGAN SAYS Addressing jubilant supporters earlier from atop a bus in Istanbul, Erdogan, 69, said the only winner today is Turkey. I thank every single one of our people who once again gave us the responsibility to govern the country five more years, he said. Erdogan's victory extends his tenure as the longest-serving leader since Mustafa Kemal Ataturk established modern Turkey from the ruins of the Ottoman Empire a century ago - a politically potent anniversary to be marked in October with Erdogan in charge. Erdogan, head of the Islamist-rooted AK Party, appealed to voters with nationalist and conservative rhetoric during a divisive campaign that deflected attention from deep economic troubles. In his victory speech, he attacked the opposition again, calling them pro-LGBT. [1/14] REUTERS/Umit Bektas Kilicdaroglu, who had promised to set the country on a more democratic and collaborative path, said the vote showed people's will to change an authoritarian government. All the means of the state were laid at the feet of one man, he said. 'SAD AND DISAPPOINTED' Erdogan supporters, who gathered outside his Istanbul residence, chanted Allahu Akbar, or God is Greatest. I expect everything to become better, said Nisa, 28, a headscarved woman wearing a headband with Erdogan's name. Another Erdogan supporter said Turkey would get stronger with him in office for five more years. There are issues, problems in every country around the world, in European countries as well ... With strong leadership we will overcome Turkey's problems as well, said the supporter who gave his name as Mert, 39, as he celebrated with his son. Bugra Oztug, 24, who voted for Kilicdaroglu, blamed the opposition for failing to change. I feel sad and disappointed but I am not hopeless. 
I still think there are people who can see the realities and truth, Oztug said. Erdogan's performance has wrong-footed opponents who thought voters would punish him over the state's initially slow response to devastating earthquakes in February, in which more than 50,000 people died. But in the first round of voting on May 14, which included parliamentary elections, his AK Party emerged top in 10 of the 11 provinces hit by the earthquakes, helping it to secure a parliamentary majority along with its allies. FEARS FOR LIBERTIES French President Emmanuel Macron offered congratulations, saying France and Turkey had huge challenges to face together. The presidents of Iran, Israel, and the Saudi king were among leaders to congratulate him in the Middle East, where Erdogan has asserted Turkish influence, at times with military power. Erdogan, who was for years at odds with numerous governments in the region, has taken a more conciliatory stance in recent years. Emre Erdogan, a political science professor at Istanbul's Bilgi University, attributed Erdogan's success to his supporters' belief in his ability to solve problems, even though he created many of them. Erdogan had also maintained the support of conservative voters who long felt marginalised. This era will be characterized by a decline in political and civil liberties, polarization, and cultural fights between two political tribes, he said. Erdogan appeared to have prevailed despite years of economic turmoil which critics blamed on unorthodox economic policies which the opposition had pledged to reverse. Uncertainty about what an Erdogan win would mean for economic policy pushed the lira to record lows last week. Reuters reported last week that there was disagreement within Erdogans government over whether to stick with what some called an unsustainable economic programme or to abandon it. 
Kilicdaroglu had promised to reset governance, restore human rights, and return independence to the courts and central bank after they were sidelined over the last decade."

# Run the extractor on the sample article and print a few of the fields.
event = extract_event(article)
# print("Description:", event.get("description"))
_report_fields = (
    ("Countries involved:", "countries_involved"),
    ("Date:", "date"),
    ("Associated individuals:", "associated_individuals"),
    ("Associated organizations:", "associated_organizations"),
)
for _label, _key in _report_fields:
    print(_label, event.get(_key))

In [None]:
## Extract text from news articles 
# Article texts to process; starts out empty.
articles = []
# Column labels for the events table built by eventlist().
# NOTE(review): "Countries involved" and "Countries Involved" differ only in
# capitalisation -- this looks like an accidental duplicate; confirm.
headers = [
    "Countries involved",
    "Date",
    "Associated Individuals",
    "Associated Organizations",
    "Countries Involved",
    "Location",
    "url",
]

def eventlist(articles):
    """Extract one event per article and return them as a DataFrame.

    Args:
        articles: Iterable of raw article texts.

    Returns:
        A ``pandas.DataFrame`` whose columns are the module-level ``headers``
        and which has one row per article. Columns with no matching field in
        the extracted event (e.g. ``url``) are filled with ``None``. An empty
        input yields an empty frame that still carries the columns.
    """
    # Column label (lower-cased, so capitalisation variants match) -> key of
    # the dict produced by extract_event().
    column_to_key = {
        "countries involved": "countries_involved",
        "date": "date",
        "associated individuals": "associated_individuals",
        "associated organizations": "associated_organizations",
        "location": "location",
    }

    rows = []
    for article in articles:
        event = extract_event(article)
        rows.append(
            [event.get(column_to_key.get(column.lower())) for column in headers]
        )

    return pd.DataFrame(rows, columns=headers)



### GDELT Downloader

In [None]:


# Ask which download mode to run; the answer must be an integer (1 or 2 are acted on below).
_mode_prompt = "1) get the latest file 2) get a date range"
selection = int(input(_mode_prompt))
# Column names for the tab-separated GDELT event export, in file order.
# They are passed as `names=` when the raw dump files are read with pandas.
# (Fixes the truncated "Actor1Religion1Cod" so it matches its Actor2 twin.)
header = (
    "GLOBALEVENTID SQLDATE MonthYear Year FractionDate "
    "Actor1Code Actor1Name Actor1CountryCode Actor1KnownGroupCode Actor1EthnicCode "
    "Actor1Religion1Code Actor1Religion2Code "
    "Actor1Type1Code Actor1Type2Code Actor1Type3Code "
    "Actor2Code Actor2Name Actor2CountryCode Actor2KnownGroupCode Actor2EthnicCode "
    "Actor2Religion1Code Actor2Religion2Code "
    "Actor2Type1Code Actor2Type2Code Actor2Type3Code "
    "IsRootEvent EventCode EventBaseCode EventRootCode QuadClass GoldsteinScale "
    "NumMentions NumSources NumArticles AvgTone "
    "Actor1Geo_Type Actor1Geo_FullName Actor1Geo_CountryCode Actor1Geo_ADM1Code "
    "Actor1Geo_Lat Actor1Geo_Long Actor1Geo_FeatureID "
    "Actor2Geo_Type Actor2Geo_FullName Actor2Geo_CountryCode Actor2Geo_ADM1Code "
    "Actor2Geo_Lat Actor2Geo_Long Actor2Geo_FeatureID "
    "ActionGeo_Type ActionGeo_FullName ActionGeo_CountryCode ActionGeo_ADM1Code "
    "ActionGeo_Lat ActionGeo_Long ActionGeo_FeatureID "
    "DATEADDED SOURCEURL"
).split(" ")

if selection == 2:
    # Date-range mode: download one GDELT daily export per day, unzip them,
    # add column headers, then translate EventCode values via cameo_dict.csv.
    print("Downloading GDELT Events 1.0 files")

    dump_dir = "gdelt_extraction/dump"
    results_dir = "gdelt_extraction/results"
    cleaned_dir = "gdelt_extraction/results_cleaned"
    # urlretrieve / to_csv do not create missing parent folders, so make sure
    # the working folders exist before anything is written into them.
    for folder in (dump_dir, results_dir, cleaned_dir):
        os.makedirs(folder, exist_ok=True)

    start_date = datetime(2022, 11, 11)
    end_date = datetime(2023, 2, 7)

    def date_range(start, end):
        """Return every day from *start* to *end*, both inclusive."""
        delta = end - start  # as timedelta
        return [start + timedelta(days=i) for i in range(delta.days + 1)]

    days = [day.strftime("%Y%m%d") for day in date_range(start_date, end_date)]

    for day in days:
        urllib.request.urlretrieve(
            "http://data.gdeltproject.org/events/" + day + ".export.CSV.zip",
            dump_dir + "/" + "GEvents1" + day + ".zip",
        )

    print("Unzipping files")
    test_dir = os.listdir(dump_dir)
    for n in test_dir:
        # Only open real archives: a leftover CSV from an interrupted earlier
        # run would otherwise abort the whole job with BadZipFile.
        if not n.lower().endswith(".zip"):
            continue
        with ZipFile(dump_dir + "/" + n, "r") as zipObj:
            zipObj.extractall(dump_dir)

    print("Adding headers")
    # process and save files
    test_dir = os.listdir(dump_dir)
    for p in test_dir:
        if ".CSV" in p:
            # NOTE(review): EventCode is type-inferred here, so codes with
            # leading zeros would be read as plain integers -- confirm that
            # the keys in cameo_dict.csv match that form.
            data_test = pd.read_csv(dump_dir + "/" + p, delimiter="\t", names=header)
            data_test.to_csv(results_dir + "/" + p + "_processed" + ".csv")

    import glob

    # Remove the downloaded archives and extracted raw files from the dump folder.
    gdelt_extraction_dir = glob.glob(dump_dir + "/*")
    for f in gdelt_extraction_dir:
        os.remove(f)

    # Lookup table: first CSV column -> third CSV column of cameo_dict.csv.
    with open("gdelt_extraction/cameo_dict.csv", mode="r", encoding="utf-8") as inp:
        reader = csv.reader(inp)
        cameo_dict = {rows[0]: rows[2] for rows in reader}

    convert_dict = {"EventCode": str}
    test_dir = os.listdir(results_dir)
    for f in test_dir:
        df2 = pd.read_csv(results_dir + "/" + f)
        df2 = df2.astype(convert_dict)
        # Replace each EventCode with its mapped value where one exists.
        df2.replace({"EventCode": cameo_dict}, inplace=True)

        print(df2["EventCode"])
        df2.to_csv(cleaned_dir + "/" + f + "_coded" + ".csv")

    # NOTE(review): this deletes the coded files that were written just above,
    # leaving no final output. It may have been meant to clear the
    # intermediate "results" folder instead -- behaviour kept as-is; confirm.
    gdelt_extraction_dir = glob.glob(cleaned_dir + "/*")
    for f in gdelt_extraction_dir:
        os.remove(f)

if selection == 1:
    # Latest-file mode: fetch a single daily export and add column headers.
    print("Downloading the latest file")

    # "Latest" is taken to be yesterday's file (today minus one day).
    yesterday = (date.today() - timedelta(1)).strftime("%Y%m%d")
    _archive_path = f"gdelt_extraction/dump/GEvents{yesterday}.zip"
    _source_url = f"http://data.gdeltproject.org/events/{yesterday}.export.CSV.zip"
    urllib.request.urlretrieve(_source_url, _archive_path)

    print("Unzipping files")
    with ZipFile(_archive_path) as zipObj:
        zipObj.extractall("gdelt_extraction/dump")

    print("Adding headers")
    # read_csv is pointed at the .zip archive itself, not at the extracted CSV.
    data_test = pd.read_csv(_archive_path, delimiter="\t", names=header)
    data_test.to_csv(f"gdelt_extraction/results/GEvents{yesterday}_processed.csv")

# Printed unconditionally, whichever mode (if any) was run.
print("Process completed!")


# poo  (stray scratch text, commented out; as a bare name it would raise NameError when run)
