Installing and importing libraries

In [None]:
!pip install jellyfish
!pip install SPARQLWrapper

In [None]:
import pandas as pd
import numpy as np
import re
from jellyfish import jaro_similarity as jaro
from itertools import combinations
from SPARQLWrapper import SPARQLWrapper, JSON

Dataset read

In [None]:
# Load the news dataset with the extracted entities; the cells below read its
# 'date', 'persons' and 'organizations' columns.
df = pd.read_csv('with_extracted_pers_orgs.csv')

Additional data cleaning: limiting the news to a 10-year time frame

In [None]:
# Keep only the news dated after 2009/12/13.
# NOTE(review): the comparison is made against a string, so it assumes 'date'
# holds zero-padded 'YYYY/MM/DD' text — confirm against the CSV.
# .copy() makes ten_years an independent frame, so assigning columns to it
# later does not raise SettingWithCopyWarning.
ten_years = df[df['date'] > '2009/12/13'].copy()

Converting the extracted entities into a more convenient format (the previous step simply saved the string representation of each entity set straight into the dataframe)

In [None]:
# Matches every run of characters that contains no apostrophe, i.e. the text
# found between the quotes of the stored entity names. Compiled once instead of
# on every row.
pattern = re.compile(r'[^\']+')


def _entities_to_string(raw):
    """Turn the stored text of an entity collection into an '@'-separated string.

    Braces, commas and double quotes are dropped from ``raw``, the remaining
    text is split on apostrophes, and the single-space separators as well as
    the empty-set marker 'set()' are discarded.
    Presumably ``raw`` looks like "{'A', 'B'}" or "set()" — confirm against the
    output of the extraction step.

    Returns the surviving pieces joined with '@' ('' when nothing is left).
    """
    stripped = raw.replace('{', '').replace(',', '').replace('"', '').replace('}', '')
    matches = [m for m in re.findall(pattern, stripped) if m != ' ' and m != 'set()']
    return '@'.join(matches)


# One cleaned string per row, in the row order of ten_years.
clean_person = [_entities_to_string(value) for value in ten_years['persons']]
clean_organization = [_entities_to_string(value) for value in ten_years['organizations']]

Saving the edited dataframe

In [None]:
# assign() builds a new frame with both columns replaced instead of writing
# into a frame obtained by filtering df, which avoids SettingWithCopyWarning
# and keeps the cell safe to re-run.
ten_years = ten_years.assign(persons=clean_person, organizations=clean_organization)
ten_years.to_csv('ten_years_with_good_format.csv', index = False)

Check the dataframe

In [None]:
# Display the dataframe with the reformatted 'persons' / 'organizations' columns
ten_years

Function for testing whether a pair of keywords is too similar, in which case one of them needs to be removed (compensates for spaCy's problems with grammatical cases in Russian)

In [None]:
def check_similarity(list_of_pairs, threshold=0.9, min_length=4, similarity=None):
    """Collapse near-duplicate keywords and return the ones to keep.

    Every keyword that occurs in ``list_of_pairs`` is kept unless it loses a
    comparison: when both keywords of a pair are at least ``min_length``
    characters long and their similarity is ``>= threshold``, they are
    considered the same keyword and only the shorter one survives (on equal
    length the second element of the pair survives).

    Parameters
    ----------
    list_of_pairs : iterable of 2-element sequences of str
        Keyword pairs to compare, e.g. ``combinations(keywords, 2)``.
    threshold : float, default 0.9
        Minimal similarity at which two keywords are treated as the same.
    min_length : int, default 4
        Keywords shorter than this are likely abbreviations, for which the
        similarity measure is too sensitive; they are never merged.
    similarity : callable, optional
        Function ``(str, str) -> float``. Defaults to the module-level ``jaro``.

    Returns
    -------
    set of str
        The keywords that remain after removing near-duplicates. The result
        does not depend on the order of the pairs: a keyword that lost one
        comparison stays removed even if it later wins another one.
    """
    if similarity is None:
        similarity = jaro

    seen = set()
    removed = set()

    for pair in list_of_pairs:
        el1 = pair[0]
        el2 = pair[1]
        seen.add(el1)
        seen.add(el2)

        # identical strings are one keyword, not a duplicate to be dropped
        if el1 == el2:
            continue

        # abbreviations are compared by exact match only
        if len(el1) < min_length or len(el2) < min_length:
            continue

        if similarity(el1, el2) >= threshold:
            # keep the keyword with the shortest length
            removed.add(el2 if len(el1) < len(el2) else el1)

    return seen - removed

Saving all key terms and their absolute number of occurrences in the dataset into a dictionary

In [None]:
def _count_row_terms(raw_terms, term_type, counts):
    """Add the de-duplicated terms of one row to ``counts``.

    ``raw_terms`` is an '@'-separated string of terms. Each term that survives
    ``check_similarity`` increments ``counts[term][0]`` by one; a term seen for
    the first time is stored as ``[1, term_type]``, so a term that occurs both
    as an organization and as a person keeps the type it was first seen with.
    """
    terms = [term for term in raw_terms.split('@') if term]

    if len(terms) < 2:
        # a single term produces no pairs, so it has to be counted directly;
        # an empty cell contributes nothing
        unique_terms = set(terms)
    else:
        unique_terms = check_similarity(list(combinations(terms, 2)))

    for term in unique_terms:
        if term in counts:
            counts[term][0] += 1
        else:
            counts[term] = [1, term_type]


# term -> [number of rows in which the term occurs, 'org' or 'pers']
all_terms = dict()

for index, row in ten_years.iterrows():
    _count_row_terms(row['organizations'], 'org', all_terms)
    _count_row_terms(row['persons'], 'pers', all_terms)

In [None]:
# Inspect the collected terms: term -> [occurrence count, type]
all_terms

If a term appears in the dataset at least 35 times, it is considered significant enough to be analyzed at the later stages

In [None]:
# Keep only the terms whose occurrence count reaches the threshold.
# The threshold 35 was taken after analyzing the outputs in all_terms.
cleaned_all_terms = {
    term: stats
    for term, stats in all_terms.items()
    if stats[0] >= 35
}

Total number of keywords that passed the threshold

In [None]:
# Number of terms that passed the occurrence threshold
len(cleaned_all_terms)

Function for DBpedia link retrieval

In [None]:
def _build_dbpedia_query(term, term_type):
    """Build the full-text search query sent to the DBpedia SPARQL endpoint.

    Organizations and one-word terms are searched as a single phrase (top 5
    candidates requested); any other term is split into words that all have
    to be present (top 1 candidate requested).

    NOTE(review): ``term`` is interpolated into the query without escaping, so
    a term containing ' or " would produce a malformed query.
    """
    if term_type == 'org':
        entity_type = '?s1 a <http://dbpedia.org/ontology/Organisation> .'
    else:
        entity_type = '?s1 a <http://dbpedia.org/ontology/Person> .'

    words = term.split()

    if term_type == 'org' or (term_type == 'pers' and len(words) == 1):
        parts = [term]
        limit = 5
    else:
        parts = words
        limit = 1

    # 'w1', 'w2', ... for bif:vector and "w1" AND "w2" ... for bif:contains
    vector_args = ', '.join(f"'{part}'" for part in parts)
    contains_expr = ' AND '.join(f'"{part}"' for part in parts)

    return f"""
     define input:ifp "IFP_OFF"  select ?s1 as ?c1, (bif:search_excerpt (bif:vector ({vector_args}), ?o1)) as ?c2, ?sc, ?rank, ?g where {{{{ select ?s1, (?sc * 3e-1) as ?sc, ?o1, (sql:rnk_scale (<LONG::IRI_RANK> (?s1))) as ?rank, ?g where
  {{
    quad map virtrdf:DefaultQuadMap
    {{
      graph ?g
      {{
         ?s1 ?s1textp ?o1 .
        ?o1 bif:contains  '({contains_expr})'  option (score ?sc)  .

      }}
     }}
    {entity_type}
  }}
 order by desc (?sc * 3e-1 + sql:rnk_scale (<LONG::IRI_RANK> (?s1)))  limit {limit}  offset 0 }}}}
"""


def return_dbpedia_link(term, term_type):
    """Return the best matching DBpedia resource link for ``term``.

    Parameters
    ----------
    term : str
        Keyword to look up.
    term_type : str
        'org' restricts the search to dbo:Organisation resources; any other
        value (e.g. 'pers') restricts it to dbo:Person resources.

    Returns
    -------
    str
        The value of the first result binding, or 'link_not_found' when the
        term is empty / whitespace-only or the endpoint returns no results.

    Errors raised by SPARQLWrapper while querying the endpoint are propagated.
    """
    # an empty term would produce a malformed query; skip the network call
    if not term or not term.strip():
        return 'link_not_found'

    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery(_build_dbpedia_query(term, term_type))
    sparql.setReturnFormat(JSON)
    result = sparql.query().convert()

    # if no link was found, return the appropriate message,
    # otherwise return the top 1 link
    bindings = result['results']['bindings']
    if not bindings:
        return 'link_not_found'
    return bindings[0]['c1']['value']

Extracting links from DBpedia

In [None]:
# One record per significant term: [term, occurrence count, type, DBpedia link].
# Each record triggers one request to the DBpedia endpoint.
terms_with_links = [
    [term, count, term_type, return_dbpedia_link(term, term_type)]
    for term, (count, term_type) in cleaned_all_terms.items()
]

Saving all information about the terms

In [None]:
# Tabulate the collected records; one row per term with its count, type and link
save_terms = pd.DataFrame(terms_with_links, columns = ['term', 'occurences', 'type', 'link'])
save_terms

In [None]:
# Persist the terms table without the dataframe index column
save_terms.to_csv('extracted_terms_with_links.csv', index = False)