In [1]:
import os
import spacy
import pickle
import re
import nltk
from nltk.stem import WordNetLemmatizer

In [2]:
# Entity linking uses the spaCy-entity-linker package: https://github.com/egerber/spaCy-entity-linker

In [3]:
# Short sample text used below to sanity-check sentence splitting and lemmatization.
raw_text = 'Hello, world. Here are two sentences.'
# Load the medium English spaCy model as the base pipeline.
nlp_md = spacy.load('en_core_web_md')
# Add spaCy's 'sentencizer' component to the pipeline.
# NOTE(review): the loaded model presumably already sets sentence boundaries
# itself — confirm whether this extra component is needed.
nlp_md.add_pipe('sentencizer')
# Add the entity linker as the last pipeline component; the returned component
# is the cell's displayed output.
# NOTE(review): the "entityLinker" factory is not imported anywhere in this
# notebook; presumably it is registered by the installed spaCy-entity-linker
# package — confirm.
nlp_md.add_pipe("entityLinker", last=True)

<spacy_entity_linker.EntityLinker.EntityLinker at 0x173234a30>

In [None]:
# Customize the tokenizer so that hyphenated words are kept as single tokens.
# NOTE(review): this cell has no execution count, and the recorded outputs
# further down disagree on hyphen handling ('left', '-', 'wing' in one cell,
# 'left-we' in another) — restart the kernel and run all cells to confirm
# which tokenization the saved results actually used.
from spacy.util import compile_infix_regex
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS

# Infix patterns (places where the tokenizer may split inside a token).
infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            # One of + - * ^ preceded by a digit and followed by a digit or hyphen.
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            # A period between a lowercase letter/quote and an uppercase letter/quote.
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
            ),
            # A comma directly between two letters.
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            # Deliberately disabled: the rule that splits on hyphens between letters.
            # (HYPHENS is not imported above; re-enabling this line requires importing it.)
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            # One of : < > = / preceded by a letter or digit and followed by a letter.
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]
)

# Compile the patterns and install them as the tokenizer's infix matcher.
infix_re = compile_infix_regex(infixes)
nlp_md.tokenizer.infix_finditer = infix_re.finditer
# Quick check on a hyphenated word; note that this rebinds the notebook-level `doc`.
doc = nlp_md("mother-in-law")
print([t.text for t in doc]) # ['mother-in-law']

In [4]:
# Parse the sample text and show its lemmas, one inner list per sentence.
doc = nlp_md(raw_text)
[
    [token.lemma_ for token in sentence]
    for sentence in doc.sents
]

[['hello', ',', 'world', '.'], ['here', 'be', 'two', 'sentence', '.']]

In [5]:
# Check how a hyphenated compound ("left-wing") is tokenized and lemmatized.
[
    token.lemma_
    for token in nlp_md("He is a left-wing populist")
]


['he', 'be', 'a', 'left', '-', 'wing', 'populist']

In [6]:
# Test the pipeline on a single document

In [7]:
# Build the path of the filtered-text file for one article, selected by its pop_id.
pop_id = 17
path = "../data/large_data/articles_filteredtexts/"
# format() already renders the integer as text, so no explicit str() is needed.
file = "filteredtext_pop_id_{}.txt".format(pop_id)
filepath = "".join((path, file))
filepath

'../data/large_data/articles_filteredtexts/filteredtext_pop_id_17.txt'

In [8]:
# Read the article text. The encoding is given explicitly because the texts
# contain non-ASCII characters (accented names, dashes, ligatures); without it
# the result would depend on the platform's default encoding.
with open(filepath, "r", encoding="utf-8") as f:
    filteredtext = f.read()
# Replace line breaks with spaces so the text is one continuous string.
filteredtext = filteredtext.replace("\n", " ")

In [9]:
# Run the pipeline on the full article text.
doc = nlp_md(filteredtext)

In [10]:
# Show the named-entity spans found in the document.
doc.ents

(Western European,
 15,
 11,
 Western European,
 Eurosceptic,
 Western European,
 one,
 the United Kingdom Independence Party,
 Britain,
 the Front National (National Front,
 FN,
 France,
 Netherlands,
 Podemos,
 Spain,
 Die Linke,
 Germany,
 the Movimento Cinque Stelle,
 Five Star Movement,
 Italy,
 Weyland,
 2001,
 Mudde,
 2004,
 Mény,
 Surel,
 2002a,
 Albertazzi,
 McDonnell,
 2008,
 Akkerman et al.,
 2014,
 Pauwels,
 2011b,
 Mueller,
 2013,
 Mudde,
 Rovira Kaltwasser,
 2012,
 M.Rooduijn@uva.nl,
 European Political Science Review,
 2018,
 10,
 351–368,
 © European Consortium for Political Research,
 First,
 20 June 2017,
 351,
 University of West Bohemia,
 08 Feb 2022,
 11,
 Cambridge,
 Stanley, 2008,
 Van der Brug et al.,
 2000,
 2005,
 Lubbers et al.,
 2002,
 Ivarsﬂaten,
 2008,
 Zhirkov,
 2014,
 Lubbers,
 Scheepers,
 2007,
 Visser et al.,
 2014,
 Ramiro,
 2016,
 Mudde,
 2004,
 Stanley,
 2008,
 Hawkins,
 2010,
 Rovira Kaltwasser,
 2012,
 Taggart,
 2000,
 Rooduijn et al.,
 2014,
 onl

In [13]:
# Text span of every entity produced by the entity linker.
[
    linked.span
    for linked in doc._.linkedEntities
]

[scholars,
 populist parties,
 parties,
 ideologies,
 forms,
 styles,
 set,
 ideas,
 relationship,
 people,
 elite,
 knowledge,
 parties,
 voters,
 bases,
 populist parties,
 question,
 electorates,
 populist parties,
 countries,
 contrast,
 beliefs,
 electorates,
 populist parties,
 individuals,
 losers,
 globalization,
 attitudes,
 levels,
 trust,
 preferences,
 direct democracy,
 voter,
 Keywords,
 populism,
 voting behavior,
 politics,
 explosion,
 studies,
 populism,
 fact,
 countries,
 part,
 world,
 upsurge,
 populist parties,
 United Kingdom Independence Party,
 UKIP,
 Britain,
 Front National,
 National Front,
 FN,
 France,
 Party,
 Freedom,
 PVV,
 Netherlands,
 Podemos,
 Spain,
 Die,
 Left,
 Germany,
 Stelle,
 Five Star Movement,
 M5S,
 Italy,
 successes,
 parties,
 studies,
 populism,
 parties,
 Weyland,
 upsurge,
 Surel,
 Albertazzi,
 McDonnell,
 public opinion,
 al,
 .,
 Pauwels,
 making,
 Albertazzi,
 Mueller,
 liberal democracy,
 Rovira,
 studies,
 assumption,
 populist 

In [127]:
# Sentences mentioning "right-wing", each paired with its index in the document.
[
    (idx, sentence)
    for idx, sentence in enumerate(doc.sents)
    if re.search("right-wing", str(sentence))
]

[(75,
  As a set of ideas, populism can be attached to different ideologies, ranging from left-wing to right-wing and from progressive to conservative.),
 (114,
  Various studies have indeed found that populist parties positioned at the fringes of the political spectrum tend to express Eurosceptic attitudes (Hooghe et al., 2002; De Vries and Edwards, 2009; Halikiopoulou et al., 2012), and that supporters of such parties (both left- and right-wing) tend to be rather Eurosceptic as well (Lubbers and Scheepers, 2007; Werts et al., 2013; Visser et al., 2014; Ramiro, 2016).)]

In [134]:
# Pick sentence 75 for a closer look; 75 is the index of the first "right-wing"
# match reported by the sentence search earlier in the notebook.
sent = list(doc.sents)[75]
sent

As a set of ideas, populism can be attached to different ideologies, ranging from left-wing to right-wing and from progressive to conservative.

In [135]:
# Lemma and part-of-speech tag of each token in the selected sentence.
# In the recorded output the hyphenated words come out as 'left-we' / 'right-we'
# tagged VERB, i.e. keeping hyphenated words whole affects lemmatization here.
[
    (token.lemma_, token.pos_)
    for token in sent
]

[('as', 'ADP'),
 ('a', 'DET'),
 ('set', 'NOUN'),
 ('of', 'ADP'),
 ('idea', 'NOUN'),
 (',', 'PUNCT'),
 ('populism', 'NOUN'),
 ('can', 'AUX'),
 ('be', 'AUX'),
 ('attach', 'VERB'),
 ('to', 'ADP'),
 ('different', 'ADJ'),
 ('ideology', 'NOUN'),
 (',', 'PUNCT'),
 ('range', 'VERB'),
 ('from', 'ADP'),
 ('left-we', 'VERB'),
 ('to', 'ADP'),
 ('right-we', 'VERB'),
 ('and', 'CCONJ'),
 ('from', 'ADP'),
 ('progressive', 'ADJ'),
 ('to', 'ADP'),
 ('conservative', 'ADJ'),
 ('.', 'PUNCT')]

In [136]:
#lents_spans = []
#for lent in sent._.linkedEntities:
#    lents_spans.append(lent.get_label())

In [137]:
# Lemma/POS pairs grouped by sentence; only the first five sentences are shown.
[
    [(token.lemma_, token.pos_) for token in sentence]
    for sentence in doc.sents
][:5]

[[('various', 'ADJ'),
  ('scholar', 'NOUN'),
  ('have', 'AUX'),
  ('argue', 'VERB'),
  ('and', 'CCONJ'),
  ('demonstrate', 'VERB'),
  ('that', 'SCONJ'),
  ('western', 'ADJ'),
  ('european', 'ADJ'),
  ('populist', 'ADJ'),
  ('party', 'NOUN'),
  ('have', 'VERB'),
  ('something', 'PRON'),
  ('in', 'ADP'),
  ('common', 'ADJ'),
  ('.', 'PUNCT')],
 [('although', 'SCONJ'),
  ('these', 'DET'),
  ('party', 'NOUN'),
  ('adhere', 'VERB'),
  ('to', 'ADP'),
  ('various', 'ADJ'),
  ('ideology', 'NOUN'),
  ('and', 'CCONJ'),
  ('employ', 'VERB'),
  ('different', 'ADJ'),
  ('organizational', 'ADJ'),
  ('form', 'NOUN'),
  ('and', 'CCONJ'),
  ('political', 'ADJ'),
  ('style', 'NOUN'),
  (',', 'PUNCT'),
  ('they', 'PRON'),
  ('all', 'PRON'),
  ('endorse', 'VERB'),
  ('a', 'DET'),
  ('similar', 'ADJ'),
  ('set', 'NOUN'),
  ('of', 'ADP'),
  ('idea', 'NOUN'),
  ('concern', 'VERB'),
  ('the', 'DET'),
  ('relationship', 'NOUN'),
  ('between', 'ADP'),
  ('the', 'DET'),
  ('people', 'NOUN'),
  ('and', 'CCONJ'),


In [132]:
# Token vectors grouped by sentence; only the first five sentences are shown.
[
    [token.vector for token in sentence]
    for sentence in doc.sents
][:5]

[[array([-1.67    , -4.9127  , -0.84737 ,  1.3156  ,  2.4377  , -1.1667  ,
          1.0973  ,  3.0059  , -3.7047  , -2.8609  ,  4.2986  ,  1.3732  ,
         -4.0969  , -0.24876 ,  5.9641  ,  2.2787  ,  3.9737  , -0.71887 ,
         -2.6188  , -1.342   ,  0.43703 , -0.46998 , -1.1056  , -1.0305  ,
         -0.4089  , -0.99468 , -2.0724  , -0.47373 ,  2.0011  ,  0.22033 ,
          3.9885  , -1.4898  , -3.291   , -0.69662 , -3.0899  , -4.1344  ,
         -2.1981  ,  2.302   , -0.48438 ,  0.5318  ,  3.2337  , -0.35581 ,
         -3.1644  ,  0.81368 , -2.7954  ,  2.2718  , -2.8636  , -0.89675 ,
         -0.6052  ,  2.5521  ,  0.46531 ,  4.0467  , -3.1201  , -4.9923  ,
         -3.155   ,  2.686   , -0.90523 ,  2.3489  ,  3.376   , -3.2777  ,
          0.9984  , -1.4317  , -0.40785 ,  1.1404  ,  4.3066  ,  1.6046  ,
         -3.4259  , -5.0004  ,  1.634   ,  0.34518 , -2.2142  , -2.5172  ,
         -1.7819  , -0.81433 ,  0.24194 ,  0.54691 , -0.42265 , -1.5985  ,
          0.1162  ,  2.70

In [138]:
# Serialize the parsed document to disk. The with-block guarantees the file is
# flushed and closed, even if pickling raises, instead of leaving the handle
# open until garbage collection.
with open("../data/large_data/doc_test.pickle", "wb") as f:
    pickle.dump(doc, f)

In [139]:
# Test reading the pickled document back.
# The with-block closes the file handle as soon as loading is done.
# Note: pickle.load can execute arbitrary code, so only load files that this
# notebook (or another trusted source) produced.
with open("../data/large_data/doc_test.pickle", "rb") as f:
    doc = pickle.load(f)

# Apply the pipeline to all documents

In [141]:
# Collect the input files. Match on the ".txt" suffix rather than a substring:
# a substring test would also accept names such as "x.txt.bak", which the later
# ".txt" -> ".pickle" rename would turn into a malformed output name.
filenames = [
    f
    for f in os.listdir("../data/large_data/articles_filteredtexts")
    if f.endswith(".txt")
]
# Preview the first ten file names.
filenames[:10]

['filteredtext_pop_id_249.txt',
 'filteredtext_pop_id_261.txt',
 'filteredtext_pop_id_507.txt',
 'filteredtext_pop_id_513.txt',
 'filteredtext_pop_id_275.txt',
 'filteredtext_pop_id_18.txt',
 'filteredtext_pop_id_24.txt',
 'filteredtext_pop_id_30.txt',
 'filteredtext_pop_id_117.txt',
 'filteredtext_pop_id_103.txt']

In [142]:
# Number of article text files that will be processed.
len(filenames)

537

In [143]:
#!mkdir ../data/large_data/articles_spacydocs

In [144]:
%%time
sourcepath = "../data/large_data/articles_filteredtexts/"
destpath = "../data/large_data/articles_spacydocs/"

for filename in filenames:
    filepath = sourcepath + filename
    with open(filepath, "r") as f:
        filteredtext = f.read()
    filteredtext = filteredtext.replace("\n", " ")
    doc = nlp_md(filteredtext)
    newfilename = filename.replace("filteredtext", "spacydoc").replace(".txt", ".pickle")
    pickle.dump(doc, open(destpath + newfilename, "wb"))

CPU times: user 20min 54s, sys: 1min 11s, total: 22min 5s
Wall time: 23min 32s
