In [1]:
import os
import spacy
import pickle
import re
import nltk
from nltk.stem import WordNetLemmatizer

In [2]:
# using linked entities based on https://github.com/egerber/spaCy-entity-linker

In [3]:
raw_text = 'Hello, world. Here are two sentences.'
nlp_md = spacy.load('en_core_web_md')
nlp_md.add_pipe('sentencizer')
nlp_md.add_pipe("entityLinker", last=True)

<spacy_entity_linker.EntityLinker.EntityLinker at 0x2b231fee0>

In [4]:
from spacy.util import compile_infix_regex
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS

infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
            ),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            # ✅ Commented out regex that splits on hyphens between letters:
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]
)

infix_re = compile_infix_regex(infixes)
nlp_md.tokenizer.infix_finditer = infix_re.finditer
doc = nlp_md("mother-in-law")
print([t.text for t in doc]) # ['mother-in-law']

['mother-in-law']


In [5]:
doc = nlp_md(raw_text)
[[t.lemma_ for t in sent] for sent in doc.sents]

[['hello', ',', 'world', '.'], ['here', 'be', 'two', 'sentence', '.']]

In [8]:
[t.lemma_ for t in nlp_md("He is a left-wing populist")]

['he', 'be', 'a', 'left-we', 'populist']

In [9]:
# test with one document

In [10]:
pop_id = 17
path = "../data/large_data/articles_filteredtexts/"
file = "filteredtext_pop_id_{}.txt".format(str(pop_id))
filepath = path + file
filepath

'../data/large_data/articles_filteredtexts/filteredtext_pop_id_17.txt'

In [11]:
with open(filepath, "r") as f:
    filteredtext = f.read()
    filteredtext = filteredtext.replace("\n", " ")

In [12]:
doc = nlp_md(filteredtext)

In [13]:
doc.ents

(15,
 Utrecht University,
 Utrecht,
 Netherlands,
 Western European,
 15,
 11,
 Western European,
 Eurosceptic,
 Western European,
 one,
 the United Kingdom Independence Party,
 Britain,
 the Front National,
 National Front,
 France,
 the Partij voor de Vrijheid,
 Netherlands,
 Podemos,
 Spain,
 Germany,
 Five Star Movement,
 Italy,
 Weyland,
 2001,
 Mudde,
 2004,
 2002a,
 Albertazzi,
 McDonnell,
 2008,
 Akkerman et al.,
 2014,
 Pauwels,
 2011b,
 Mueller,
 2013,
 Mudde,
 Rovira Kaltwasser,
 2012,
 European Political Science Review,
 2018,
 10,
 351–368,
 © European Consortium for Political Research,
 First,
 20 June 2017,
 351,
 University of West Bohemia,
 08 Feb 2022,
 11,
 Stanley,
 2008,
 Van der Brug et al.,
 2000,
 2005,
 Lubbers et al.,
 2002,
 Ivarsﬂaten,
 2008,
 Zhirkov,
 2014,
 Scheepers,
 2007,
 Visser et al.,
 2014,
 Ramiro,
 2016,
 Mudde,
 2004,
 Stanley, 2008,
 Hawkins,
 2010,
 Rovira Kaltwasser,
 2012,
 Taggart,
 2000,
 Rooduijn et al.,
 2014,
 only one,
 Pauwels,
 2014,

In [14]:
[lent.span for lent in doc._.linkedEntities]

[bases,
 populist parties,
 electorates,
 populist parties,
 H,
 R,
 D,
 U,
 I,
 J,
 N,
 *,
 Sociology,
 Utrecht University,
 Utrecht,
 scholars,
 Netherlands,
 populist parties,
 parties,
 ideologies,
 forms,
 styles,
 set,
 ideas,
 relationship,
 people,
 elite,
 knowledge,
 parties,
 voters,
 bases,
 populist parties,
 question,
 electorates,
 populist parties,
 countries,
 contrast,
 beliefs,
 electorates,
 populist parties,
 individuals,
 losers,
 globalization,
 attitudes,
 levels,
 trust,
 preferences,
 direct democracy,
 voter,
 Keywords,
 populism,
 voting behavior,
 politics,
 explosion,
 studies,
 populism,
 fact,
 countries,
 part,
 world,
 upsurge,
 populist parties,
 United Kingdom Independence Party,
 UKIP,
 Britain,
 Front National,
 National Front,
 FN,
 France,
 Party,
 Freedom,
 PVV,
 Netherlands,
 Podemos,
 Spain,
 Linke,
 Left,
 Germany,
 Movimento Cinque Stelle,
 Five Star Movement,
 M5S,
 Italy,
 successes,
 parties,
 upsurge,
 Surel,
 Albertazzi,
 McDonnell,
 st

In [15]:
[(i, sent) for i, sent in enumerate(doc.sents) if re.search("populis", str(sent))]

[(0, What unites the voter bases of populist parties?),
 (1, Comparing the electorates of 15 populist parties  M A T T H),
 (6,
  I J N*  Department of Sociology, Utrecht University, Utrecht, The Netherlands  Various scholars have argued and demonstrated that Western European populist parties have something in common.),
 (8,
  Yet despite our increasing knowledge about these parties, so far we know only very little about populist voters.),
 (9, Do the voter bases of populist parties also have something in common?),
 (10,
  To answer that question, I focus on the electorates of 15 prototypical populist parties from 11 Western European countries.),
 (11,
  I show that, in contrast with widely held beliefs, the electorates of populist parties do not always consist of individuals who are more likely to be ‘losers of globalization’ with Eurosceptic attitudes, low levels of political trust, and preferences for (more) direct democracy.),
 (12, This suggests that ‘the’ populist voter does not 

In [16]:
[(i, sent) for i, sent in enumerate(doc.sents) if re.search("right-wing", str(sent))]

[(80,
  As a set of ideas, populism can be attached to different ideologies, ranging from left-wing to right-wing and from progressive to conservative.),
 (120,
  Various studies have indeed found that populist parties positioned at the fringes of the political spectrum tend to express Eurosceptic attitudes (Hooghe et al., 2002; De Vries and Edwards, 2009; Halikiopoulou et al., 2012), and that supporters of such parties (both left- and right-wing) tend to be rather Eurosceptic as well (Lubbers and Scheepers, 2007; Werts et al., 2013; Visser et al., 2014; Ramiro, 2016).)]

In [17]:
sent = list(doc.sents)[75]
sent

Populists are anti-elitist because they accuse the elite of being incompetent, arrogant, and/or selﬁsh (Canovan, 2002; Laclau, 2005; Barr, 2009).

# Filtering lemmata
(originally part of the next script...)

In [22]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [23]:
# postags = ["PROPN", "NOUN", "VERB", ]
def lemmata(spacyobject, filtered=True):
    if filtered==True:
        lemmata_list = []
        for t in spacyobject:
            if (t.is_stop==False) & (t.pos_ != "PROPN"):
                if (t.is_alpha==True):
                    lemmata_list.append(t.lemma_)
                else:
                    if "-" in t.text:
                        start, hyphen, end = str(t.text).rpartition("-")
                        new_lemma = "-".join([start, lemmatizer.lemmatize(end)]).lower()
                        lemmata_list.append(new_lemma)
        return  lemmata_list
    else:
        return  [t.lemma_ for t in spacyobject]
    # return [t.lemma_.lower() if t.pos=="PROPN" else t.lemma_ for t in spacyobject if (t.is_stop==False) & (t.is_alpha==True)]

In [26]:
sent

Populists are anti-elitist because they accuse the elite of being incompetent, arrogant, and/or selﬁsh (Canovan, 2002; Laclau, 2005; Barr, 2009).

In [25]:
lemmata(sent)

['populist',
 'anti-elitist',
 'accuse',
 'elite',
 'incompetent',
 'arrogant',
 'selﬁsh']

# apply to all documents

In [27]:
filenames = [f for f in os.listdir("../data/large_data/articles_filteredtexts") if ".txt" in f]
filenames[:10]

['filteredtext_pop_id_249.txt',
 'filteredtext_pop_id_261.txt',
 'filteredtext_pop_id_507.txt',
 'filteredtext_pop_id_513.txt',
 'filteredtext_pop_id_275.txt',
 'filteredtext_pop_id_18.txt',
 'filteredtext_pop_id_24.txt',
 'filteredtext_pop_id_30.txt',
 'filteredtext_pop_id_117.txt',
 'filteredtext_pop_id_103.txt']

In [33]:
destfiles_ready = [f for f in os.listdir("../data/large_data/articles_lemmata_min") if ".txt" in f]
len(destfiles_ready)

332

In [35]:
destfiles_ready[:5]

['lemmata_pop_id_277.txt',
 'lemmata_pop_id_511.txt',
 'lemmata_pop_id_505.txt',
 'lemmata_pop_id_263.txt',
 'lemmata_pop_id_539.txt']

In [36]:
len(filenames)

542

In [37]:
#!mkdir ../data/large_data/articles_spacydocs
#!mkdir ../data/large_data/articles_lemmata_min

In [38]:
%%time
sourcepath = "../data/large_data/articles_filteredtexts/"
#destpath = "../data/large_data/articles_spacydocs/"
destpath = "../data/large_data/articles_lemmata_min/"

issues = []
for filename in filenames:
    destfile_name = filename.replace("filteredtext", "lemmata")
    if destfile_name not in destfiles_ready:
        try:
            filepath = sourcepath + filename
            with open(filepath, "r") as f:
                filteredtext = f.read()
            filteredtext = filteredtext.replace("\n", " ")
            doc = nlp_md(filteredtext)
            lemmatized_sents = [" ".join(lemmata(sent)) for sent in doc.sents]
            with open(destpath + destfile_name, "w") as f:
                f.writelines("\n".join(lemmatized_sents))
            print(destfile_name)
        except:
            issues.append(filename)

lemmata_pop_id_48.txt
lemmata_pop_id_184.txt
lemmata_pop_id_190.txt
lemmata_pop_id_147.txt
lemmata_pop_id_153.txt
lemmata_pop_id_386.txt
lemmata_pop_id_392.txt
lemmata_pop_id_345.txt
lemmata_pop_id_423.txt
lemmata_pop_id_437.txt
lemmata_pop_id_351.txt
lemmata_pop_id_379.txt
lemmata_pop_id_378.txt
lemmata_pop_id_436.txt
lemmata_pop_id_350.txt
lemmata_pop_id_344.txt
lemmata_pop_id_422.txt
lemmata_pop_id_393.txt
lemmata_pop_id_387.txt
lemmata_pop_id_152.txt
lemmata_pop_id_146.txt
lemmata_pop_id_191.txt
lemmata_pop_id_185.txt
lemmata_pop_id_49.txt
lemmata_pop_id_61.txt
lemmata_pop_id_75.txt
lemmata_pop_id_218.txt
lemmata_pop_id_224.txt
lemmata_pop_id_230.txt
lemmata_pop_id_226.txt
lemmata_pop_id_540.txt
lemmata_pop_id_232.txt
lemmata_pop_id_88.txt
lemmata_pop_id_63.txt
lemmata_pop_id_77.txt
lemmata_pop_id_193.txt
lemmata_pop_id_187.txt
lemmata_pop_id_150.txt
lemmata_pop_id_144.txt
lemmata_pop_id_178.txt
lemmata_pop_id_391.txt
lemmata_pop_id_385.txt
lemmata_pop_id_352.txt
lemmata_pop_id_434

In [39]:
len(issues)

2

In [40]:
issues

['filteredtext_pop_id_60.txt', 'filteredtext_pop_id_469.txt']

In [43]:
filename = issues[0]
filepath = sourcepath + filename
with open(filepath, "r") as f:
    filteredtext = f.read()
filteredtext = filteredtext.replace("\n", " ")
#doc = nlp_md(filteredtext)
#lemmatized_sents = [" ".join(lemmata(sent)) for sent in doc.sents]
#with open(destpath + destfile_name, "w") as f:
#    f.writelines("\n".join(lemmatized_sents))


In [44]:
filteredtext

'Journal of Democracy, Volume 30, Number 4, October 2019, pp. 35-47 (Article)  Published by Johns Hopkins University Press DOI:  For additional information about this article  Access provided at 15 Oct 2019 08: GMT with no institutional affiliation  https://doi.org/10./jod.. Anna Grzymala-Busse is Michelle and Kevin Douglas Professor of  International Studies at Stanford University and senior fellow at Stanford’s Freeman Spogli Institute for International Studies. She is author, most recently, of Nations Under God: How Churches Use Moral  Authority to Influence Policy (2015).  Populists are everywhere. They win elections and found successful  new political parties; they are the subject of hand-wringing by newspaper columnists and pundits; and they are the focus of a cottage industry  of academic experts and analysts that mushroomed after the 2016 Brexit  vote and the election of U.S. president Donald Trump. After a wave of  support for populists swept across Europe in the 2010s, analys

In [47]:
filteredtext

'Journal of Democracy, Volume 30, Number 4, October 2019, pp. 35-47 (Article)  Published by Johns Hopkins University Press DOI:  For additional information about this article  Access provided at 15 Oct 2019 08: GMT with no institutional affiliation  https://doi.org/10./jod.. Anna Grzymala-Busse is Michelle and Kevin Douglas Professor of  International Studies at Stanford University and senior fellow at Stanford’s Freeman Spogli Institute for International Studies. She is author, most recently, of Nations Under God: How Churches Use Moral  Authority to Influence Policy (2015).  Populists are everywhere. They win elections and found successful  new political parties; they are the subject of hand-wringing by newspaper columnists and pundits; and they are the focus of a cottage industry  of academic experts and analysts that mushroomed after the 2016 Brexit  vote and the election of U.S. president Donald Trump. After a wave of  support for populists swept across Europe in the 2010s, analys

In [45]:
doc = nlp_md(filteredtext)

IndexError: list index out of range