In [126]:
import os
import spacy
import pickle
import re
import nltk
from nltk.stem import WordNetLemmatizer

In [127]:
# using linked entities based on https://github.com/egerber/spaCy-entity-linker

In [162]:
raw_text = 'Hello, world. Here are two sentences.'
nlp_md = spacy.load('en_core_web_md')
nlp_md.add_pipe('sentencizer')
#nlp_md.add_pipe("entityLinker", last=True)

<spacy.pipeline.sentencizer.Sentencizer at 0x32a7bbf80>

In [129]:
from spacy.util import compile_infix_regex
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS

infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
            ),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            # ✅ Commented out regex that splits on hyphens between letters:
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]
)

infix_re = compile_infix_regex(infixes)
nlp_md.tokenizer.infix_finditer = infix_re.finditer
doc = nlp_md("mother-in-law")
print([t.text for t in doc]) # ['mother-in-law']

['mother-in-law']


In [130]:
doc = nlp_md(raw_text)
[[t.lemma_ for t in sent] for sent in doc.sents]

[['hello', ',', 'world', '.'], ['here', 'be', 'two', 'sentence', '.']]

In [131]:
[t.lemma_ for t in nlp_md("He is a left-wing populist")]

['he', 'be', 'a', 'left-we', 'populist']

In [132]:
# test with one document

In [133]:
pop_id = 17
path = "../data/large_data/articles_filteredtexts/"
file = "filteredtext_pop_id_{}.txt".format(str(pop_id))
filepath = path + file
filepath

'../data/large_data/articles_filteredtexts/filteredtext_pop_id_17.txt'

In [134]:
with open(filepath, "r") as f:
    filteredtext = f.read()
    filteredtext = filteredtext.replace("\n", " ")

In [135]:
doc = nlp_md(filteredtext)

In [136]:
doc.ents

(15,
 Department of Sociology,
 Utrecht University,
 Utrecht,
 Netherlands,
 Western European,
 15,
 11,
 Western European,
 Eurosceptic,
 Western European,
 one,
 the United Kingdom Independence Party,
 Britain,
 the Front National,
 National Front,
 France,
 the Partij voor de Vrijheid,
 Netherlands,
 Podemos,
 Spain,
 Germany,
 Five Star Movement,
 Italy,
 Weyland,
 2001,
 Mudde,
 2004,
 2002a,
 Albertazzi,
 McDonnell,
 2008,
 Akkerman et al.,
 2014,
 Pauwels,
 2011b,
 Mueller,
 2013,
 Mudde,
 Rovira Kaltwasser,
 2012,
 European Political Science Review,
 2018,
 10,
 351–368,
 © European Consortium for Political Research,
 First,
 20 June 2017,
 University of West Bohemia,
 08 Feb 2022,
 11,
 Stanley,
 2008,
 Van der Brug et al.,
 2000,
 2005,
 Lubbers et al.,
 2002,
 Ivarsﬂaten,
 2008,
 Zhirkov,
 2014,
 Scheepers,
 2007,
 Visser et al.,
 2014,
 Ramiro,
 2016,
 Mudde,
 2004,
 Stanley, 2008,
 Hawkins,
 2010,
 Rovira Kaltwasser,
 2012,
 Taggart,
 2000,
 Rooduijn et al.,
 2014,
 only o

In [137]:
[lent.span for lent in doc._.linkedEntities]

[bases,
 populist parties,
 electorates,
 Department,
 Sociology,
 Utrecht University,
 Utrecht,
 scholars,
 Netherlands,
 populist parties,
 parties,
 ideologies,
 forms,
 styles,
 set,
 ideas,
 relationship,
 people,
 elite,
 knowledge,
 parties,
 voters,
 bases,
 populist parties,
 question,
 electorates,
 populist parties,
 countries,
 contrast,
 beliefs,
 electorates,
 populist parties,
 individuals,
 losers,
 globalization,
 attitudes,
 levels,
 trust,
 preferences,
 direct democracy,
 voter,
 Keywords,
 populism,
 voting behavior,
 politics,
 explosion,
 studies,
 populism,
 fact,
 countries,
 part,
 world,
 upsurge,
 populist parties,
 United Kingdom Independence Party,
 UKIP,
 Britain,
 Front National,
 National Front,
 FN,
 France,
 Party,
 Freedom,
 PVV,
 Netherlands,
 Podemos,
 Spain,
 Linke,
 Left,
 Germany,
 Movimento Cinque Stelle,
 Five Star Movement,
 M5S,
 Italy,
 successes,
 parties,
 upsurge,
 Surel,
 Albertazzi,
 McDonnell,
 studies,
 populism,
 Weyland,
 parties,


In [138]:
[(i, sent) for i, sent in enumerate(doc.sents) if re.search("populis", str(sent))]

[(0, What unites the voter bases of populist parties?),
 (1,
  Comparing the electorates of 15 populist parties  Department of Sociology, Utrecht University, Utrecht, The Netherlands  Various scholars have argued and demonstrated that Western European populist parties have something in common.),
 (3,
  Yet despite our increasing knowledge about these parties, so far we know only very little about populist voters.),
 (4, Do the voter bases of populist parties also have something in common?),
 (5,
  To answer that question, I focus on the electorates of 15 prototypical populist parties from 11 Western European countries.),
 (6,
  I show that, in contrast with widely held beliefs, the electorates of populist parties do not always consist of individuals who are more likely to be ‘losers of globalization’ with Eurosceptic attitudes, low levels of political trust, and preferences for (more) direct democracy.),
 (7, This suggests that ‘the’ populist voter does not exist.  ),
 (8,
  Keywords: 

In [139]:
[(i, sent) for i, sent in enumerate(doc.sents) if re.search("right-wing", str(sent))]

[(72,
  As a set of ideas, populism can be attached to different ideologies, ranging from left-wing to right-wing and from progressive to conservative.),
 (108,
  Various studies have indeed found that populist parties positioned at the fringes of the political spectrum tend to express Eurosceptic attitudes (Hooghe et al., 2002; De Vries and Edwards, 2009; Halikiopoulou et al., 2012), and that supporters of such parties (both left- and right-wing) tend to be rather Eurosceptic as well (Lubbers and Scheepers, 2007; Werts et al., 2013; Visser et al., 2014; Ramiro, 2016).)]

In [140]:
sent = list(doc.sents)[75]
sent

If we aim at making inferences about what the electorates of populist parties in general have in common, however, relying on these studies will lead to ﬂawed conclusions for two main reasons: (1) these analyses are not inclusive enough because many other (non-radical) parties that might also express a populist message are excluded from the analysis; and (2) these analyses are contaminated by the speciﬁc ideological positions of the voters for these particular party families (e.g. anti-immigrant attitudes when it comes to radical right parties and attitudes toward welfare redistribution when it comes to radical left parties).  

# Filtering lemmata
(originally part of the next script...)

In [141]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [142]:
# postags = ["PROPN", "NOUN", "VERB", ]
def lemmata(spacyobject, filtered=True):
    if filtered==True:
        lemmata_list = []
        for t in spacyobject:
            if (t.is_stop==False) & (t.pos_ != "PROPN"):
                if (t.is_alpha==True):
                    lemmata_list.append(t.lemma_)
                else:
                    if "-" in t.text:
                        start, hyphen, end = str(t.text).rpartition("-")
                        new_lemma = "-".join([start, lemmatizer.lemmatize(end)]).lower()
                        if len(new_lemma) > 3:
                            lemmata_list.append(new_lemma)
        return  lemmata_list
    else:
        return  [t.lemma_ for t in spacyobject]
    # return [t.lemma_.lower() if t.pos=="PROPN" else t.lemma_ for t in spacyobject if (t.is_stop==False) & (t.is_alpha==True)]

In [143]:
sent

If we aim at making inferences about what the electorates of populist parties in general have in common, however, relying on these studies will lead to ﬂawed conclusions for two main reasons: (1) these analyses are not inclusive enough because many other (non-radical) parties that might also express a populist message are excluded from the analysis; and (2) these analyses are contaminated by the speciﬁc ideological positions of the voters for these particular party families (e.g. anti-immigrant attitudes when it comes to radical right parties and attitudes toward welfare redistribution when it comes to radical left parties).  

In [144]:
lemmata(sent)

['aim',
 'make',
 'inference',
 'electorate',
 'populist',
 'party',
 'general',
 'common',
 'rely',
 'study',
 'lead',
 'ﬂawed',
 'conclusion',
 'main',
 'reason',
 'analysis',
 'inclusive',
 'non-radical',
 'party',
 'express',
 'populist',
 'message',
 'exclude',
 'analysis',
 'analysis',
 'contaminate',
 'speciﬁc',
 'ideological',
 'position',
 'voter',
 'particular',
 'party',
 'family',
 'anti-immigrant',
 'attitude',
 'come',
 'radical',
 'right',
 'party',
 'attitude',
 'welfare',
 'redistribution',
 'come',
 'radical',
 'left',
 'party']

# apply to all documents

In [145]:
filenames = [f for f in os.listdir("../data/large_data/articles_filteredtexts") if ".txt" in f]
filenames[:10]

['filteredtext_pop_id_249.txt',
 'filteredtext_pop_id_261.txt',
 'filteredtext_pop_id_507.txt',
 'filteredtext_pop_id_513.txt',
 'filteredtext_pop_id_275.txt',
 'filteredtext_pop_id_18.txt',
 'filteredtext_pop_id_24.txt',
 'filteredtext_pop_id_30.txt',
 'filteredtext_pop_id_117.txt',
 'filteredtext_pop_id_103.txt']

In [146]:
destfiles_ready = [] # [f for f in os.listdir("../data/large_data/articles_lemmata_min") if ".txt" in f]
len(destfiles_ready)

0

In [147]:
destfiles_ready[:5]

[]

In [148]:
len(filenames)

539

In [149]:
#!mkdir ../data/large_data/articles_spacydocs
#!mkdir ../data/large_data/articles_lemmata_min

In [150]:
%%time
sourcepath = "../data/large_data/articles_filteredtexts/"
#destpath = "../data/large_data/articles_spacydocs/"
destpath = "../data/large_data/articles_lemmata_min/"

issues = []
for filename in filenames:
    destfile_name = filename.replace("filteredtext", "lemmata")
    if destfile_name not in destfiles_ready:
        try:
            filepath = sourcepath + filename
            with open(filepath, "r") as f:
                filteredtext = f.read()
            filteredtext = filteredtext.replace("\n", " ")
            doc = nlp_md(filteredtext)
            lemmatized_sents = [" ".join(lemmata(sent)) for sent in doc.sents]
            with open(destpath + destfile_name, "w") as f:
                f.writelines("\n".join(lemmatized_sents))
            print(destfile_name)
        except:
            issues.append(filename)

lemmata_pop_id_249.txt
lemmata_pop_id_261.txt
lemmata_pop_id_507.txt
lemmata_pop_id_513.txt
lemmata_pop_id_275.txt
lemmata_pop_id_18.txt
lemmata_pop_id_24.txt
lemmata_pop_id_30.txt
lemmata_pop_id_117.txt
lemmata_pop_id_103.txt
lemmata_pop_id_498.txt
lemmata_pop_id_329.txt
lemmata_pop_id_315.txt
lemmata_pop_id_473.txt
lemmata_pop_id_467.txt
lemmata_pop_id_301.txt
lemmata_pop_id_466.txt
lemmata_pop_id_300.txt
lemmata_pop_id_314.txt
lemmata_pop_id_472.txt
lemmata_pop_id_328.txt
lemmata_pop_id_499.txt
lemmata_pop_id_102.txt
lemmata_pop_id_116.txt
lemmata_pop_id_31.txt
lemmata_pop_id_25.txt
lemmata_pop_id_19.txt
lemmata_pop_id_512.txt
lemmata_pop_id_274.txt
lemmata_pop_id_260.txt
lemmata_pop_id_506.txt
lemmata_pop_id_248.txt
lemmata_pop_id_289.txt
lemmata_pop_id_538.txt
lemmata_pop_id_276.txt
lemmata_pop_id_510.txt
lemmata_pop_id_504.txt
lemmata_pop_id_262.txt
lemmata_pop_id_33.txt
lemmata_pop_id_27.txt
lemmata_pop_id_128.txt
lemmata_pop_id_100.txt
lemmata_pop_id_114.txt
lemmata_pop_id_458.

In [152]:
filename = issues[0]

In [163]:
filepath = sourcepath + filename
destfile_name = filename.replace("filteredtext", "lemmata")
with open(filepath, "r") as f:
    filteredtext = f.read()
filteredtext = filteredtext.replace("\n", " ")
doc = nlp_md(filteredtext)
lemmatized_sents = [" ".join(lemmata(sent)) for sent in doc.sents]
with open(destpath + destfile_name, "w") as f:
    f.writelines("\n".join(lemmatized_sents))
print(destfile_name)

lemmata_pop_id_468.txt
