In [32]:
import os
import spacy
import pickle
import re

In [56]:
nlp_md = spacy.load('en_core_web_lg')
nlp_md.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x2a4e51680>

In [46]:
from spacy.util import compile_infix_regex
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS

infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
            ),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            # ✅ Commented out regex that splits on hyphens between letters:
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]
)

infix_re = compile_infix_regex(infixes)
nlp_md.tokenizer.infix_finditer = infix_re.finditer
doc = nlp_md("mother-in-law")
print([t.text for t in doc]) # ['mother-in-law']

['mother-in-law']


In [47]:
filenames = [f for f in os.listdir("../data/large_data/articles_spacydocs") if ".pickle" in f]
filenames[:10]

['spacydoc_pop_id_28.pickle',
 'spacydoc_pop_id_199.pickle',
 'spacydoc_pop_id_253.pickle',
 'spacydoc_pop_id_502.pickle',
 'spacydoc_pop_id_55.pickle',
 'spacydoc_pop_id_353.pickle',
 'spacydoc_pop_id_402.pickle',
 'spacydoc_pop_id_36.pickle',
 'spacydoc_pop_id_230.pickle',
 'spacydoc_pop_id_187.pickle']

In [48]:
sourcepath = "../data/large_data/articles_spacydocs/"

In [49]:
filepath = sourcepath + "spacydoc_pop_id_17.pickle"

In [50]:
with open(filepath, "rb") as f:
    doc = pickle.load(f)

In [51]:
str(doc).count("right-wing")

2

In [52]:
[(i, sent) for i, sent in enumerate(doc.sents) if re.search("right-wing", str(sent))]

[(78,
  As a set of ideas, populism can be attached to different ideologies, ranging from left-wing to right-wing and from progressive to conservative.),
 (123,
  Various studies have indeed found that populist parties positioned at the fringes of the political spectrum tend to express Eurosceptic attitudes (Hooghe et al., 2002; De Vries and Edwards, 2009; Halikiopoulou et al., 2012), and that supporters of such parties (both left- and right-wing) tend to be rather Eurosceptic as well (Lubbers and Scheepers, 2007; Werts et al., 2013; Visser et al., 2014; Ramiro, 2016).)]

In [55]:
raw_sent = str(list(doc.sents)[78])
raw_sent

'As a set of ideas, populism can be attached to different ideologies, ranging from left-wing to right-wing and from progressive to conservative.'

In [61]:
print([t.lemma_ for t in nlp_md(raw_sent)])

['as', 'a', 'set', 'of', 'idea', ',', 'populism', 'can', 'be', 'attach', 'to', 'different', 'ideology', ',', 'range', 'from', 'left', '-', 'wing', 'to', 'right', '-', 'wing', 'and', 'from', 'progressive', 'to', 'conservative', '.']


In [69]:
[t.lemma_ for t in nlp_md("He is a left-wing populist")]

['he', 'be', 'a', 'left', '-', 'wing', 'populist']

In [53]:
[(t.lemma_, t.pos_,) for t in nlp_md(str(list(doc.sents)[78]))]

[('as', 'ADP'),
 ('a', 'DET'),
 ('set', 'NOUN'),
 ('of', 'ADP'),
 ('idea', 'NOUN'),
 (',', 'PUNCT'),
 ('populism', 'NOUN'),
 ('can', 'AUX'),
 ('be', 'AUX'),
 ('attach', 'VERB'),
 ('to', 'ADP'),
 ('different', 'ADJ'),
 ('ideology', 'NOUN'),
 (',', 'PUNCT'),
 ('range', 'VERB'),
 ('from', 'ADP'),
 ('left-we', 'VERB'),
 ('to', 'ADP'),
 ('right-we', 'VERB'),
 ('and', 'CCONJ'),
 ('from', 'ADP'),
 ('progressive', 'ADJ'),
 ('to', 'ADP'),
 ('conservative', 'ADJ'),
 ('.', 'PUNCT')]

In [7]:
sent = [sent for sent in doc.sents][0]
sent

Various scholars have argued and demonstrated that Western European populist parties have something in common.

In [8]:
sent.ents

[Western European]

In [9]:
for ent in sent.ents:
    if ent.label_ == "PERSON":
        print(ent[-1].lemma_)

In [10]:
#ent = [ent for ent in sent.ents][0]

In [11]:
# postags = ["PROPN", "NOUN", "VERB", ]
def lemmata(spacyobject, filtered=True):
    if filtered==True:
        return  [t.lemma_ for t in spacyobject if (t.is_stop==False) & (t.is_alpha==True)]
    else:
        return  [t.lemma_ for t in spacyobject]
    # return [t.lemma_.lower() if t.pos=="PROPN" else t.lemma_ for t in spacyobject if (t.is_stop==False) & (t.is_alpha==True)]

In [12]:
lemmata(sent)

['scholar',
 'argue',
 'demonstrate',
 'western',
 'european',
 'populist',
 'party',
 'common']

In [66]:
[ent for ent in sent.ents]

[Western European]

In [67]:
ents_inds = []
entstart_dict = {}
for ent in sent.ents:
    ents_inds.extend(range(ent.start + 1, ent.end))
    entstart_dict[ent.start] = "_".join(lemmata(ent, filtered=False))

In [68]:
entstart_dict

{7: 'western_european'}

In [69]:
ents_inds

[8]

In [70]:
sent

Various scholars have argued and demonstrated that Western European populist parties have something in common.

In [71]:
def get_sentence_lemmata(sent):
    ents_inds = []
    entstart_dict = {}
    for ent in sent.ents:
        ents_inds.extend(range(ent.start + 1, ent.end))
        if ent.label_ == "PERSON":
            entstart_dict[ent.start] = ent[-1].lemma_
        else:
            entstart_dict[ent.start] = "_".join(lemmata(ent, filtered=False))
    sentence_lemmata = []
    for i in range(sent.start, sent.end):
        if i in entstart_dict.keys():
            sentence_lemmata.append(entstart_dict[i])
        else:
            if i not in ents_inds:
                t = sent[i - sent.start]
                if (t.is_stop==False) & (t.is_alpha==True):
                    sentence_lemmata.append(t.lemma_)
    return sentence_lemmata

In [72]:
sent

Various scholars have argued and demonstrated that Western European populist parties have something in common.

In [73]:
get_sentence_lemmata(sent)

['scholar',
 'argue',
 'demonstrate',
 'western_european',
 'populist',
 'party',
 'common']

In [74]:
lemmatized_sents = [" ".join(get_sentence_lemmata(sent)) for sent in doc.sents]
lemmatized_sents[:5]

['scholar argue demonstrate western_european populist party common',
 'party adhere ideology employ different organizational form political style endorse similar set idea concern relationship people elite',
 'despite increase knowledge party far know little populist voter',
 'voter basis populist party common',
 'answer question focus electorate 15 prototypical populist party 11 western_european country']

In [75]:
with open("testfile.txt", "w") as f:
    f.writelines("\n".join(lemmatized_sents))

In [76]:
!mkdir ../data/large_data/articles_lemmata

mkdir: ../data/large_data/articles_lemmata: File exists


In [77]:
filenames[0]

'spacydoc_pop_id_28.pickle'

In [78]:
sourcepath = "../data/large_data/articles_spacydocs/"
destpath = "../data/large_data/articles_lemmata/"
for filename in filenames:
    filepath = sourcepath + filename
    with open(filepath, "rb") as f:
        doc = pickle.load(f)
    lemmatized_sents = [" ".join(get_sentence_lemmata(sent)) for sent in doc.sents]
    with open(destpath + filename.replace("spacydoc", "lemmata").replace(".pickle", ".txt"), "w") as f:
        f.writelines("\n".join(lemmatized_sents))