In [1]:
import os
import spacy
import pickle
import re
from nltk.stem import WordNetLemmatizer

In [2]:
lemmatizer = WordNetLemmatizer()

In [3]:
nlp_md = spacy.load('en_core_web_lg')
nlp_md.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x2bd429280>

In [4]:
filenames = [f for f in os.listdir("../data/large_data/articles_spacydocs") if ".pickle" in f]
filenames[:10]

['spacydoc_pop_id_28.pickle',
 'spacydoc_pop_id_199.pickle',
 'spacydoc_pop_id_253.pickle',
 'spacydoc_pop_id_502.pickle',
 'spacydoc_pop_id_55.pickle',
 'spacydoc_pop_id_353.pickle',
 'spacydoc_pop_id_402.pickle',
 'spacydoc_pop_id_36.pickle',
 'spacydoc_pop_id_230.pickle',
 'spacydoc_pop_id_187.pickle']

In [5]:
sourcepath = "../data/large_data/articles_spacydocs/"

In [6]:
filepath = sourcepath + "spacydoc_pop_id_17.pickle"

In [7]:
#with open(filepath, "rb") as f:
#    doc = pickle.load(f)
# test reading it back
doc = pickle.load(open("../data/large_data/doc_test.pickle", "rb"))

In [8]:
str(doc).count("right-wing")

2

In [9]:
[(i, sent) for i, sent in enumerate(doc.sents) if re.search("right-wing", str(sent))]

[(75,
  As a set of ideas, populism can be attached to different ideologies, ranging from left-wing to right-wing and from progressive to conservative.),
 (114,
  Various studies have indeed found that populist parties positioned at the fringes of the political spectrum tend to express Eurosceptic attitudes (Hooghe et al., 2002; De Vries and Edwards, 2009; Halikiopoulou et al., 2012), and that supporters of such parties (both left- and right-wing) tend to be rather Eurosceptic as well (Lubbers and Scheepers, 2007; Werts et al., 2013; Visser et al., 2014; Ramiro, 2016).)]

In [10]:
sent = list(doc.sents)[75]
sent

As a set of ideas, populism can be attached to different ideologies, ranging from left-wing to right-wing and from progressive to conservative.

In [11]:
print([t.text for t in sent])

['As', 'a', 'set', 'of', 'ideas', ',', 'populism', 'can', 'be', 'attached', 'to', 'different', 'ideologies', ',', 'ranging', 'from', 'left-wing', 'to', 'right-wing', 'and', 'from', 'progressive', 'to', 'conservative', '.']


In [12]:
lemmata_list = []
for t in sent:
    if t.is_stop==False:
        if (t.is_alpha==True):
            lemmata_list.append(t.lemma_)
        else:
            if "-" in t.text:
                start, hyphen, end = str(t.text).rpartition("-")
                new_lemma = "-".join([start, lemmatizer.lemmatize(end)])
                lemmata_list.append(new_lemma)
lemmata_list

['set',
 'idea',
 'populism',
 'attach',
 'different',
 'ideology',
 'range',
 'left-wing',
 'right-wing',
 'progressive',
 'conservative']

In [13]:
for ent in sent.ents:
    if ent.label_ == "PERSON":
        print(ent[-1].lemma_)

In [14]:
#ent = [ent for ent in sent.ents][0]

In [15]:
# postags = ["PROPN", "NOUN", "VERB", ]
def lemmata(spacyobject, filtered=True):
    if filtered==True:
        lemmata_list = []
        for t in sent:
            if t.is_stop==False:
                if (t.is_alpha==True):
                    lemmata_list.append(t.lemma_)
                else:
                    if "-" in t.text:
                        start, hyphen, end = str(t.text).rpartition("-")
                        new_lemma = "-".join([start, lemmatizer.lemmatize(end)])
                        lemmata_list.append(new_lemma)
        return  lemmata_list
    else:
        return  [t.lemma_ for t in spacyobject]
    # return [t.lemma_.lower() if t.pos=="PROPN" else t.lemma_ for t in spacyobject if (t.is_stop==False) & (t.is_alpha==True)]

In [16]:
lemmata(sent)

['set',
 'idea',
 'populism',
 'attach',
 'different',
 'ideology',
 'range',
 'left-wing',
 'right-wing',
 'progressive',
 'conservative']

In [17]:
ents_inds = []
entstart_dict = {}
for ent in sent.ents:
    ents_inds.extend(range(ent.start + 1, ent.end))
    entstart_dict[ent.start] = "_".join(lemmata(ent, filtered=False))

In [18]:
entstart_dict

{}

In [19]:
ents_inds

[]

In [20]:
sent

As a set of ideas, populism can be attached to different ideologies, ranging from left-wing to right-wing and from progressive to conservative.

In [21]:
def get_sentence_lemmata(sent):
    ents_inds = []
    entstart_dict = {}
    for ent in sent.ents:
        ents_inds.extend(range(ent.start + 1, ent.end))
        if ent.label_ == "PERSON":
            entstart_dict[ent.start] = ent[-1].lemma_
        else:
            entstart_dict[ent.start] = "_".join(lemmata(ent, filtered=False))
    sentence_lemmata = []
    for i in range(sent.start, sent.end):
        if i in entstart_dict.keys():
            sentence_lemmata.append(entstart_dict[i])
        else:
            if i not in ents_inds:
                t = sent[i - sent.start]
                if t.is_stop==False:
                    if (t.is_alpha==True):
                        sentence_lemmata.append(t.lemma_)
                    else:
                        if "-" in t.text:
                            start, hyphen, end = str(t.text).rpartition("-")
                            new_lemma = "-".join([start, lemmatizer.lemmatize(end)])
                            sentence_lemmata.append(new_lemma)
    return sentence_lemmata

In [22]:
sent

As a set of ideas, populism can be attached to different ideologies, ranging from left-wing to right-wing and from progressive to conservative.

In [23]:
get_sentence_lemmata(sent)

['set',
 'idea',
 'populism',
 'attach',
 'different',
 'ideology',
 'range',
 'left-wing',
 'right-wing',
 'progressive',
 'conservative']

In [24]:
lemmatized_sents = [" ".join(get_sentence_lemmata(sent)) for sent in doc.sents]
lemmatized_sents[:5]

['scholar argue demonstrate western_european populist party common',
 'party adhere ideology employ different organizational form political style endorse similar set idea concern relationship people elite',
 'despite increase knowledge party far know little populist voter',
 'voter basis populist party common',
 'answer question focus electorate 15 prototypical populist party 11 western_european country']

In [139]:
with open("testfile.txt", "w") as f:
    f.writelines("\n".join(lemmatized_sents))

In [142]:
!mkdir ../data/large_data/articles_lemmata

mkdir: ../data/large_data/articles_lemmata: File exists


In [143]:
filenames[0]

'spacydoc_pop_id_28.pickle'

In [25]:
sourcepath = "../data/large_data/articles_spacydocs/"
destpath = "../data/large_data/articles_lemmata/"
for filename in filenames:
    filepath = sourcepath + filename
    with open(filepath, "rb") as f:
        doc = pickle.load(f)
    lemmatized_sents = [" ".join(get_sentence_lemmata(sent)) for sent in doc.sents]
    with open(destpath + filename.replace("spacydoc", "lemmata").replace(".pickle", ".txt"), "w") as f:
        f.writelines("\n".join(lemmatized_sents))