In [1]:
import os
import spacy
import pickle

In [2]:
# Collect the pickled spaCy docs. Match on the file extension rather than a
# substring so names such as "x.pickle.bak" are not picked up by accident.
# NOTE: os.listdir returns entries in arbitrary order, so filenames[0] is not
# guaranteed to be the same file on every machine / run.
filenames = [f for f in os.listdir("../data/large_data/articles_spacydocs") if f.endswith(".pickle")]
filenames[:10]

['spacydoc_pop_id_28.pickle',
 'spacydoc_pop_id_199.pickle',
 'spacydoc_pop_id_253.pickle',
 'spacydoc_pop_id_502.pickle',
 'spacydoc_pop_id_55.pickle',
 'spacydoc_pop_id_353.pickle',
 'spacydoc_pop_id_402.pickle',
 'spacydoc_pop_id_36.pickle',
 'spacydoc_pop_id_230.pickle',
 'spacydoc_pop_id_187.pickle']

In [3]:
# Directory containing the pickled spaCy docs. The trailing slash matters:
# file names are appended directly to this string to build full paths.
sourcepath = "../data/large_data/articles_spacydocs/"

In [13]:
# Full path of the first listed pickle, used as the working example below.
filepath = f"{sourcepath}{filenames[0]}"

In [14]:
# Deserialize the example document.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted pipeline.
with open(filepath, mode="rb") as f:
    doc = pickle.load(f)

In [15]:
# Show the loaded document (its repr is the article text).
doc

Duncan McCargo is professor of Southeast Asian politics at the University of Leeds. His Tearing Apart the Land: Islam and Legitimacy in Southern Thailand (2008) won the inaugural 2009 Bernard Schwartz Book Prize from the Asia Society. Ayºe Zarakol is assistant professor of politics at Washington and Lee University and the author of After Defeat: How the East Learned to Live with the West (2011). In the middle of 2011, both Turkey and Thailand held national elections.1 While Turkish premier Recep Tayyip Erdo¢gan claimed his third popular mandate, his self-exiled Thai doppelgänger Thaksin Shinawatra—whose parties have now won five general elections since 2001—saw his younger sister and stand-in Yingluck Shinawatra assume the premiership. Virtually no observers, however, noticed the synchronicity of two politicians with similar governance styles, party organizations, and societal bases besting military-bureaucratic establishments on the two flanks of Asia. At first glance, the two countri

In [16]:
# Materialize the sentence iterator and keep the fourth sentence (index 3)
# as the running example for the cells below.
sent = list(doc.sents)[3]
sent

In the middle of 2011, both Turkey and Thailand held national elections.1 While Turkish premier Recep Tayyip Erdo¢gan claimed his third popular mandate, his self-exiled Thai doppelgänger Thaksin Shinawatra—whose parties have now won five general elections since 2001—saw his younger sister and stand-in Yingluck Shinawatra assume the premiership.

In [17]:
# Entity spans that fall inside the example sentence.
sent.ents

[the middle of 2011,
 Turkey,
 Thailand,
 Turkish,
 Recep Tayyip Erdo¢gan,
 third,
 Thai,
 Thaksin Shinawatra,
 five,
 2001,
 Yingluck Shinawatra]

In [18]:
# First entity of the example sentence.
ent = list(sent.ents)[0]

In [19]:
# Lemma of every token inside the entity span, unfiltered.
[token.lemma_ for token in ent]

['the', 'middle', 'of', '2011']

In [20]:
def lemmata(spacyobject, filtered=True):
    """Return the lemmas of the tokens in ``spacyobject``.

    Parameters
    ----------
    spacyobject : iterable of tokens
        Any iterable whose items expose ``lemma_``, ``is_stop`` and
        ``is_alpha`` (in this notebook: a doc, a sentence span or an
        entity span).
    filtered : bool, default True
        If truthy, drop stop words and every token that is not purely
        alphabetic (numbers, punctuation, mixed tokens). If falsy, return
        the lemma of every token.

    Returns
    -------
    list of str
        Lemmas in token order, exactly as stored on the tokens; no case
        normalisation is applied.
    """
    if filtered:
        # Plain boolean logic instead of bitwise `&` on `== True/False`
        # comparisons: same result for boolean flags, and short-circuits.
        return [t.lemma_ for t in spacyobject if not t.is_stop and t.is_alpha]
    return [t.lemma_ for t in spacyobject]

In [24]:
# Filtered lemmas (default filtered=True) of the example sentence.
lemmata(sent)

['middle',
 'Turkey',
 'Thailand',
 'hold',
 'national',
 'turkish',
 'premier',
 'Recep',
 'Tayyip',
 'claim',
 'popular',
 'mandate',
 'self',
 'exile',
 'Thai',
 'doppelgänger',
 'Thaksin',
 'Shinawatra',
 'party',
 'win',
 'general',
 'election',
 'see',
 'young',
 'sister',
 'stand',
 'Yingluck',
 'Shinawatra',
 'assume',
 'premiership']

In [25]:
# Entities of the example sentence as a plain list.
list(sent.ents)

[the middle of 2011,
 Turkey,
 Thailand,
 Turkish,
 Recep Tayyip Erdo¢gan,
 third,
 Thai,
 Thaksin Shinawatra,
 five,
 2001,
 Yingluck Shinawatra]

In [26]:
# For each entity: [start offset, end offset, unfiltered lemmas joined by "_"].
[
    [entity.start, entity.end, "_".join(lemmata(entity, filtered=False))]
    for entity in sent.ents
]

[[75, 79, 'the_middle_of_2011'],
 [81, 82, 'Turkey'],
 [83, 84, 'Thailand'],
 [88, 89, 'turkish'],
 [90, 93, 'Recep_Tayyip_Erdo¢gan'],
 [95, 96, 'third'],
 [103, 104, 'Thai'],
 [105, 107, 'Thaksin_Shinawatra'],
 [113, 114, 'five'],
 [117, 118, '2001'],
 [127, 129, 'Yingluck_Shinawatra']]

In [27]:
# One list of filtered lemmas per sentence of the document.
lemmatized_sents = list(map(lemmata, doc.sents))

In [28]:
# Dump every lemmatized sentence of the document (long output).
print(lemmatized_sents)

[['Duncan', 'McCargo', 'professor', 'southeast', 'asian', 'politic', 'University', 'Leeds'], ['Tearing', 'apart', 'land', 'Islam', 'Legitimacy', 'Southern', 'Thailand', 'win', 'inaugural', 'Bernard', 'Schwartz', 'Book', 'Prize', 'Asia', 'Society'], ['Ayºe', 'Zarakol', 'assistant', 'professor', 'politic', 'Washington', 'Lee', 'University', 'author', 'Defeat', 'East', 'Learned', 'live', 'West'], ['middle', 'Turkey', 'Thailand', 'hold', 'national', 'turkish', 'premier', 'Recep', 'Tayyip', 'claim', 'popular', 'mandate', 'self', 'exile', 'Thai', 'doppelgänger', 'Thaksin', 'Shinawatra', 'party', 'win', 'general', 'election', 'see', 'young', 'sister', 'stand', 'Yingluck', 'Shinawatra', 'assume', 'premiership'], ['virtually', 'observer', 'notice', 'synchronicity', 'politician', 'similar', 'governance', 'style', 'party', 'organization', 'societal', 'basis', 'best', 'military', 'bureaucratic', 'establishment', 'flank', 'Asia'], ['glance', 'country', 'appear', 'strikingly', 'different'], ['Turkey

In [30]:
# Print every lemmatized sentence that contains the lemma "populist".
# The loop variable is deliberately not named `sent`: a top-level loop leaks
# its variable into the notebook namespace and would replace the spaCy
# sentence `sent` used by earlier cells with a plain list of strings.
for sent_lemmas in lemmatized_sents:
    if "populist" in sent_lemmas:
        print(sent_lemmas)

['stereotype', 'AKP', 'voter', 'prevail', 'old', 'line', 'turkish', 'establishment', 'portray', 'call', 'belly', 'scratcher', 'barely', 'educate', 'provincial', 'know', 'traditional', 'insincere', 'religious', 'value', 'minor', 'business', 'acuman', 'put', 'self', 'interest', 'need', 'reality', 'AKP', 'like', 'Thaksin', 'party', 'draw', 'support', 'number', 'social', 'group', 'mean', 'message', 'mix', 'claim', 'business', 'friendly', 'managerial', 'acuman', 'populist', 'promise', 'upward', 'mobility', 'new', 'entrepreneur', 'hinterland', 'urban', 'villager', 'city']
['go', 'road', 'ahead', 'necessarily', 'promise', 'lead', 'gradual', 'consolidation', 'liberal', 'democratic', 'norm', 'possibility', 'detour', 'populist', 'authoritarianism', 'loom']


In [40]:
# Create the output directory for the lemmatized articles. exist_ok=True makes
# the cell safe to re-run (a bare `mkdir` errors once the directory exists).
os.makedirs("../data/large_data/articles_lemmata", exist_ok=True)

In [None]:
def lemmata_ents(spacyobject):
    """Return the filtered lemmas of ``spacyobject``.

    Currently a plain pass-through to ``lemmata`` with its default
    filtering. NOTE(review): the name suggests entity-aware handling is
    planned here; confirm with the author.
    """
    lemmas = lemmata(spacyobject)
    return lemmas

In [None]:
# Lemmatize every pickled spaCy doc: one space-joined string of filtered
# lemmas per sentence.
sourcepath = "../data/large_data/articles_spacydocs/"
for filename in filenames:
    # Build the path from the current loop item; indexing filenames[0] here
    # would reload the first document on every iteration.
    filepath = sourcepath + filename
    with open(filepath, "rb") as f:
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # pickles produced by a trusted pipeline.
        doc = pickle.load(f)
    lemmatized_sents = [" ".join(lemmata_ents(sent)) for sent in doc.sents]
    # TODO: lemmatized_sents is overwritten on each pass and not yet written
    # anywhere; persist it per file before moving on.


