In [2]:
import re
import pandas as pd
import pickle
import os

In [3]:
# Load the article metadata table and preview its first rows
# (DataFrame.head() shows five rows by default).
articles_metadata = pd.read_json("../data/article_metadata_with_filepaths.json")
articles_metadata.head()

Unnamed: 0,year,authors,journal,title,type,pdf?,keywords,keywords_online?,notes,filepath,pop_id
0,2011,Bos et al.,Political Communication,How the Media Shape Perceptions of Right-Wing ...,article,y,y,y,,[Political Communication/2011 - Bos et al. - H...,0
1,2019,Blassnig et al.,Political Communication,Hitting a Nerve: Populist News Articles Lead t...,article,y,y,y,,[Political Communication/2019 - Blassing et al...,1
2,2017,Caramani,American Political Science Review,Will vs. Reason: The Populist and Technocratic...,article,y,y,,,[American Political Science Review/2017 - Cara...,2
3,2020,"Wuttke,Schimpf, Schoen",American Political Science Review,When the Whole Is Greater than the Sum of Its ...,article,y,y,,,[American Political Science Review/2020 - Wutt...,3
4,2014,Treib,Journal of European Public Policy,"The voter says no, but nobody listens: causes ...",article,y,y,,,[Journal of European Public Policy/2014 - Trei...,4


In [4]:
# Keep only the rows whose "filepath" entry is truthy (e.g. a non-empty list);
# .copy() detaches the result from the original frame.
articles_metadata = articles_metadata.loc[articles_metadata["filepath"].apply(bool)].copy()

In [5]:
# Draw a reproducible random sample of ten articles (fixed random_state) and display it.
articles_sample = articles_metadata.sample(n=10, random_state=1)
articles_sample

Unnamed: 0,year,authors,journal,title,type,pdf?,keywords,keywords_online?,notes,filepath,pop_id
62,2019,Chamorel,Journal of Democracy,Macron versus the Yellow Vests,article,y,y,y,,[Journal of Democracy/2019 - Chamorel - Macron...,62
377,2019,Alberti,Comparative Politics,Populist Multiculturalism in the Andes: Balanc...,article,y,y,y,,[Comparative Politics/2019 - Alberti - Populis...,377
188,2019,"Brunkert, Kruse, Welzel",Democratization,A tale of culture-bound regime evolution: the ...,article,y,y,,,"[Democratization/2019 - Brunkert, Kruse, Welze...",188
364,2017,"Hawkins, Rovira Kaltwasser",Swiss Political Science Review,What the (Ideational) Study of Populism Can Te...,article,y,y,,,[Swiss Political Science Review/2017 - Hawkins...,364
499,2016,Read,Historical Materialism Research in Critical Ma...,Under Pressure,article,y,y,,,[Historical Materialism Research in Critical M...,499
228,2020,"Bolton, Pitts",British Politics,Liberalism and critical Marxism: a reply to Gl...,article,y,y,,,"[British Politics/2020 - Bolton, Pitts - Liber...",228
376,2018,Fish,Comparative Politics,What Has Russia Become?,article,y,y,y,,[Comparative Politics/2018 - Fish - What Has R...,376
90,2018,Rooduijn,European Journal of Political Research,State of the field: How to study populism and ...,article,y,y,,,[European Journal of Political Research/2018 -...,90
227,2016,Aughey et al.,British Politics,Symposium On Michael Kenny´s The Politics of e...,article,,y,y,,[British Politics/2016 - Aughey et al. - Sympo...,227
330,2015,Leconte,International Political Science Review,From pathology to mainstream phenomenon: Revie...,article,y,y,,,[International Political Science Review/2015 -...,330


In [6]:
# pop_id values of the ten sampled articles, in the order of the sample table.
sample_pop_ids = [
    62, 377, 188, 364, 499,
    228, 376, 90, 227, 330,
]

In [5]:
# Create the folder for the filtered texts. os.makedirs with exist_ok=True is
# safe to re-run (no error if the folder already exists), also creates missing
# parent folders, and does not depend on a shell being available.
os.makedirs("../data/large_data/articles_filteredtexts", exist_ok=True)

In [44]:
# Output directory for the filtered texts; get_filteredtext reads this global
# when it is called with tofile=True.
path = "../data/large_data/articles_filteredtexts/"

In [45]:
# Sample sentence with a footnote number fused to the punctuation
# ("elections,124"), used to try out the footnote-stripping regex below.
filteredtext = "In the middle of 2011, both Turkey and Thailand held national elections,124 While Turkish premier Recep Tayyip Erdo¢gan claimed his third popular mandate, his self-exiled Thai doppelgänger Thaksin Shinawatra—whose parties have now won five general elections since 2001—saw his younger sister and stand-in Yingluck Shinawatra assume the premiership."

In [46]:
# Strip footnote numbers fused to punctuation (e.g. "elections,124" -> "elections,").
# Raw string: avoids invalid-escape warnings for the regex. The lookbehind
# (?<!\d) leaves decimals and thousands separators such as "0.05" or "1,000"
# untouched; the trade-off is that a footnote directly after a number
# (e.g. "2011.12") is not removed.
re.sub(r"(?<!\d)([.:?,])\d+", r"\1", filteredtext)

'In the middle of 2011, both Turkey and Thailand held national elections, While Turkish premier Recep Tayyip Erdo¢gan claimed his third popular mandate, his self-exiled Thai doppelgänger Thaksin Shinawatra—whose parties have now won five general elections since 2001—saw his younger sister and stand-in Yingluck Shinawatra assume the premiership.'

In [47]:
# Section headings (followed by a line break) that mark the start of the back
# matter. text_cleaning cuts the text at the earliest such heading found in the
# second half of the text; upper-case variants are matched as well.
spliters = ["References\n",
            "Bibliography\n",
            "Notes\n",
            "Acknowledgments\n"]


def text_cleaning(filteredtext):
    """Remove back matter, line-break hyphens and footnote numbers from a text.

    Steps, in order:

    1. If one of the ``spliters`` headings (as given or upper-cased) occurs
       after the midpoint of the text, everything from the earliest such
       occurrence onwards is dropped. Occurrences in the first half are
       ignored, and no longer hide a later occurrence of the same heading.
    2. A hyphen immediately followed by a line break is removed, re-joining
       words that were split across lines.
    3. Digits directly following ``.``, ``:``, ``?`` or ``,`` are removed
       (footnote markers such as ``elections,124``). Punctuation that is itself
       preceded by a digit is left alone so that decimals and thousands
       separators (``0.05``, ``1,000``) survive; as a consequence a footnote
       marker placed right after a number (``2011.12``) is kept.

    Parameters
    ----------
    filteredtext : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    markers = spliters + [spliter.upper() for spliter in spliters]
    # A cut is only accepted when more than half of the text is kept, i.e. the
    # heading starts at an index strictly greater than len / 2.
    search_from = len(filteredtext) // 2 + 1
    cut_positions = [
        position
        for position in (filteredtext.find(marker, search_from) for marker in markers)
        if position != -1
    ]
    if cut_positions:
        filteredtext = filteredtext[:min(cut_positions)]
    filteredtext = filteredtext.replace("-\n", "")
    filteredtext = re.sub(r"(?<!\d)([.:?,])\d+", r"\1", filteredtext)
    return filteredtext

In [50]:
def _trim_short_edges(texts, min_len):
    """Drop leading and trailing entries shorter than min_len; keep all entries in between."""
    long_positions = [i for i, text in enumerate(texts) if len(text) >= min_len]
    if not long_positions:
        return []
    return texts[long_positions[0]:long_positions[-1] + 1]


def get_filteredtext(pop_id, pct=5, tofile=True):
    """Build the cleaned running text of one article from its pickled text blocks.

    The pickle ``textblocks_pop_id_<pop_id>.pickle`` is expected to hold a
    list of pages, each page being a sequence of text blocks whose element
    ``[4]`` is the block's text (presumably PyMuPDF "blocks" tuples -- TODO
    confirm against the notebook that writes these pickles).

    Per page, short blocks at the top and at the bottom are discarded: a block
    counts as short when it has fewer characters than ``pct`` percent of the
    average number of characters per page. Blocks between the first and the
    last long block are all kept, except those whose text starts with ``<``
    (presumably non-text placeholders -- TODO confirm). Processing stops after
    the page that contains a block exactly equal to one of the ``spliters``
    headings. The kept blocks of a page are joined with single spaces, pages
    are concatenated directly, and the result is passed through
    ``text_cleaning``.

    Parameters
    ----------
    pop_id : int or str
        Article identifier used to build the input and output file names.
    pct : int or float, default 5
        Length threshold, in percent of the average characters per page.
    tofile : bool, default True
        If true, write the text to ``path + "filteredtext_pop_id_<pop_id>.txt"``
        (``path`` is a notebook-level global) and return ``None``; otherwise
        return the text.

    Returns
    -------
    str or None
        The cleaned text when ``tofile`` is false, otherwise ``None``.
    """
    filename = "textblocks_pop_id_{}.pickle".format(str(pop_id))
    # NOTE: pickle.load can execute arbitrary code -- only load pickles that
    # were produced by this project.
    with open("../data/large_data/articles_textblocks/" + filename, "rb") as f:
        pages_back = pickle.load(f)

    total_chars = sum(len(tb[4]) for p in pages_back for tb in p)
    # Guard against a document without pages (would divide by zero otherwise).
    rawtext_per_page = total_chars / len(pages_back) if pages_back else 0
    min_len = rawtext_per_page * pct / 100

    markers = spliters + [spliter.upper() for spliter in spliters]
    page_texts = []
    for p in pages_back:
        candidates = []
        end_here = False
        for tb in p:
            tb_text = tb[4]
            candidates.append(tb_text)
            if tb_text in markers:
                # Back-matter heading reached: ignore the rest of the document.
                end_here = True
                break
        kept = _trim_short_edges(candidates, min_len)
        # startswith (rather than indexing [0]) also tolerates empty blocks.
        page_texts.append(" ".join(text for text in kept if not text.startswith("<")))
        if end_here:
            break

    filteredtext = text_cleaning("".join(page_texts))
    if tofile:
        newfile = "filteredtext_pop_id_{}.txt".format(str(pop_id))
        # Explicit encoding: the texts contain non-ASCII characters and the
        # platform default encoding is not UTF-8 everywhere.
        with open(path + newfile, "w", encoding="utf-8") as f:
            f.write(filteredtext)
    else:
        return filteredtext

In [51]:
# Spot-check on a single article: return the cleaned text instead of writing it to disk.
get_filteredtext(pop_id=28, tofile=False)

'Duncan McCargo is professor of Southeast Asian politics at the University of Leeds. His Tearing Apart the Land: Islam and Legitimacy in \nSouthern Thailand (2008) won the inaugural 2009 Bernard Schwartz \nBook Prize from the Asia Society. Ayºe Zarakol is assistant professor \nof politics at Washington and Lee University and the author of After \nDefeat: How the East Learned to Live with the West (2011).\n In the middle of 2011, both Turkey and Thailand held national elections. While Turkish premier Recep Tayyip Erdo¢gan claimed his third \npopular mandate, his self-exiled Thai doppelgänger Thaksin Shinawatra—whose parties have now won five general elections since 2001—\nsaw his younger sister and stand-in Yingluck Shinawatra assume the \npremiership. Virtually no observers, however, noticed the synchronicity \nof two politicians with similar governance styles, party organizations, \nand societal bases besting military-bureaucratic establishments on the \ntwo flanks of Asia. \nAt first

In [53]:
# Write the filtered text of every article in sample_pop_ids to its own file
# in the sample folder. The folder is created here if needed (safe to re-run).
sample_path = "../data/large_data/sample_filteredtext/"
os.makedirs(sample_path, exist_ok=True)
for pop_id in sample_pop_ids:  # not named `id`, which would shadow the builtin
    filteredtext = get_filteredtext(pop_id, tofile=False)
    newfile = "filteredtext_pop_id_{}.txt".format(str(pop_id))
    # Explicit encoding: the texts contain non-ASCII characters.
    with open(sample_path + newfile, "w", encoding="utf-8") as f:
        f.write(filteredtext)

In [43]:
# Run the filter over every article left in articles_metadata; with the default
# tofile=True, get_filteredtext writes one .txt file per article into `path`.
path = "../data/large_data/articles_filteredtexts/"
for pop_id in articles_metadata["pop_id"]:  # not named `id`, which would shadow the builtin
    get_filteredtext(pop_id)