In [1]:
import pandas as pd
from sys import maxunicode
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import nltk
import os
import numpy as np

In [2]:
path_to_data = '/home/dafne/shared/FilterBubble/topic-modeling/Felicia-Archive/'

np_data = pd.read_csv(os.path.join(path_to_data, 'np_workfile.csv'))

In [3]:
np_data.columns

Index(['Hits', 'Score', 'ScorePercent', 'Filename', 'V3', 'Size', 'WordCt',
       'Title', 'headline', 'length', 'section', 'krant', 'month', 'year',
       'day', 'date', 'yq', 'ym', 'yw', 'file', 'v1', 'V8', 'V9', 'V10_1',
       'V10_2', 'V10_3', 'V10_4', 'V10_5', 'V11', 'V12', 'V13', 'V14', 'V16_1',
       'V16_2', 'V16_3', 'V16_4', 'V16_5', 'V17', 'V18', 'V19', 'V22', 'V23',
       'V24', 'V25A', 'V25B', 'V25C', 'V26', 'V27', 'V28', 'V30', 'V31', 'V32',
       'V34a', 'V34b', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42',
       'V43', 'V44', 'filename2', 'v9_major', 'dubbel', 'in'],
      dtype='object')

In [4]:
def extract_text(filename):
    with open(filename) as fi:
        text = fi.read().splitlines()
        text = list(filter(None,text))
        beginning = [i for i, s in enumerate(text) if 'LENGTH' in s or 'DATELINE' in s or 'LENGTE' in s]
        end = [i for i, s in enumerate(text) if 'LOAD-DATE' in s or 'LANGUAGE' in s]
        text = text[beginning[-1]+1:end[0]]
        text = ''.join(text)
    return text

In [5]:
tbl = dict.fromkeys(i for i in range(maxunicode) if unicodedata.category(chr(i)).startswith('P'))
stopwords_list = set(stopwords.words('dutch'))
stemmer=SnowballStemmer('dutch')

def preprocess_pipeline(text):
    # Remove interpunction
    text = text.replace(u"`",u"").replace(u"´",u"").translate(tbl)
    
    # Tokenize
    tokenized = word_tokenize(text)
    
    # Stem and remove stopwords
    stemmed = [stemmer.stem(w.lower()) for w in tokenized 
               if w.lower() not in stopwords_list and not (w.isalpha() and len(w)==1)]
    
    # Create bigrams - can be part of sklearn pipeline
#     text_bigrams = ["_".join(tup) for tup in nltk.ngrams(stemmed,2)]
#     text_final = stemmed + text_bigrams
#     text_final = ' '.join(text_final)
    text_final = ' '.join(stemmed)
    return text_final

In [6]:
# Clean datafile
df = np_data[np_data.v9_major != " "]
df = df[['V3', 'v9_major', 'date']]
df.columns = ['ID', 'topic', 'date']

In [7]:
df['text'] = [extract_text(os.path.join(path_to_data, 'articles', n)) for n in df['ID']]

In [8]:
df['text_prep'] = df.text.apply(preprocess_pipeline)

In [9]:
# These are taken from the pdf
topic_names = {
            1:'Macroeconomics',
            2: 'Civil rights and minority issues',
            3: 'Health',
            4: 'Agriculture',
            5: 'Labor and employment',
            6: 'Education',
            7: 'Environment',
            8: 'Energy',
            9: 'Immigration and integration',
            10: 'Transportation',
            12: 'Law and crime',
            13: 'Social welfare',
            14: 'Community development and housing',
            15: 'Banking, finance, and commerce',
            16: 'Defense',
            17: 'Science, technology, and communication',
            18: 'International trade',
            19: 'International affairs and foreign aid',
            20: 'Government operations',
            21: 'Spatial planning',
            23: 'Art, culture and entertainment',
            24: 'Local government',
            27: 'Weather and natural disasters',
            28: 'Fires and accidents',
            29: 'Sports and recreation',
            30: 'Obituary',
            31: 'Churches and religion',
            99: 'Other issue'
}

In [10]:
df['topic_name'] = [topic_names.get(int(i), np.nan) for i in df.topic]

In [11]:
df

Unnamed: 0,ID,topic,date,text,text_prep,topic_name
0,#100 @328419 +2073,99,1/16/1999,De andere wereld van zondagmorgen. Antropoloog...,wereld zondagmorg antropolog dr mattijs port d...,Other issue
1,#10000 @16311398 +11159,16,12/6/2008,"SAMENVATTINGDE SPEURTOCHT VAN EFRAIM ZUROFF, n...",samenvattingd speurtocht efraim zuroff nazijag...,Defense
2,#10004 @16327565 +1346,16,12/5/2008,Vol verwachting kloppen de hartjes van onze st...,vol verwacht klopp hartjes onz stoer mann oero...,Defense
3,#10004 @34719038 +2546,5,5/1/2001,Wie als nieuwkomer bij een bedrijf denkt dat h...,nieuwkomer bedrijf denkt flink salaris gestege...,Labor and employment
4,#10005 @37431676 +6256,3,5/7/2001,In ziekenhuizen in Hengelo en Leeuwarden wordt...,ziekenhuiz hengelo leeuward vandag gestaaktw w...,Health
...,...,...,...,...,...,...
12554,#9990 @37381637 +2880,1,5/11/2001,De consumentenprijzen in Nederland zijn in apr...,consumentenprijz nederland april jar gesteg 49...,Macroeconomics
12555,#9993 @16299444 +2081,29,12/7/2008,Ottman Bakkal is na rust ingevallen en schiet ...,ottman bakkal rust ingevall schiet psv binn dr...,Sports and recreation
12556,#9993 @34694786 +976,23,5/3/2001,BERLIJN - De vrouw van Hitlers propagandaminis...,berlijn vrouw hitler propagandaminister joseph...,"Art, culture and entertainment"
12557,#9996 @37403379 +2942,1,5/9/2001,De Rabobank verzet zich als enige grote bank i...,rabobank verzet enig grot bank nederland druk ...,Macroeconomics


In [12]:
# Remove data with empty text
df = df[df.text!='']

In [13]:
# Remove data with rare topics
df = df[~df.topic_name.isna()]

In [14]:
# Create train - val -test sets
from sklearn.model_selection import train_test_split
train, test = train_test_split(df.index, test_size=0.2, stratify=df['topic_name'], random_state=0)
df['sample'] = ''
df.loc[train, 'sample'] = 'train'
df.loc[test,'sample'] = 'test'

In [15]:
df.to_csv(os.path.join(path_to_data, 'preprocessed.csv'), index=False)