In [1]:
import pandas as pd
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
from tqdm import tqdm
import string
from text_cleaning_functions import *
import pickle 
import time 

In [2]:
word_counter = defaultdict(lambda: 0, {})
unicode_dict = get_unicode_dict()
stoplist = stopwords.words('english') + list(string.punctuation)

In [3]:
def preprocess(text):
    return [word.lower() for word in word_tokenize(text) if word.lower() not in stoplist and not word.isdigit()]

def update_word_counter(row):
    words = preprocess(row)
    for word in words:
        word_counter[word] += 1
        
def replace_accents_fixed(row, unicode_dict):
    if pd.isna(row):
        return ""
    else:
        return replace_accents(row, unicode_dict)

In [60]:
i = 0
for chunk in pd.read_csv('../data/raw/enwiki_20190801.k_plaintext.csv', usecols = ['section_text'], chunksize=10 ** 6):
    start = time.time()
    chunk['section_text_cleaned'] = chunk['section_text'].apply(lambda i: replace_accents_fixed(i, unicode_dict))
    print("done replacing accents")
    chunk['section_text_cleaned'].apply(update_word_counter)
    with open("../data/word2idx/{}.pkl".format(i), "wb") as f:
        pickle.dump(dict(word_counter), f)    
    end = time.time()
    print(end - start)
    i += 1

done replacing accents
1106.7389302253723
done replacing accents
1105.7295877933502
done replacing accents
1102.3962733745575
done replacing accents
1112.406487941742
done replacing accents
1107.9391508102417
done replacing accents
1103.346455335617
done replacing accents
1110.977682352066
done replacing accents
1116.8484773635864
done replacing accents
1117.633111000061
done replacing accents
1112.000411272049
done replacing accents
1106.4941940307617
done replacing accents
1116.5472385883331
done replacing accents
1113.4465773105621
done replacing accents
1122.1570847034454
done replacing accents
1115.303544998169
done replacing accents
1110.44864320755
done replacing accents
1108.845039844513
done replacing accents
1114.3676717281342
done replacing accents
1115.32794713974
done replacing accents
1120.809470653534
done replacing accents
1106.1233072280884
done replacing accents
533.6614880561829


In [61]:
len(word_counter)

17252450

In [63]:
word_counter['obama']

55868

In [6]:
with open("../data/word2idx/{}.pkl".format(20), "rb") as f:
    word_counter = pickle.load(f)    

In [7]:
len(word_counter)

17010099

In [8]:
word_counter_counter = Counter(word_counter)

In [15]:
vocab = word_counter_counter.most_common(500000)
vocab = set([s[0].translate(str.maketrans('', '', string.punctuation)) for s in vocab 
             if len(s[0].translate(str.maketrans('', '', string.punctuation))) != 0])
idx = range(2, len(vocab)+2)
len(vocab)

445879

In [16]:
word2idx = dict(zip(vocab, idx))
word2idx['PAD'] = 0
word2idx['UNKNOWN'] = 1

In [17]:
len(word2idx)

445881

In [19]:
with open("../data/word2idx/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)    