In [2]:
import simplejson as json
import time
import smart_open
import re
import sys
import os

In [None]:
#!python -m gensim.scripts.segment_wiki -i -f enwiki-20200101-pages-articles-multistream1.xml-p10p30302.bz2 -o enwiki-latest.json.gz

In [3]:
# Regular expressions used to strip MediaWiki markup.
# Every pattern is a raw string: sequences such as \[ , \w or \* are regex escapes, not
# Python string escapes, and writing them in ordinary string literals triggers
# "invalid escape sequence" warnings on current Python versions.
# In raw form "\n" is interpreted by the regex engine and still matches a newline.
RE_P0 = re.compile(r'<!--.*?-->', re.DOTALL | re.UNICODE)  # comments
RE_P1 = re.compile(r'<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE)  # footnotes
RE_P2 = re.compile(r"(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$", re.UNICODE)  # links to languages
RE_P3 = re.compile(r"{{([^}{]*)}}", re.DOTALL | re.UNICODE)  # template without nested braces
RE_P4 = re.compile(r"{{([^}]*)}}", re.DOTALL | re.UNICODE)  # template (body may contain '{')
RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE)  # remove URL, keep description
RE_P6 = re.compile(r"\[([^][]*)\|([^][]*)\]", re.DOTALL | re.UNICODE)  # simplify links, keep description
RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)  # keep description of images
RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)  # keep description of files
RE_P9 = re.compile(r'<nowiki([> ].*?)(</nowiki>|/>)', re.DOTALL | re.UNICODE)  # <nowiki> tags and their content
RE_P10 = re.compile(r'<math([> ].*?)(</math>|/>)', re.DOTALL | re.UNICODE)  # math content
RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)  # all other tags
RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE)  # table formatting
RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE)  # table cell formatting
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)  # categories
# Remove File and Image template
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
# Runs of '=' followed by runs of '*' (presumably heading and list-bullet markers)
RE_P16 = re.compile(r'=*\**', re.UNICODE)

def remove_markup(text):
    """Strip wiki markup from ``text`` and return what is left as plain text.

    Trailing language links (RE_P2) are removed once. The remaining rules are then
    applied repeatedly, because removing one construct can expose another; the loop
    stops as soon as a pass changes nothing, and after three passes at the latest.
    Finally every leftover square bracket is dropped.
    """
    # Rules applied first on every pass, in this exact order: (pattern, replacement).
    inline_rules = (
        (RE_P0, ""),       # comments
        (RE_P1, ""),       # footnotes
        (RE_P9, ""),       # <nowiki> sections
        (RE_P10, ""),      # math content
        (RE_P11, ""),      # any tag still present
        (RE_P14, ""),      # categories
        (RE_P5, "\\3"),    # URLs: keep only the description
        (RE_P6, "\\2"),    # piped links: keep only the description
    )
    # Rules applied after '||' has been split so each table cell sits on its own line.
    table_rules = (
        (RE_P12, "\n"),      # table formatting lines
        (RE_P13, "\n\\3"),   # keep only the cell content
        (RE_P16, ""),        # drop '=' and '*' runs
    )

    cleaned = RE_P2.sub("", text)
    for _ in range(3):
        before = cleaned
        for pattern, replacement in inline_rules:
            cleaned = pattern.sub(replacement, cleaned)
        cleaned = cleaned.replace('||', '\n|')
        for pattern, replacement in table_rules:
            cleaned = pattern.sub(replacement, cleaned)
        cleaned = cleaned.replace('[]', '')  # empty mark-up left behind by the rules above
        if cleaned == before:
            break

    # Dropping the remaining brackets lets a tokenizer read '[[socialist]]s' as 'socialists'.
    return cleaned.replace('[', '').replace(']', '')

In [4]:
def save_article(title, text,  directory="./wikipedia/"):
    """Write one article to ``<directory>/<title>.txt`` unless that file already exists.

    Parameters
    ----------
    title : str
        Article title. Spaces and path separators are replaced by ``_`` to build the
        file name, so a title such as "AC/DC" is stored as ``AC_DC.txt`` instead of
        pointing into a sub-directory that does not exist.
    text : str
        Article body, written as UTF-8.
    directory : str
        Target directory; created if missing.

    Returns
    -------
    None

    Notes
    -----
    Distinct titles can map to the same file name (e.g. "AC/DC" and "AC DC"); the
    second one is then reported as already present and is not written.
    """
    try:
        # exist_ok avoids the check-then-create race of testing os.path.exists first.
        os.makedirs(directory, exist_ok=True)
    except OSError as e:
        # Report and carry on; the write below will raise if the directory is unusable.
        print(e)

    filename = title.replace(" ", "_")
    for separator in ("/", os.sep, os.altsep):
        if separator:  # os.altsep is None on POSIX
            filename = filename.replace(separator, "_")
    path = os.path.join(directory, filename + '.txt')

    if os.path.exists(path):
        print('article already present')
    else:
        # Explicit encoding: article text is not limited to the platform's default code page.
        with open(path, 'w', encoding='utf-8') as fp:
            fp.write(text)
    return None

In [5]:
# Driver: read the JSON-lines dump, strip the markup of every article and save it to disk.
if __name__== "__main__":
    filename="./enwiki-latest.json.gz"
    start_time=time.time()
    count=0  # number of input lines read so far (one article per line)
    try:
        fin = smart_open.open(filename)
    except FileNotFoundError:
        # Only the open call is guarded here, so this message really means the input is missing.
        print("Wrong filename or file path")
    except Exception as e:
        print("Could not open %s: %r" % (filename, e))
    else:
        # The context manager closes the (possibly compressed) stream even if processing fails.
        with fin:
            try:
                for line in fin:
                    count+=1
                    article = json.loads(line)
                    title = article['title']
                    text = remove_markup(" ".join(article['section_texts']))
                    save_article(title, text)
            except Exception as e:
                # Still best-effort (no crash), but the failure is reported instead of being hidden.
                print("Stopped at article %d: %r" % (count, e))
    print("Total articles: %d" %count)
    print("total time(in min): %f"  %((time.time()-start_time)/60))

Wrong filename or file path
Total articles: 0
total time(in min): 0.000004
