In [2]:
import re
import pandas as pd
import numpy as np

In [3]:
# Load the article dataset (a CSV file stored without a file extension) into a
# DataFrame; the cells below read its `body` and `link` columns.
data = pd.read_csv('../Data/data_Ackerman_LaJornada')

In [51]:
def html_cleaner(text):
    r"""Replace or strip generic HTML markup from ``text``.

    Tags are rewritten into the lightweight markers used by this notebook:

    * ``<strong>`` / ``</strong>`` become ``<b>`` / ``<\b>``. A ``<b>``
      followed by one whitespace character, any three characters and ``>``
      (in practice the empty pair ``<b> <\b>``) is removed.
    * ``<em ...>`` becomes ``<i>`` and ``</em>`` becomes ``<\i>`` plus a
      trailing space; a doubled ``</em></em>`` is treated as a single one.
    * ``<a ...>`` and ``</a>`` tags are each replaced by one space, keeping
      the link text. Only real anchor tags are matched, so ``<abbr>`` or
      ``<article>`` are left alone, and attributes may span several lines.
    * Non-breaking spaces (U+00A0) become regular spaces.
    * ``<br/>`` becomes ``<``, a newline, ``>`` and a space. Two such markers
      separated by one whitespace character are merged into one marker that
      holds two newlines.

    Args:
        text (str): HTML fragment to clean.

    Returns:
        str: The cleaned text with the markers described above.
    """

    # Replace bold
    text = text.replace('<strong>', '<b>').replace('</strong>', '<\\b>')
    # Raw string: '\s', '\w', '\W' are regex escapes, not string escapes.
    text = re.sub(r'<b>\s[\w\W]{3}>', '', text)

    # Replace italics
    text = text.replace('</em></em>', '</em>')
    text = re.sub(r'<em[\s\w\d":/{}=;\.!-]*>', '<i>', text)
    text = text.replace('</em>', '<\\i> ')

    # Remove hyperlinks. '\b' restricts the match to the tag name "a" itself
    # and '[^>]*' (unlike '.') also crosses newlines inside the tag.
    text = re.sub(r'</?a\b[^>]*>', ' ', text)

    # Replace html spaces
    text = text.replace('\xa0', ' ')

    # Replace line breaks. The replacement strings are deliberately non-raw:
    # they contain real newline characters.
    text = text.replace('<br/>', '<\n> ')
    text = re.sub(r'<\n>\s<\n>\s', '<\n\n> ', text)

    return text

In [52]:
def ackerman_LaJornada_cleaner(text):
    r"""Clean the HTML body of one of John Ackerman's articles.

    Steps, in order:

    1. Run ``html_cleaner`` (bold, italics, links, non-breaking spaces and
       ``<br/>`` handling).
    2. Strip a leading ``[`` and a trailing ``]``.
    3. Remove opening ``<p ...>`` tags and opening/closing ``<span ...>`` tags.
    4. Collapse runs of two or more whitespace characters into one space,
       leaving the two-newline marker produced by ``html_cleaner`` intact,
       and normalise `` , `` to ``, ``.
    5. Turn every ``</p>`` (with an optional following comma and whitespace
       character) into the paragraph marker: ``<``, two newlines, ``>`` and a
       space.
    6. Strip leading whitespace, HTML comments and trailing markers.

    Args:
        text (str): Raw article body.

    Returns:
        str: The cleaned article text.
    """

    text = html_cleaner(text)

    # Remove brackets
    text = re.sub(r'^\[', '', text)
    text = re.sub(r'\]$', '', text)

    # Remove paragraph format.
    # NOTE(review): inside these classes '"-:' and '!--' are character
    # *ranges* (from '"' to ':' and from '!' to '-'), not three literals —
    # confirm this is intended before tightening them.
    text = re.sub(r'<p[\s\w="-:;]*?>', '', text)
    text = re.sub(r'</?span[!--\s\w:/{}=;\.]*>', '', text)

    # Remove multiple spaces. The first alternative matches the two-newline
    # marker and puts it back unchanged; without it the marker's newlines
    # would be collapsed and a stray '< >' would be left in the text.
    text = re.sub(r'(<\n\n>)|\s{2,}', lambda m: m.group(1) or ' ', text)
    text = re.sub(r'\s,\s', ', ', text)

    # Mark new paragraph ('\s?' instead of '[\s+]?', which also consumed a
    # literal '+' following the closing tag)
    text = re.sub(r'</p>,?\s?', '<\n\n> ', text)

    # Remove new lines or spaces at beginning of text
    text = re.sub(r'^\s+', '', text)

    # Remove HTML comments. Non-greedy so that each comment is removed on its
    # own instead of everything between the first '<!--' and the last '-->'.
    text = re.sub(r'<!--[\w\W]*?-->', '', text)

    # Remove new lines at end of text: a trailing paragraph marker optionally
    # followed by '*', then trailing runs of markers, then a trailing '<br/>'.
    text = re.sub(r'<\n\n>\s+\*?\s?$', '', text)
    text = re.sub(r'<\n\n>\s+(<\n\n>)?\s?(<\n>)?\s?$', '', text)
    text = re.sub(r'<br/>\s*$', '', text)

    return text

# Testing

In [57]:
# Leftover markup that should not survive cleaning. '<\\p>' and '<b> <\\b>'
# are written with escaped backslashes; their values are the literal texts
# '<\p>' and '<b> <\b>'.
# NOTE(review): '<\p>' may have been meant as '</p>' — confirm.
patterns = ['<p>', '<\\p>', '<!--', '<br/>', '\r\n', '<strong>', '<span', '<b> <\\b>']

# Clean every article once up front instead of once per pattern.
cleaned_bodies = [ackerman_LaJornada_cleaner(data.body[i]) for i in range(len(data.body))]

# Report every (article, pattern) pair where the pattern is still present.
for pattern in patterns:
    for i, cleaned_body in enumerate(cleaned_bodies):
        if pattern in cleaned_body:
            print(f'Article num: {i} | Pattern: {pattern}')

In [58]:
# Scratch search: list every occurrence of the placeholder text, with 5
# characters of context before it and 15 after, in each cleaned article.
context_regex = re.compile(r'[\w\W]{5}PATTERN[\w\W]{15}')
for i in range(len(data.body)):
    cleaned_body = ackerman_LaJornada_cleaner(data.body[i])
    x = context_regex.findall(cleaned_body)
    if len(x) > 0:
        print(f'{i}: {x}')

In [None]:
# Spot-check a single article: print its link, then show its cleaned body
# (the bare last expression is what the cell displays).
number = 4  # row of `data` to inspect
print(data.link[number])
ackerman_LaJornada_cleaner(data.body[number])