In [1]:
import locale
import re
import pandas as pd
import numpy as np
from os import listdir
from datetime import datetime

# Date cleaning

In [2]:
locale.setlocale(locale.LC_ALL, 'es_ES')

'es_ES'

In [3]:
def get_files(route):
    """This function will read all the files containing opinion pieces from this repository and """
    main_df = pd.DataFrame()
    files = listdir(route)
    for file in files:
        df = pd.read_csv(route + '/' + file)
        main_df = pd.concat([main_df, df], axis = 0)
    return main_df[['author','title','date','body','source','link']]

In [4]:
opi_art = get_files('../Data/Data_raw')

In [5]:
def date_cleaning(row):
    """This function will bring all the scraped dates to the format YYYY/MM/DD"""    
    if row['source']=='Milenio':
        if len(str(row['date'])) == 16:
            day = str(row['date'])[0:2]
            month = str(row['date'])[3:5]
            year = str(row['date'])[6:10]
            return f'{year}/{month}/{day}'
        else:
            return 'Check_Milenio'
        
    elif (row['source']=='El Universal') or (row['source']=='RT') or (row['source']=='El Financiero'):
        if len(str(row['date'])) == 10:
            day = str(row['date'])[0:2]
            month = str(row['date'])[3:5]
            year = str(row['date'])[6:10]
            return f'{year}/{month}/{day}'
        else:
            return 'Check_Universal_RT'
    
    elif (row['source']=='Personal website') and (row['author']=='Denisse Dresser'):
        date = row['date'].replace(',','').replace('de ','')
        date = datetime.strptime(date, '%d %B %Y').strftime('%Y/%m/%d')
        return date

    elif (row['source']=='Personal website') and (row['author']=='Enrique Krauze'):
        date = row['date'].replace(',','').replace('de ','')
        date = datetime.strptime(date, '%d %B %Y').strftime('%Y/%m/%d')
        return date
    
    elif (row['source']=='La Jornada') and (row['author']=='John Ackerman'):
        date = row['date'].replace('º','').replace('La Jornada, ','').replace('de ','').lower()
        date = date.replace('publicada el ','').replace('.','').replace(',','')
        date = datetime.strptime(date, '%d %B %Y').strftime('%Y/%m/%d')
        return date
    
    else:    
        return np.nan

In [6]:
opi_art['date']=opi_art.apply(lambda row:date_cleaning(row), axis = 1) 
opi_art.dropna(inplace = True)
opi_art

Unnamed: 0,author,title,date,body,source,link
0,Enrique Krauze,El jurista bondadoso,2021/03/08,"[<p><!-- wp:paragraph --></p>, <p>La justicia ...",Personal website,https://enriquekrauze.com.mx/el-jurista-bondad...
1,Enrique Krauze,La hambruna recordada,2021/02/22,"[<p><!-- wp:paragraph --></p>, <p>Toda revoluc...",Personal website,https://enriquekrauze.com.mx/la-hambruna-recor...
2,Enrique Krauze,Una historia de Covid,2021/02/08,"[<p><!-- wp:paragraph --></p>, <p>Martha creci...",Personal website,https://enriquekrauze.com.mx/una-historia-de-c...
3,Enrique Krauze,El espejo de Weimar,2021/01/25,"[<p><!-- wp:paragraph --></p>, <p>Toda democra...",Personal website,https://enriquekrauze.com.mx/el-espejo-de-weimar/
4,Enrique Krauze,Contra el despotismo,2021/01/11,"[<p><!-- wp:paragraph --></p>, <p>Ningún país ...",Personal website,https://enriquekrauze.com.mx/contra-el-despoti...
...,...,...,...,...,...,...
122,John Ackerman,"¿Cómo se atreven a poner en cuestión la ""democ...",2016/10/24,En los Estados Unidos no hay elección directa ...,RT,https://actualidad.rt.com/opinion/john-ackerma...
123,John Ackerman,¿Cuáles son las intenciones reales de los zapa...,2016/10/18,¿Tiene la candidatura presidencial del Ejércit...,RT,https://actualidad.rt.com/opinion/john-ackerma...
124,John Ackerman,"Ackerman: ""Clinton sería aún peor que Obama pa...",2016/10/12,La aspirante demócrata a la Presidencia de EE....,RT,https://actualidad.rt.com/opinion/john-ackerma...
125,John Ackerman,"¿Dejaremos a México en manos de los militares,...",2016/10/04,Los vacíos en la política se llenan. Este fin ...,RT,https://actualidad.rt.com/opinion/john-ackerma...


# Text cleaning

In [7]:
def cleaning_financiero(body):
    """This function will clean all the special characters or tags used on the texts scrapped from the web site of 'El Financiero'"""
    body=body.replace('\xa0','')
    body=body.replace('\r', '')
    body=body.replace('1   ,','')
    body=body.replace('1\xa0 \xa0,','')
    body=body.replace('\xa0\xa0 \xa0  ,','') 
    body = body.replace(r'(*) Datos tomados de @David_S_Kaplan. La interpretación es responsabilidad de la autora.','')
    body = re.sub(r'[\*\s,]{0,}La autora[\w\W]*$','',body).strip()
    body = re.sub(r'Valeria Moy es profesora[\w\W]*$','',body).strip()
    body = re.sub(r',Twitter:@[\w\W]*$','',body).strip()
    body = re.sub(r'Twitter:@[\w\W]*$','',body).strip()
    body = re.sub(r',Twitter: @[\w\W]*$','',body).strip()
    body = re.sub(r'Twitter: @[\w\W]*$','',body).strip()
    body=body.replace('\n\n', ' \n\n ')
    return body.strip()

In [8]:
def cleaning_universal(body):
    """This function will clean all the special characters or tags used on the texts scrapped from the web site of 'El Universal'"""
    body=body.replace('\xa0','')
    body=body.replace('\n', ' \n\n ')
    body=body.replace('[email\xa0protected]','')
    body=body.replace('www.ricardoraphael.com','')
    return body.strip()

In [9]:
def cleaning_milenio(body):
    """This function will clean all the special characters or tags used on the texts scrapped from the web site of 'Milenio'"""
    body=body.replace('*Directora del IMCO ','')
    body=body.replace('\xa0','')
    body=body.replace('\n\n\n\n\n\nEsa población estrenó el confinamiento y la transformación de la vida. Tingshu Wang/Reuters\n\n\n\n\n\n\u200b\n','')
    body=body.replace('Te recomendamos...\n\n\n\n\n\n\n\n\n\n\n\n\nSuspenden audiencia de detención de Israel Vallarta; acudiría Carlos Loret de Mola\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDeseo que García Luna tenga debido proceso: abogado de Florence Cassez\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nFlorence Cassez volverá a México para "pedir cuentas" a Calderón y García Luna\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nEx pareja de Florence Cassez es dado de alta por covid-19\n\n\n\n\n\n\u200b\n','')
    body=body.replace('\n\nNota personal: Con esta colaboración termino un ciclo en MILENIO. Agradezco a los lectores, a mis compañeros de diario y a Francisco González por estos años de fructífera relación. Gracias.\n\n','')
    body=body.replace('u200b','')
    body=body.replace('\n\n', ' \n\n ')
    return body.strip()

In [10]:
def html_cleaner(text):
    
    """Replaces or eliminates generic html expressions"""
    
    # Replace bold
    text = text.replace('<strong>', ' <b> ').replace('</strong>', ' <\\b> ')
    text = re.sub('<b>\s[\w\W]{3}>', '', text)
    
    # Replace italics
    text = text.replace('</em></em>', '</em>')
    text = re.sub(r'<em[\s\w\d":/{}=;\.!-]*>', ' <i> ', text)
    text = text.replace('</em>', ' <\i> ')
    
    # Remove hyperlinks
    text = re.sub(r'</?a.*?(?=>)>', ' ', text)
            
    # Replace html spaces
    text = text.replace('\xa0', ' ')
    
    # Replace line breaks
    text = re.sub(r'<br/>', ' \n ', text)
    text = re.sub(r'<\n>\s<\n>\s', '  \n\n  ', text)  
            
    return text

In [11]:
def ackerman_LaJornada_cleaner(text):
    
    """Cleans text of John Ackerman's articles"""
    
    text = html_cleaner(text)
    
    # Remove brackets
    text = re.sub('^\[', '', text)
    text = re.sub('\]$', '', text)
    
    # Remove paragraph format 
    #text = re.sub(r'<p><[!--\s\w:/{}]*></p>,?', '', text)
    text = re.sub(r'<p[\s\w="-:;]*?>', '', text)
    text = re.sub(r'</?span[!--\s\w:/{}=;\.]*>', '', text)
    
    #Remove multiple spaces
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\s,\s', ', ', text)
    
    #Mark new paragraph
    text = re.sub(r'</p>,?[\s+]?', ' \n\n ', text)
    
    # Remove new lines or spaces at beginning of text
    text = re.sub('^\s+', '', text)
    
    # Remove random html
    text = re.sub(r'<!--[\w\W]+-->', '', text)
    
    # Remove new lines at end of text
    text = re.sub(r'<\n\n>\s+\*?\s?$', '', text)
    text = re.sub(r'<\n\n>\s+(<\n\n>)?\s?(<\n>)?\s?$', '', text)
    text = re.sub(r'<br/>\s*$', '', text)
    
    return text

In [12]:
def dresser_cleaner(text):
    
    """Cleans text of Denisse Dresser's articles"""
    
    text = html_cleaner(text)
    text = text.replace('</p>, <p>', ' \n\n ')
    text = text.replace('[<p>', '')
    text = text.replace('</p>]', '')
    text = text.replace('<\n\n> —<\n\n>', ' \n\n ')
    
    #Remove final footer
    pattern = '<\n\n> Gracias por visitar[\w\W]*$'
    saludo = re.findall(pattern, text)
    if saludo:
        text = text.replace(saludo[0], '')
    
    return text

In [13]:
def krauze_cleaner(text):
    
    """Cleans text of Enrique Krauze's articles"""
    
    text = html_cleaner(text)
    
    # Remove brackets
    text = re.sub('^\[', '', text)
    text = re.sub('\]$', '', text)
    
    # Remove paragraph format 
    text = re.sub(r'<p><[!--\s\w:/{}]*></p>,?', '', text)
    text = re.sub(r'<p[\s\w="-:;]*?>', '', text)
    text = re.sub(r'</?span[!--\s\w:/{}=;\.]*>', '', text)
    
    #Remove multiple spaces
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\s,\s', ', ', text)
    
    #Mark new paragraph
    text = re.sub(r'</p>,?[\s+]?', ' \n\n ', text)
    
    # Remove new lines or spaces at beginning of text
    text = re.sub('^\s+', '', text)
    
    # Remove publishing media
    text = re.sub('(<?i?>?|<?b?r?>?|\(?)Publicado[\w\W]+$', '', text)
    text = re.sub('\(?(Artículo|Texto) publicado[\w\W]+$', '', text)
    text = re.sub('<?i?>?Una versión (de\seste\stexto|más\samplia|de\seste\s)[\w\W]+$', '', text)
    text = re.sub('<\n\n>\s<i>Reforma[\w\W]+$', '', text)
    text = re.sub('<\n\n>\s<i>El\sNorte[\w\W]+$', '', text)
    text = re.sub('<\n\n>\s<?i?>?Proceso[\w\W]*$', '', text)
    text = re.sub('<\n\n>\s<?i?>El País[\w\W]*$', '', text)
    
    # Remove random html
    text = re.sub(r'<!--[\w\W]+-->', '', text)
    
    # Remove new lines at end of text
    text = re.sub(r'<\n\n>\s+\*?\s?$', '', text)
    text = re.sub(r'<\n\n>\s+(<\n\n>)?\s?(<\n>)?\s?$', '', text)
    text = re.sub(r'<br/>\s*$', '', text)
    
    return text

In [14]:
def text_cleaning(row):
    """This function will clean all the text scrapped for this specific project"""    
    if row['source']=='Milenio':
        return(cleaning_milenio(row['body']))
        
    elif row['source']=='El Universal':
        return(cleaning_universal(row['body']))
    
    elif row['source']=='El Financiero':
        return(cleaning_financiero(row['body']))
    
    elif (row['source']=='La Jornada') and (row['author']=='John Ackerman'):
        return(ackerman_LaJornada_cleaner(row['body']))
    
    elif (row['source']=='Personal website') and (row['author']=='Denisse Dresser'):
        return(dresser_cleaner(row['body']))
    
    elif (row['source']=='Personal website') and (row['author']=='Enrique Krauze'):
        return(krauze_cleaner(row['body']))
    
    elif row['source']=='RT':
        return row['body'].replace('\r\n', ' \n\n ')
    
    else:
        return row['body']

In [15]:
opi_art_clean = opi_art.copy()
opi_art_clean['body']=opi_art_clean.apply(lambda row:text_cleaning(row), axis = 1) 
#opi_art_clean.to_csv('../Data/Data_clean_csv/clean_dataframe.csv', index = False)

  text = re.sub(r'<p><[!--\s\w:/{}]*></p>,?', '', text)
  text = re.sub(r'</?span[!--\s\w:/{}=;\.]*>', '', text)


In [42]:
for author in opi_art_clean['author'].unique():
    file = open(f"../Data/Data_clean_txt/{author}.txt", "w") 
    text = ('| ' * 20).join(list(opi_art_clean[opi_art_clean['author']==author]['body']))
    file.write(text) 
    file.close() 



In [51]:
def mixed_df(df):
    """This function will create a mixed df and txt file in order to train a model containing text from all the authors"""
    val = df[df['author']=='Valeria Moy'].reset_index(drop = True)
    den = df[df['author']=='Denisse Dresser'].reset_index(drop = True)
    enr = df[df['author']=='Enrique Krauze'].reset_index(drop = True)
    joh = df[df['author']=='John Ackerman'].reset_index(drop = True)
    ric =  df[df['author']=='Ricardo Raphael'].reset_index(drop = True)
    
    df_mix = pd.DataFrame()
    for i in range(0,99):
        temp_df = pd.concat([val.loc[[i]], den.loc[[i]], enr.loc[[i]], joh.loc[[i]], ric.loc[[i]]])
        df_mix = pd.concat([df_mix, temp_df])
    return df_mix.reset_index(drop = True)

In [56]:
mix = mixed_df(opi_art_clean)
mix.to_csv('../Data/Data_clean_csv/mixed_dataframe.csv', index = False)