In [2]:
import os
import sys

# Add workspace to path
path = os.getcwd() 
workspace = os.path.abspath(os.path.join(path, os.pardir))
sys.path.append(workspace)

from cleaning.data_importer_boe import load_data
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
import re

nlp = spacy.load("es")

In [13]:
def tokenize(text):
    """
    Apply lemmatiazation and remove stopwords and punctuation tokens in the current text
    """
    
    row = row.lower()
    row = set((token.lemma_ for token in nlp(text) if not token.is_stop and not token.is_punct))
    
    return row

def preprocess(df):
    
    # Drop newlines tokens
    df = df.replace('\n', np.nan, regex=True)
    
    # Drop files with no texto inside
    df = df[~df.texto.isna()]
    
    # Format date columns
    dates_columns = [col for col in df.columns if 'fecha_' in col]
    df[dates_columns] = df[dates_columns].apply(lambda date: pd.to_datetime(date, format='%Y%m%d', errors='coerce'))
    
    # Get tables
    tables = df.texto.str.findall(r'(<table>.*?</table>)')
    tables = pd.DataFrame(tables)[tables.str.len() > 0]
    
    # Drop the tables after get it all
    df.texto = df.texto.apply(lambda row: re.sub(r"<.?table[^>]*>|<.?t[rd]>|<font[^>]+>|<.?b>", "", row)) 
    
    # Get images
    images = df.texto.str.findall(r'src="([^"]+)"')
    images = pd.DataFrame(images)[images.str.len() > 0]
    
    # Drop the images after get it all
    df.texto = df.texto.apply(lambda row: re.sub(r"<.?img[^>]*>|<.?t[rd]>|<font[^>]+>|<.?b>", "", row))  
    
    # Drop every tag
    df.texto = df.texto.apply(lambda row: re.sub(r"<[^>]*>", "", row))  
    
    # Tokenize the text
    df['tokens'] = df.texto.apply(tokenize).apply(lambda row: ' '.join(row))
    
    # TF-IDF calculation
    vocabulary = set([item for sublist in df.tokens.str.split(' ').tolist() for item in sublist])
    pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
                     ('tfid', TfidfTransformer())]).fit(df.tokens)
    
    tfidf = pd.DataFrame(pipe.transform(df.tokens).toarray(), index=df.index, columns=vocabulary)
    
    return df, tables, images, tfidf

In [18]:
df = load_data(path_to_data='../data', seccion='1')

In [19]:
processed_df, tables, images, tfidf = preprocess(df)

In [20]:
processed_df

Unnamed: 0,documento,metadatos,identificador,titulo,diario,diario_numero,seccion,subseccion,departamento,rango,...,url_pdf_valenciano,analisis,notas,materias,alertas,referencias,anteriores,posteriores,texto,tokens
0,,,BOE-A-2020-13843,Terminación del Acuerdo entre el Reino de Espa...,Boletín Oficial del Estado,295,1,,"Ministerio de Asuntos Exteriores, Unión Europe...",Nota Diplomática,...,,,,,,,,,el Acuerdo de 28 de junio de 2010,2010 junio 28
1,,,BOE-A-2020-13842,Terminación del Acuerdo entre el Reino de Espa...,Boletín Oficial del Estado,295,1,,"Ministerio de Asuntos Exteriores, Unión Europe...",Nota Diplomática,...,,,,,,,,,del Acuerdo de 28 de junio de 2010,2010 junio 28
2,,,BOE-A-2020-13342,"Resolución de 26 de octubre de 2020, de la Sec...",Boletín Oficial del Estado,289,1,,Ministerio de Trabajo y Economía Social,Resolución,...,,,,,,,,,"el art. 11.2 de la Ley de Empleo, texto refun...",ley artículo decretar texto legislativo real...
3,,,BOE-A-2020-13343,"Resolución de 28 de octubre de 2020, de la Dir...",Boletín Oficial del Estado,289,1,,Ministerio de Trabajo y Economía Social,Resolución,...,,,,,,,,,"el art. 45.4 del Real Decreto 2001/1983, de 28...",decretar artículo 45.4 real 2001/1983 28 julio
4,,,BOE-A-2020-13610,"Resolución de 29 de octubre de 2020, del Conso...",Boletín Oficial del Estado,292,1,,Ministerio de Hacienda,Resolución,...,,,,,,,,,"el art. 3.2 del Real Decreto 1671/2009, de 6 d...",decretar artículo noviembre real 6 1671/2009 3.2
5,,,BOE-A-2020-13467,"Orden TMA/1017/2020, de 23 de octubre, por la ...",Boletín Oficial del Estado,290,1,,"Ministerio de Transportes, Movilidad y Agenda ...",Orden,...,,,,,,,,,"el art. 7.2 del Real Decreto 147/2019, de 15 d...",decretar artículo marzo real 147/2019 7.2 15
6,,,BOE-A-2020-14051,"Ley 3/2020, de 29 de octubre, del sistema de p...",Boletín Oficial del Estado,298,1,,Comunidad Autónoma de Extremadura,Ley,...,,,,,,,,,"el art. 14.a) de la Ley 8/2001, de 14 de junio",14 ley artículo junio 8/2001 14.a
7,,,BOE-A-2020-14050,"Real Decreto 917/2020, de 20 de octubre, por e...",Boletín Oficial del Estado,298,1,,"Ministerio de Inclusión, Seguridad Social y Mi...",Real Decreto,...,,,,,,,,,el art. 67 del Reglamento aprobado por Real De...,887/2006 decretar artículo real aprobar reglam...
8,,,BOE-A-2020-14049,"Resolución de 11 de noviembre de 2020, de la D...",Boletín Oficial del Estado,298,1,,Ministerio de Sanidad,Resolución,...,,,,,,,,,la Resolución de 24 de julio de 2020,julio 24 2020 resolución
9,,,BOE-A-2020-14047,"Real Decreto 914/2020, de 20 de octubre, por e...",Boletín Oficial del Estado,298,1,,"Ministerio de Asuntos Exteriores, Unión Europe...",Real Decreto,...,,,,,,,,,el art. 67 del Reglamento aprobado por Real De...,887/2006 decretar artículo real aprobar reglam...


In [21]:
tables

Unnamed: 0,texto


In [22]:
images

Unnamed: 0,texto


In [23]:
tfidf

Unnamed: 0,Unnamed: 1,210/2018,publicar,14.a,20,decreto-ley,iv,determinar,16,precepto,...,1205/2011,mayar,d,30/2020,capítulo,47/2004,1614/2011,59/2003,6.2,foral
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.289921,0.0,0.0,0.0,0.0,...,0.374842,0.0,0.0,0.0,0.0,0.0,0.374842,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.414286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.543704,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.414286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
