In [147]:
from bs4 import BeautifulSoup
from collections import Counter
import nltk 
from nltk.corpus import stopwords
import os
import pandas as pd
import numpy as np
import string
import re
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\annap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [93]:
path_train = "../data/training"
path_test = "../data/test"

# Read files

In [94]:
def read_files(main_path:str)->pd.DataFrame:

    path_list = []
    file_list = []
    content_list = []

    for path in os.listdir(main_path):
        _path = os.path.join(main_path, path)
        if os.path.isdir(_path):
            for file in os.listdir(_path):
                _file = os.path.join(_path, file)
                if os.path.isfile(_file):
                    with open(_file, 'r',  errors='ignore') as f:
                        content = f.read()
                    path_list.append(path)
                    file_list.append(file)
                    content_list.append(content)

    return pd.DataFrame( { 'number_file': file_list, 'classe': path_list,  'content': content_list})

In [95]:
df_train = read_files(path_train)
df_train = df_train.astype({'number_file': int, 'content':str})
df_train.head()

Unnamed: 0,number_file,classe,content
0,5,acq,\n\nCOMPUTER TERMINAL SYSTEMS <CPML> COMPLETES...
1,7,acq,\n\nOHIO MATTRESS <OMT> MAY HAVE LOWER 1ST QTR...
2,27,acq,\n\nMCLEAN'S <MII> U.S. LINES SETS ASSET TRANS...
3,28,acq,\n\nCHEMLAWN <CHEM> RISES ON HOPES FOR HIGHER ...
4,44,acq,\n\n<COFAB INC> BUYS GULFEX FOR UNDISCLOSED AM...


In [96]:
df_test = read_files(path_test)
df_test = df_test.astype({'number_file': int, 'content':str})
df_test.head()

Unnamed: 0,number_file,classe,content
0,9613,acq,\n\nSUMITOMO BANK AIMS AT QUICK RECOVERY FROM ...
1,9618,acq,\n\nBOND CORP STILL CONSIDERING ATLAS MINING B...
2,9628,acq,\n\nCRA SOLD FORREST GOLD FOR 76 MLN DLRS - WH...
3,9643,acq,\n\nANHEUSER-BUSCH JOINS BID FOR SAN MIGUEL\n\...
4,9653,acq,\n\nMONIER SAYS BRITAIN'S REDLAND MAY BID FOR ...


# Preprocessing


In this case we will use 2 approaches to represent these documents, TF-IDF and SBERT, therefore, using TF-IDF we will use the following preprocessing:

    - [] convert text to lowercase letters
    - [] removing numbers
    - [] removing symbols
    - [] removing tags (HTML/XML)
    - [] removing urls
    - [] removing stop words
    
To generate these embeddings using Language Models, we will not use any preprocessing and wait for the model to make the best decision.

In [128]:
pp_config = { 'convert': 'lower', 
              'numbers': True,
              'split_line':True,
              'stop_words': True,     
              'symbols':True,       
              'tags': True,
              'url': True,
            }

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text:str):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

def remove_html(text):
    return BeautifulSoup(text, "html").text

def remove_split_line(text:str):
    return re.sub(r'[\n\t]', '', text)

def remove_numbers(text):
    return re.sub(r'\d', '', text)

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def preprocess_english_text(text:str, config:dict):

    if config['convert'] == 'lower':
        text = text.lower()
    if config['numbers'] == True:
        text = remove_numbers(text)
    if config['split_line'] == True:
        text = remove_split_line(text)
    if config['stop_words'] == True:   
        text = remove_stopwords(text)
    if config['symbols'] == True:
        text = remove_punctuation(text)
    if config['tags'] == True:
        text = remove_html(text)
    if config['url'] == True:
        text = remove_urls(text)
    
    return text

def count_words(values:np.array):
    
    cnt = Counter()
    for text in values:
        for word in text.split():
            cnt[word] += 1
    return cnt

In [168]:
# Remover os mais comuns e os raros,
# baseados somente nas instâncias de treino, pq as de teste, é como se elas não existissem
cnt = count_words(df_train["processed"].values)
RAREWORDS = set([w for w, w_count in cnt.items() if w_count == 1])
COMMONWORDS = set([w for (w, w_count) in cnt.most_common()[:10]])

df_train['processed2'] = df_train['processed'].apply( lambda x : ' '.join([word for word in x.split() if word not in RAREWORDS]))
df_train['processed2'] = df_train['processed'].apply( lambda x : ' '.join([word for word in x.split() if word not in COMMONWORDS]))

In [170]:
df_train.head()

Unnamed: 0,number_file,classe,content,processed,processed2
0,5,acq,\n\nCOMPUTER TERMINAL SYSTEMS <CPML> COMPLETES...,computer terminal systems cpml completes sale ...,computer terminal systems cpml completes sale ...
1,7,acq,\n\nOHIO MATTRESS <OMT> MAY HAVE LOWER 1ST QTR...,ohio mattress omt may lower st qtr net clevela...,ohio mattress omt may lower st qtr net clevela...
2,27,acq,\n\nMCLEAN'S <MII> U.S. LINES SETS ASSET TRANS...,mcleans mii us lines sets asset transfer cranf...,mcleans mii lines sets asset transfer cranford...
3,28,acq,\n\nCHEMLAWN <CHEM> RISES ON HOPES FOR HIGHER ...,chemlawn chem rises hopes higher bidsauthor ca...,chemlawn chem rises hopes higher bidsauthor ca...
4,44,acq,\n\n<COFAB INC> BUYS GULFEX FOR UNDISCLOSED AM...,cofab inc buys gulfex undisclosed amount houst...,cofab inc buys gulfex undisclosed amount houst...


# Encoding

In [171]:
df_train.shape

(11413, 5)

In [172]:
df_train['number_file'].nunique()

9598

In [179]:
df_train[["number_file", "classe"]].drop_duplicates()

Unnamed: 0,number_file,classe
0,5,acq
1,7,acq
2,27,acq
3,28,acq
4,44,acq
...,...,...
11408,7379,zinc
11409,7855,zinc
11410,8905,zinc
11411,9423,zinc


In [167]:
RAREWORDS = set([w for w, w_count in cnt.items() if w_count == 1])



{'billion', 'cts', 'dlrs', 'march', 'mln', 'pct', 'said', 'us', 'vs', 'year'}