In [73]:
import requests
from bs4 import BeautifulSoup

def webscrapping(url):
    """Collect the text of every <h3> heading that starts with a digit.

    Downloads ``url``, parses the response body with BeautifulSoup's
    'html.parser', and keeps the text of each <h3> element whose first
    character is a digit.

    Args:
        url: Address of the page to download.

    Returns:
        list[str]: Text of each matching <h3>, in document order.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the notebook from hanging forever on a stalled server.
    respuesta = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    respuesta.raise_for_status()
    soup = BeautifulSoup(respuesta.text, 'html.parser')

    autor = []
    for titulo in soup.find_all('h3'):
        texto = titulo.text
        # Guard against empty headings: indexing [0] on '' raises IndexError.
        if texto and texto[0].isdigit():
            autor.append(texto)
    return autor

In [74]:
# Scrape the numbered phrases for each author; the Poe and Cervantes lists
# are truncated to their first 50 entries.
jk_rowling = webscrapping('https://psicologiaymente.com/reflexiones/frases-jk-rowling')
allan_poe = webscrapping('https://psicologiaymente.com/reflexiones/frases-edgar-allan-poe')[:50]
cervantes_m = webscrapping('https://psicologiaymente.com/reflexiones/frases-miguel-de-cervantes')[:50]

In [75]:
# Bibliotecas para procesamiento de texto
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# NLTK WordNet lemmatizer.
# NOTE(review): not referenced by any cell visible in this part of the notebook.
lemmatizer = WordNetLemmatizer()
# Spanish stopwords, stored as a set for fast membership tests
stop_words = set(stopwords.words('spanish'))
# Load the spaCy model for Spanish
nlp = spacy.load("es_core_news_sm")

def normalize_text(text):
    """Normalize a phrase into a space-separated string of lemmas.

    Steps: tokenize with NLTK's ``word_tokenize``, lowercase, keep only
    alphanumeric tokens (which drops punctuation), drop pure-digit tokens
    and tokens present in ``stop_words``, then run the remaining text
    through the spaCy pipeline ``nlp`` and join each token's lemma.

    Args:
        text: Raw phrase to normalize.

    Returns:
        str: The lemmas of the surviving tokens joined by single spaces
        (an empty string when no token survives the filters).
    """
    # Tokenize and lowercase
    lowered = [word.lower() for word in word_tokenize(text)]
    # Drop punctuation (non-alphanumeric), pure digits, and stopwords
    kept = [
        word
        for word in lowered
        if word.isalnum() and not word.isdigit() and word not in stop_words
    ]
    # Process the filtered text with spaCy to obtain lemmas
    doc = nlp(" ".join(kept))
    # Return the lemmatized string; previously the lemmas were computed and
    # then discarded because the un-lemmatized string was returned instead.
    return " ".join(word.lemma_ for word in doc)

In [76]:
# Apply the normalization to every phrase of each author
jk_rowling_norm = list(map(normalize_text, jk_rowling))
allan_poe_norm = list(map(normalize_text, allan_poe))
cervantes_m_norm = list(map(normalize_text, cervantes_m))

In [77]:
# Label J.K. Rowling's phrases with 0 and the other authors' phrases with 1,
# producing (phrase, label) tuples
jk_rowling_tag = list(zip(jk_rowling_norm, [0] * len(jk_rowling_norm)))
allan_poe_tag = list(zip(allan_poe_norm, [1] * len(allan_poe_norm)))
cervantes_m_tag = list(zip(cervantes_m_norm, [1] * len(cervantes_m_norm)))

In [78]:
# Combine all labelled phrases into a single DataFrame
import pandas as pd
from pathlib import Path

df = pd.DataFrame(jk_rowling_tag + allan_poe_tag + cervantes_m_tag, columns=['Phrase', 'Author'])

# Save the DataFrame as CSV. Create the output directory first: to_csv
# raises OSError when the parent directory does not exist.
output_path = Path('corpus/authorship_heavy.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)