### Pipeline to generate the dataset and preprocess it


In [1]:
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
import wikipediaapi
import re
import spacy


In [7]:
def get_corpus(topics, output_path='res/corpus.txt'):
    """
    Fetches Wikipedia text content for a list of topics and saves it to a file.

    The text of every topic page is joined with single spaces, runs of
    consecutive newlines are collapsed into one newline, and the result is
    written to ``output_path`` (any existing file is overwritten).

    Args:
        topics (list): A list of topic titles for fetching content.
        output_path (str): Destination file for the corpus. Defaults to
            'res/corpus.txt', the path that was previously hard-coded.

    Returns:
        None
    """
    user_agent = "university_project/1.0"
    wiki_wiki = wikipediaapi.Wikipedia(user_agent, 'en')

    # Fetch content for topics and join them into a single corpus
    corpus = ' '.join(wiki_wiki.page(topic).text for topic in topics)

    # Replace multiple newlines with a single newline
    corpus = re.sub(r'\n+', '\n', corpus)

    # Wikipedia text regularly contains non-ASCII characters; an explicit
    # UTF-8 encoding keeps the write from depending on the platform locale.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(corpus)


def load_topics(file_path):
    """
    Loads topics from a file and returns them as a list.

    Each line is stripped of surrounding whitespace. Lines that are empty
    after stripping are skipped so that blank or trailing lines do not turn
    into empty topic titles.

    Args:
        file_path (str): Path to the file containing topics, one per line,
            encoded as UTF-8.

    Returns:
        list: A list of non-empty topics, in file order.
    """
    # Explicit UTF-8 so titles with accented characters are read the same
    # way on every platform.
    with open(file_path, "r", encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        return [topic for topic in stripped_lines if topic]

# Read the topic titles (one per line) that define which pages go into the corpus.
topics = load_topics('res/guitar_players.txt')

# Fetch every topic page and write the joined text to 'res/corpus.txt'.
# NOTE(review): later cells read '../lab3/res/corpus.txt' — confirm both
# paths resolve to the same file for the working directory in use.
get_corpus(topics)


In [3]:
def load_data(file_path):
    """
    Loads sentences from a file and returns them as a list.

    Every line of the file becomes one entry, stripped of surrounding
    whitespace (blank lines are kept as empty strings). A missing file is
    treated as best-effort: a message is printed and an empty list returned.

    Args:
        file_path (str): Path to the UTF-8 encoded file containing sentences.

    Returns:
        list: A list of sentences; empty if the file does not exist.
    """
    try:
        # Explicit UTF-8 so the corpus is decoded identically regardless of
        # the platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            return [line.strip() for line in file]
    except FileNotFoundError:
        print("File not found.")
        return []

# Shared spaCy pipeline; filter_corpus uses it for sentence splitting and lemmas.
nlp = spacy.load('en_core_web_sm')
# Lemma a sentence must contain to be kept when passed to filter_corpus.
verb = 'play'

def filter_corpus(corpus, verb, output_path="../lab3/res/filtered_corpus.txt"):
    """
    Filters relevant sentences from the corpus based on a specific verb.

    Each text is parsed with the module-level spaCy pipeline ``nlp`` and
    split into sentences; a sentence is kept when at least one of its tokens
    has a lemma equal to ``verb``. The kept sentences are also written to
    ``output_path``, one per line.

    NOTE(review): the match is on the lemma only, so a non-verb token whose
    lemma equals ``verb`` also selects the sentence — confirm this is intended.

    Args:
        corpus (list): A list of sentences to filter.
        verb (str): The verb to search for in the sentences.
        output_path (str): File the relevant sentences are written to.
            Defaults to the path that was previously hard-coded.

    Returns:
        list: A list of relevant sentences containing the specified verb.
    """
    relevant_sentences = []
    for text in corpus:
        for sentence in nlp(text).sents:
            # One matching token is enough; any() stops at the first hit.
            if any(token.lemma_ == verb for token in sentence):
                relevant_sentences.append(sentence.text)

    # Explicit UTF-8: the sentences come from Wikipedia and may contain
    # characters the platform default encoding cannot represent.
    with open(output_path, "w", encoding="utf-8") as file:
        file.writelines(sentence + "\n" for sentence in relevant_sentences)
    return relevant_sentences

In [4]:
# Load the raw corpus lines, keep only the sentences containing the target
# verb, then read the filtered sentences back from the file filter_corpus wrote.
corpus = load_data('../lab3/res/corpus.txt')
filtered_corpus = filter_corpus(corpus, verb)
sentences = load_data('../lab3/res/filtered_corpus.txt')

In [5]:
def clean_text(text):
    """
    Cleans and preprocesses text data by removing stopwords and punctuation.

    The text is tokenized with NLTK's ``word_tokenize``. Tokens that are
    English stopwords (compared case-insensitively) or that consist solely of
    punctuation characters are dropped, and the rest are joined with spaces.

    Args:
        text (str): Input text to be cleaned.

    Returns:
        str: Cleaned text.
    """
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        # Test every character rather than `token in string.punctuation`:
        # that is a substring check, so multi-character punctuation tokens
        # emitted by the tokenizer ("``", "''", "...", "--") slipped through.
        if all(char in punctuation for char in token):
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

def substitute(sentences, names, output_path="../lab3/res/sentences.txt"):
    """
    Substitutes names in sentences with 'person' and cleans the sentences.

    For every sentence, whole-word, case-insensitive occurrences of any entry
    in ``names`` are replaced by 'person', whole-word occurrences of 'it' are
    replaced by 'thing', and the result is passed through ``clean_text``.
    The processed sentences are also written to ``output_path``, one per line.

    Args:
        sentences (list): List of sentences to process.
        names (list): List of names to be substituted. Empty or
            whitespace-only entries are ignored.
        output_path (str): File the processed sentences are written to.
            Defaults to the path that was previously hard-coded.

    Returns:
        list: List of sentences with substituted names and cleaned text.
    """
    # Empty entries must be dropped: an empty alternative matches at every
    # word boundary and would insert 'person' throughout each sentence.
    # Longest-first ordering lets a full name win over a shorter name that
    # is its prefix, since regex alternation takes the first match.
    candidates = sorted(
        {name for name in names if name.strip()},
        key=lambda name: (-len(name), name),
    )

    name_pattern = None
    if candidates:
        name_pattern = re.compile(
            r'\b(' + '|'.join(re.escape(name) for name in candidates) + r')\b',
            flags=re.IGNORECASE,
        )
    it_pattern = re.compile(r'\b(it)\b', flags=re.IGNORECASE)

    replaced_sentences = []
    for sentence in sentences:
        replaced_sentence = sentence
        if name_pattern is not None:
            replaced_sentence = name_pattern.sub('person', replaced_sentence)
        replaced_sentence = it_pattern.sub('thing', replaced_sentence)
        replaced_sentences.append(clean_text(replaced_sentence))

    # Explicit UTF-8 so non-ASCII characters in the sentences can be written
    # regardless of the platform default encoding.
    with open(output_path, "w", encoding="utf-8") as file:
        file.writelines(sentence + "\n" for sentence in replaced_sentences)

    return replaced_sentences

def load_txt(file_path):
    """
    Loads text data from a file and returns it as a list of rows.

    Each line is stripped of surrounding whitespace. Lines that are empty
    after stripping are skipped, so blank or trailing lines never produce
    empty entries in the result.

    Args:
        file_path (str): Path to the UTF-8 encoded file containing text data.

    Returns:
        list: List of non-empty rows from the file, in file order.
    """
    # Explicit UTF-8 so rows with accented characters are decoded the same
    # way on every platform.
    with open(file_path, "r", encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        return [row for row in stripped_lines if row]

In [6]:
# Combine the name and pronoun lists into one set of terms to replace, then
# substitute them in the filtered sentences and clean the result.
names = load_txt('../lab3/res/names.txt')
pronouns = load_txt('../lab3/res/pronouns.txt')
n_and_p = names + pronouns
filtered_sentences = substitute(sentences, n_and_p)
