# 1. Data Acquisition Module

This module is responsible for acquiring and loading text data from various sources such as text files, CSV files, web pages, and APIs.

In [3]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from typing import List, Dict, Any
import logging

The function `load_data_from_file` takes a file path as input and checks whether the file exists.
If the file does not exist, a FileNotFoundError is raised. 
Otherwise, the function reads the content of the file line by line using the readlines() method and returns 
a list of strings containing the text data. The function uses the 'utf-8' encoding to handle various character sets.



In [4]:
def load_data_from_file(file_path: str, encoding: str = 'utf-8') -> List[str]:
    """
    Load text data from a file.

    The file is decoded with an explicit encoding (UTF-8 unless overridden) so
    that the result does not depend on the platform's default text encoding.

    :param file_path: str, the path to the text file
    :param encoding: str, the text encoding used to decode the file
    :return: List[str], one string per line, each keeping its trailing newline
    :raises FileNotFoundError: if ``file_path`` does not exist
    :raises UnicodeDecodeError: if the file content is not valid in ``encoding``
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file '{file_path}' does not exist.")

    with open(file_path, 'r', encoding=encoding) as file:
        data = file.readlines()

    logging.info(f'Loaded {len(data)} lines from "{file_path}".')

    return data



In [None]:
def convert_file_to_ascii_encoding(input_filename: str, output_filename: str):
    """
    Re-save a text file so that it contains only ASCII characters.

    The input file is read in full before the output file is opened. Every
    character that has no ASCII representation is substituted with '?'.

    Parameters:
    - input_filename (str): Path of the file whose text is read.
    - output_filename (str): Path of the file that receives the ASCII-only text.
    """
    with open(input_filename, 'r') as source:
        raw_text = source.read()

    # errors='replace' makes the encoder emit '?' for each non-ASCII character
    # instead of raising UnicodeEncodeError.
    ascii_bytes = raw_text.encode('ascii', errors='replace')

    with open(output_filename, 'w', encoding='ascii') as target:
        target.write(ascii_bytes.decode('ascii'))

The function `load_data_from_csv` takes a file path and a column name as input.
It checks whether the file exists, and if the file does not exist, a FileNotFoundError is raised. 
The function then reads the CSV file into a pandas DataFrame using the read_csv() method with 'utf-8' encoding. 
If the specified column name does not exist in the DataFrame, a ValueError is raised. 
Otherwise, the function extracts the data from the specified column and converts it into a Python list using the tolist() method; the values keep the types pandas inferred, so no explicit string conversion is performed.
Finally, the list containing the column's data is returned.



In [5]:
def load_data_from_csv(file_path: str, column_name: str) -> List[str]:
    """
    Read one column of a CSV file and return its values as a list.

    The values are returned as pandas parsed them; no string conversion is
    applied here.

    :param file_path: str, the path to the CSV file
    :param column_name: str, the name of the column containing the text data
    :return: List[str], the values of the requested column in row order
    :raises FileNotFoundError: if ``file_path`` does not exist
    :raises ValueError: if the CSV file has no column named ``column_name``
    """
    file_is_present = os.path.exists(file_path)
    if not file_is_present:
        raise FileNotFoundError(f"The file '{file_path}' does not exist.")

    frame = pd.read_csv(file_path, encoding='utf-8')

    # Happy path first: hand back the column as soon as we know it is there.
    if column_name in frame.columns:
        return frame[column_name].tolist()

    raise ValueError(f"The column '{column_name}' does not exist in the CSV file.")



The function `load_data_from_web` takes a URL as input and attempts to fetch the web page using the requests.get() method.
If the request fails, a ValueError is raised with the corresponding error message. 
Otherwise, the function proceeds to parse the HTML content of the web page using BeautifulSoup with the 'html.parser' parser.

The function then finds all the paragraph elements (\<p\>) in the parsed HTML using the find_all() method. 
It extracts the text content of each paragraph element using the get_text() method and stores it in a list. 
Finally, the function returns the list of strings containing the text data.



In [6]:
# TODO
def load_data_from_web(url: str, timeout: float = 30.0) -> List[str]:
    """
    Scrape text data from a web page.

    Fetches ``url`` and returns the text of every ``<p>`` element found in the
    response body.

    :param url: str, the URL of the web page
    :param timeout: float, seconds to wait for the server before giving up;
        without a timeout ``requests`` may wait indefinitely
    :return: List[str], a list of strings containing the text of each paragraph
    :raises ValueError: if the request fails, times out, or the server answers
        with an HTTP error status
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Chain the original exception so the underlying network error stays
        # visible in the traceback.
        raise ValueError(f"Failed to load data from URL '{url}': {e}") from e

    soup = BeautifulSoup(response.content, 'html.parser')

    return [paragraph.get_text() for paragraph in soup.find_all('p')]



The function `load_data_from_api` takes an API endpoint URL and a dictionary of parameters as input.
It attempts to fetch data from the API using the requests.get() method with the provided parameters. 
If the request fails, a ValueError is raised with the corresponding error message. 
Otherwise, the function proceeds to parse the JSON content of the API response using the response.json() method.

The function then processes the JSON data to extract the text data. 
The specific processing required depends on the structure of the JSON data returned by the API. 
In this example, it is assumed that the JSON data contains a key called "results" that holds a list of dictionaries, 
each containing a key "text" with the text data. 
The function iterates through the list of dictionaries and extracts the text data, storing it in a list. 
Finally, the function returns the list of strings containing the text data. Note that this is just an example, 
and you may need to modify the processing logic based on the specific API you are using.



In [7]:
# TODO
def load_data_from_api(api_endpoint: str, params: Dict[str, Any], timeout: float = 30.0) -> List[str]:
    """
    Retrieve text data from an API endpoint.

    Sends a GET request and reads the ``"text"`` field of every entry under the
    top-level ``"results"`` key of the JSON response. That response shape is an
    example only; adapt the extraction step to the API actually used.

    :param api_endpoint: str, the URL of the API endpoint
    :param params: Dict[str, Any], a dictionary of parameters to be sent in the API request
    :param timeout: float, seconds to wait for the server before giving up;
        without a timeout ``requests`` may wait indefinitely
    :return: List[str], a list of strings containing the text data
    :raises ValueError: if the request fails, times out, or the server answers
        with an HTTP error status
    :raises KeyError: if the JSON object lacks the expected "results" or "text" keys
    """
    try:
        response = requests.get(api_endpoint, params=params, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Chain the original exception so the underlying network error stays
        # visible in the traceback.
        raise ValueError(f"Failed to load data from API endpoint '{api_endpoint}': {e}") from e

    json_data = response.json()

    # Process the JSON data to extract the text data. The specific processing depends
    # on the structure of the JSON data returned by the API. This is just an example.
    data = [item["text"] for item in json_data["results"]]

    return data

The function `load_data_from_pubmed` takes an API endpoint URL and a dictionary of parameters as input.
It attempts to fetch data from the API using the requests.get() method with the provided parameters. 
If the request fails, a ValueError is raised with the corresponding error message. 
Otherwise, the function proceeds to parse the JSON content of the API response using the response.json() method.

The function then processes the JSON data to extract the text data. 
The specific processing required depends on the structure of the JSON data returned by the API. 
In this example, it is assumed that the JSON data contains a key called "results" that holds a list of dictionaries, 
each containing a key "text" with the text data. 
The function iterates through the list of dictionaries and extracts the text data, storing it in a list. 
Finally, the function returns the list of strings containing the text data. Note that this is just an example, 
and you may need to modify the processing logic based on the specific API you are using.


In [8]:
# TODO
def load_data_from_pubmed(api_endpoint: str, params: Dict[str, Any], timeout: float = 30.0) -> List[str]:
    """
    Retrieve text data from PubMed.

    Sends a GET request and reads the ``"text"`` field of every entry under the
    top-level ``"results"`` key of the JSON response.
    NOTE(review): this response shape is a placeholder example and has not been
    checked against the real PubMed API - confirm before relying on it.

    :param api_endpoint: str, the URL of the API endpoint
    :param params: Dict[str, Any], a dictionary of parameters to be sent in the API request
    :param timeout: float, seconds to wait for the server before giving up;
        without a timeout ``requests`` may wait indefinitely
    :return: List[str], a list of strings containing the text data
    :raises ValueError: if the request fails, times out, or the server answers
        with an HTTP error status
    :raises KeyError: if the JSON object lacks the expected "results" or "text" keys
    """
    try:
        response = requests.get(api_endpoint, params=params, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Chain the original exception so the underlying network error stays
        # visible in the traceback.
        raise ValueError(f"Failed to load data from API endpoint '{api_endpoint}': {e}") from e

    json_data = response.json()

    # Process the JSON data to extract the text data. The specific processing depends
    # on the structure of the JSON data returned by the API. This is just an example.
    data = [item["text"] for item in json_data["results"]]

    return data