In [None]:
import os
import requests
import tarfile
import logging
import project_paths as pp

In [None]:
# Calling getLogger() without a name yields the root logger; its threshold is
# lowered to INFO so the INFO-level messages logged in this file pass the
# logger's own level check.
logger = logging.getLogger()
logger.setLevel(level=logging.INFO)

In [None]:
def download_dataset_from_web(url: str, download_file_path: str) -> bool:
    '''
    Downloads a dataset from a given URL and saves it to the specified path.

    The response body is streamed in chunks into a temporary sibling file
    (``<download_file_path>.part``) and moved to ``download_file_path`` only
    after the whole body has been received, so a failed download never leaves
    a truncated file at the final path.

    Args:
        url (str): The URL from which to download the dataset
        download_file_path (str): The local file path where the dataset should be saved

    Returns:
        bool: True if download was successful, False otherwise

    Any ``Exception`` raised during the download (connection problems, HTTP
    error status codes, timeouts, file-system errors) is logged together with
    its traceback and reported by returning False. ``KeyboardInterrupt`` and
    ``SystemExit`` are not intercepted and propagate to the caller.
    '''
    # Temporary destination; replaced onto the final path only on success
    partial_file_path = f'{download_file_path}.part'
    # 1 MiB per chunk keeps memory use bounded without issuing a write per KB
    chunk_size = 1024 * 1024

    try:
        # Log the start of download
        logger.info(f'Downloading the dataset from {url} ...')

        # Stream the download in chunks. The timeout (seconds) bounds the
        # connection attempt and each wait for data, so a stalled server
        # cannot block this call forever.
        with requests.get(url, stream=True, timeout=30) as response:
            # Treat 4xx/5xx answers as failures instead of saving the error page
            response.raise_for_status()
            with open(partial_file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # Filter out keep-alive chunks
                        file.write(chunk)

        # The body arrived completely: move it to its final location
        os.replace(partial_file_path, download_file_path)

    except Exception:
        # logger.exception records the message at ERROR level with the traceback
        logger.exception(f'Trouble downloading the dataset from {url}')
        # Best-effort cleanup of the incomplete temporary file
        try:
            os.remove(partial_file_path)
        except OSError:
            pass  # Nothing was written, or the file is already gone
        return False

    # Log successful download
    logger.info(f'Dataset downloaded successfully to "{download_file_path}"!')
    return True

In [None]:
# URL of the dataset archive to download
dataset_url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
# Path where the downloaded archive will be saved
download_file_path = os.path.join(pp.datasets_folder_path, 'aclImdb_v1.tar.gz')

# Download the dataset from the URL and save it to the specified path.
# The function reports failure through its return value, so stop here with a
# clear error instead of trying to extract a missing or incomplete archive.
if not download_dataset_from_web(dataset_url, download_file_path):
    raise RuntimeError(f'Could not download the dataset from {dataset_url}')

# Extract the dataset from the downloaded file
with tarfile.open(download_file_path, 'r:gz') as tar_file:
    if hasattr(tarfile, 'data_filter'):
        # The archive comes from the network: the 'data' filter rejects
        # members that would be written outside the destination folder
        # (absolute paths, '..' components, escaping links).
        tar_file.extractall(path=pp.datasets_folder_path, filter='data')
    else:
        # NOTE(review): this Python version has no extraction filters, so the
        # archive is extracted unfiltered; only use with a trusted source.
        tar_file.extractall(path=pp.datasets_folder_path)

# Log the completion of the extraction process
logger.info(f'Dataset extracted at "{pp.datasets_folder_path}"')