# Auto reload modules

In [1]:
# Load IPython's autoreload extension and select mode 2, so edits made to
# imported .py modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Imports and directories

In [2]:
from multiprocessing.pool import ThreadPool
from urllib import request
import pandas as pd
import sqlalchemy
import requests
import shutil
import urllib
import pyodbc
import gzip
import re
import fs

In [3]:
# Filesystem object rooted at the raw-data folder; download_url resolves its
# gz/ paths through it. The path is relative, so it presumably resolves
# against the kernel's working directory — confirm before moving the notebook.
RAW_DIR = fs.open_fs("../../data/raw")

In [4]:
def decompress_gz(input_file: str, output_file: str) -> None:
    """
    Decompress a .gz file and write the decompressed content to an output file.

    Failures are reported on stdout instead of being raised.

    Parameters:
    - input_file (str): Path of the compressed .gz file.
    - output_file (str): Path of the file that receives the decompressed content.

    Returns:
    - None. A success or error message is printed.
    """
    try:
        # copyfileobj streams in chunks, so the archive is never fully loaded in memory.
        with gzip.open(input_file, 'rb') as f_in, open(output_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        print(f"Archivo {input_file} descomprimido exitosamente como {output_file}")
    except FileNotFoundError as e:
        # Either path can raise this (missing input, or missing output directory);
        # report the path that actually failed instead of always blaming the input.
        missing = e.filename if e.filename is not None else input_file
        print(f"Error: El archivo {missing} no fue encontrado.")
    except Exception as e:
        print(f"Se produjo un error: {e}")



In [5]:
def download_url(url):
    """
    Download one gzip-compressed file into RAW_DIR and decompress it.

    The archive is saved under ``gz/`` using the last path segment of the URL
    as its file name. The decompressed copy is written under ``tsv/`` with the
    ``.gz`` suffix dropped and the first dot of the file name replaced by an
    underscore (``title.ratings.tsv.gz`` -> ``title_ratings.tsv``).

    Parameters:
    - url (str): Address of the .gz file to download.

    Returns:
    - None.

    Raises:
    - Errors from ``request.urlretrieve`` propagate to the caller; decompression
      errors are printed by ``decompress_gz`` instead.
    """
    # Download process
    print("downloading: ",url)
    file_title = url.rsplit("/", 1)[-1]

    # Make sure both target folders exist; recreate=True tolerates existing ones.
    RAW_DIR.makedirs("gz", recreate=True)
    RAW_DIR.makedirs("tsv", recreate=True)

    FILE_DIR = RAW_DIR.getsyspath("gz/"+file_title)
    request.urlretrieve(url=url, filename=FILE_DIR)

    # Build the output name from the file name only. Rewriting the full system
    # path would also hit any "gz" or "." that appears in a parent directory.
    tsv_title = file_title[:-len(".gz")] if file_title.endswith(".gz") else file_title
    tsv_title = tsv_title.replace(".", "_", 1)

    decompress_gz(FILE_DIR, RAW_DIR.getsyspath("tsv/" + tsv_title))

In [6]:
# Source: https://developer.imdb.com/non-commercial-datasets/
# IMDb dataset archives to fetch.
urls = [
    "https://datasets.imdbws.com/title.episode.tsv.gz",
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
    "https://datasets.imdbws.com/title.akas.tsv.gz",
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    "https://datasets.imdbws.com/title.crew.tsv.gz",
    "https://datasets.imdbws.com/title.principals.tsv.gz",
    "https://datasets.imdbws.com/name.basics.tsv.gz",
]

# Spread the downloads over five worker threads; map blocks until all finish.
with ThreadPool(processes=5) as workers:
    workers.map(download_url, urls)

downloading: downloading:  https://datasets.imdbws.com/title.ratings.tsv.gz
 https://datasets.imdbws.com/title.episode.tsv.gz
downloading:  https://datasets.imdbws.com/title.akas.tsv.gz
downloading:  https://datasets.imdbws.com/title.basics.tsv.gz
downloading:  https://datasets.imdbws.com/title.crew.tsv.gz
Archivo c:\Users\Anony\Documents\MCD\Mineria\DM&N\data\raw\gz\title.ratings.tsv.gz descomprimido exitosamente como c:\Users\Anony\Documents\MCD\Mineria\DM&N\data\raw\tsv\title_ratings.tsv
downloading:  https://datasets.imdbws.com/title.principals.tsv.gz
Archivo c:\Users\Anony\Documents\MCD\Mineria\DM&N\data\raw\gz\title.episode.tsv.gz descomprimido exitosamente como c:\Users\Anony\Documents\MCD\Mineria\DM&N\data\raw\tsv\title_episode.tsv
downloading:  https://datasets.imdbws.com/name.basics.tsv.gz
Archivo c:\Users\Anony\Documents\MCD\Mineria\DM&N\data\raw\gz\title.crew.tsv.gz descomprimido exitosamente como c:\Users\Anony\Documents\MCD\Mineria\DM&N\data\raw\tsv\title_crew.tsv
Archivo