# INICIALIZACIÓN

In [None]:
# @title INSTALACIÓN DE LIBRERÍAS

# !pip install ffmpeg-python

# !pip install gspread
# !pip install oauth2client

# !pip install nvidia-ml-py


# !pip install git+https://github.com/openai/whisper.git
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126



In [None]:
import sys
import os

# Resolve the local 'utils' folder against the current working directory so
# the entry registered on sys.path is an absolute path.
path_abs = os.path.abspath('utils')

# Register the folder only once; re-running the cell must not add duplicates.
if not sys.path.count(path_abs):
    sys.path.append(path_abs)

# Show the resulting module search path, one entry per line.
print("Contenido de sys.path:")
for path in sys.path:
    print(path)

import os
import time
import ffmpeg
import pandas as pd


import torch
print("¿CUDA disponible?:", torch.cuda.is_available())
print("Dispositivo CUDA:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "Ninguno")

import whisper
from whisper import load_model

# Para Google Sheets:
import gspread

# Para la autenticación local con Service Account (ejemplo):
from oauth2client.service_account import ServiceAccountCredentials

# Para la autenticación Google Colab:
# from google.colab import auth
# auth.authenticate_user()

from datetime import datetime  

In [None]:
import torch
from pynvml import (
    NVMLError,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetMemoryInfo,
    nvmlInit,
    nvmlShutdown,
    nvmlSystemGetDriverVersion,
)

# CUDA status as reported by PyTorch. This is printed first so the cell still
# gives useful output on machines where NVML cannot be initialised.
cuda_available = torch.cuda.is_available()
device_name = torch.cuda.get_device_name(0) if cuda_available else "Ninguno"

print(f"¿CUDA disponible?: {cuda_available}")
print(f"Dispositivo CUDA: {device_name}")

try:
    # NVML must be initialised before any query and shut down afterwards.
    nvmlInit()
    try:
        # Handle of the first GPU (index 0); it is only valid until nvmlShutdown().
        gpu_handle = nvmlDeviceGetHandleByIndex(0)

        # Memory counters are reported in bytes.
        memory_info = nvmlDeviceGetMemoryInfo(gpu_handle)

        # Convert bytes to GiB for display.
        total_memory_gb = memory_info.total / (1024 ** 3)
        used_memory_gb = memory_info.used / (1024 ** 3)
        free_memory_gb = memory_info.free / (1024 ** 3)

        print(f"Memoria total: {total_memory_gb:.2f} GB")
        print(f"Memoria usada: {used_memory_gb:.2f} GB")
        print(f"Memoria libre: {free_memory_gb:.2f} GB")
    finally:
        # Release NVML even if one of the queries above failed.
        nvmlShutdown()
except NVMLError as nvml_error:
    # No NVIDIA driver / no GPU at index 0: report it instead of aborting the cell.
    print(f"No se pudo consultar la memoria de la GPU con NVML: {nvml_error}")


## scrap_videofiles_local_dir_to_gspread()

In [None]:
# @title scrap_videofiles_local_dir_to_gspread()
def scrap_videofiles_local_dir_to_gspread(params: dict) -> None:
    """
    Find video files under a local directory tree, probe each one with ffprobe
    and send one row per file to a Google Sheets worksheet.

    Steps:
        1. Validate ``params``.
        2. Walk ``video_files_root_path`` collecting files whose extension matches.
        3. Run ``ffprobe`` on every file and flatten the result into a row.
        4. Write the rows to ``video_files_backup.csv`` in the working directory.
        5. Authenticate and upload through the ``dpm_google`` helpers.
        6. Delete the CSV backup once the upload call has returned.

    Args:
        params (dict):
            - video_files_root_path (str): Root directory to walk.
            - video_files_target_search_folder (list): Folder names of interest. When
              non-empty, only files located directly inside a folder whose name is in
              this list are collected. A single string is accepted as a one-item list.
            - video_files_target_search_extension (list): Extensions to look for
              (e.g. [".mp4"]), compared case-insensitively. A single string is
              accepted as a one-item list.
            - google_credentials_file (str): Name of the credentials JSON file.
            - google_credentials_folder_path_rel (str): Relative path of the credentials folder.
            - destination_files_path_table_spreadsheet_url (str): URL of the spreadsheet.
            - destination_files_path_table_spreadsheet_worksheet (str): Worksheet name.

    Returns:
        None

    Raises:
        ValueError: If a mandatory parameter is missing or no file matches.
        Exception: If ``gspread_initialize_client`` returns a falsy client. Errors
            raised by the ``dpm_google`` helpers propagate unchanged, in which case
            the CSV backup is left on disk.
    """
    import os
    import subprocess
    import json
    import pandas as pd
    from datetime import datetime
    from time import time

    print("\n==================== INICIO DEL PROCESO ====================\n")

    # Wall-clock start, used for the duration statistic printed further down.
    start_time = time()

    # Read and validate the input parameters.
    root_path = params.get('video_files_root_path')
    target_folders = params.get('video_files_target_search_folder', [])
    file_exts = params.get('video_files_target_search_extension', [])
    credentials_file = params.get('google_credentials_file')
    credentials_folder_path_rel = params.get('google_credentials_folder_path_rel')
    spreadsheet_url = params.get('destination_files_path_table_spreadsheet_url')
    worksheet_name = params.get('destination_files_path_table_spreadsheet_worksheet')

    if not root_path:
        raise ValueError("El parámetro 'video_files_root_path' es obligatorio.")
    if not file_exts:
        raise ValueError("El parámetro 'video_files_target_search_extension' es obligatorio y debe contener al menos una extensión.")
    if not credentials_file:
        raise ValueError("El parámetro 'google_credentials_file' es obligatorio.")
    if not spreadsheet_url:
        raise ValueError("El parámetro 'destination_files_path_table_spreadsheet_url' es obligatorio.")
    if not worksheet_name:
        raise ValueError("El parámetro 'destination_files_path_table_spreadsheet_worksheet' es obligatorio.")

    # A bare string would otherwise be iterated character by character
    # (extensions) or matched as a substring (folders).
    if isinstance(file_exts, str):
        file_exts = [file_exts]
    if isinstance(target_folders, str):
        target_folders = [target_folders]

    # Project-local helpers. Imported after the cheap parameter checks, but still
    # before the (potentially very long) directory scan so a missing module fails early.
    from dpm_google import gspread_initialize_client, gspread_df_to_sheet

    print("✔ Parámetros validados correctamente.\n")

    # Column order of the final table (also the order sent to the sheet).
    property_columns = [
        "file_name",
        "file_path",
        "file_creation_date",
        "file_last_modified_date",
        "file_scrap_date",
        "file_size_mb",
        "duration_hms",
        "duration_ms",
        "video_codec",
        "video_bitrate_kbps",
        "video_fps",
        "video_resolution",
        "audio_codec",
        "audio_bitrate_kbps",
        "audio_channels",
        "audio_sample_rate_hz",
    ]

    def find_files_in_folders(root_path, target_folders, file_exts):
        """Walk ``root_path`` and return a DataFrame with the path and name of every matching file."""
        # Normalise the extensions once instead of once per file.
        wanted_exts = {ext.lower() for ext in file_exts}
        results = []
        for root, _dirs, files in os.walk(root_path):
            # With a folder filter, only the files directly inside a matching folder count.
            if target_folders and os.path.basename(root) not in target_folders:
                continue

            for file in files:
                if os.path.splitext(file)[1].lower() not in wanted_exts:
                    continue
                file_path = os.path.join(root, file)
                results.append({
                    "video_file_path": file_path,
                    "video_file_name": file
                })
                print(f"    ➤ Archivo encontrado: {file} (Ruta: {file_path})")

        if not results:
            print("⚠ No se encontraron archivos que coincidan con los criterios especificados.\n")
            return pd.DataFrame()

        return pd.DataFrame(results)

    print(f"Buscando archivos en '{root_path}' con extensiones: {file_exts}...\n")
    df_paths = find_files_in_folders(root_path, target_folders, file_exts)

    if df_paths.empty:
        raise ValueError("No se encontraron archivos que coincidan con los criterios especificados.")

    print(f"✔ Total de archivos encontrados: {len(df_paths)}\n")

    def safe_int(value, default=0):
        """Convert ``value`` to int, returning ``default`` when it is missing or not numeric."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    def parse_frame_rate(rate):
        """Turn an ffprobe rational such as '30000/1001' into a float; None when unusable (e.g. '0/0')."""
        try:
            num, _sep, den = str(rate).partition('/')
            den_value = int(den) if den else 1
            return int(num) / den_value if den_value else None
        except ValueError:
            return None

    def extract_video_properties(file_path):
        """
        Probe one file with ffprobe and return a flat dict with its properties.

        Any failure (ffprobe missing, non-zero exit code, unparsable output,
        file-system error) is reported on stdout and a placeholder row with
        "unknown"/0 values is returned, so one bad file never aborts the scan.
        """
        try:
            file_size = os.path.getsize(file_path) // (1024 * 1024)

            # stderr is captured separately: ffprobe logs its diagnostics there and
            # merging them into stdout would corrupt the JSON document.
            result = subprocess.run(
                ['ffprobe', '-v', 'error', '-print_format', 'json', '-show_streams', '-show_format', file_path],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            if result.returncode != 0:
                detail = result.stderr.decode('utf-8', errors='replace').strip()
                raise RuntimeError(detail or f"ffprobe finalizó con código {result.returncode}")
            info = json.loads(result.stdout)

            video_codec = audio_codec = None
            video_bitrate = audio_bitrate = None
            video_width = video_height = None
            video_fps = None
            audio_channels = audio_sample_rate_hz = None
            duration_ms = duration_hms = None

            # If a file has several streams of the same type, the last one wins.
            for stream in info.get('streams', []):
                codec_type = stream.get('codec_type')
                if codec_type == 'video':
                    video_codec = stream.get('codec_name')
                    video_bitrate = safe_int(stream.get('bit_rate')) // 1000
                    video_width = stream.get('width')
                    video_height = stream.get('height')
                    if 'r_frame_rate' in stream:
                        video_fps = parse_frame_rate(stream['r_frame_rate'])
                elif codec_type == 'audio':
                    audio_codec = stream.get('codec_name')
                    audio_bitrate = safe_int(stream.get('bit_rate')) // 1000
                    audio_channels = stream.get('channels')
                    audio_sample_rate_hz = safe_int(stream.get('sample_rate'))

            if 'format' in info:
                duration = float(info['format'].get('duration', 0))
                whole_seconds = int(duration)
                duration_ms = int(duration * 1000)
                duration_hms = "{:02d}:{:02d}:{:02d}".format(
                    whole_seconds // 3600, (whole_seconds % 3600) // 60, whole_seconds % 60
                )

            # Only build "WxH" when both dimensions are known (avoids "NonexNone").
            video_resolution = f"{video_width}x{video_height}" if video_width and video_height else None

            return {
                "file_name": os.path.basename(file_path),
                "file_path": file_path,
                "file_creation_date": datetime.fromtimestamp(os.path.getctime(file_path)).strftime('%Y-%m-%d %H:%M:%S'),
                "file_last_modified_date": datetime.fromtimestamp(os.path.getmtime(file_path)).strftime('%Y-%m-%d %H:%M:%S'),
                "file_scrap_date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                "file_size_mb": file_size,
                "duration_hms": duration_hms,
                "duration_ms": duration_ms,
                "video_codec": video_codec,
                "video_bitrate_kbps": video_bitrate,
                "video_fps": video_fps,
                "video_resolution": video_resolution,
                "audio_codec": audio_codec,
                "audio_bitrate_kbps": audio_bitrate,
                "audio_channels": audio_channels,
                "audio_sample_rate_hz": audio_sample_rate_hz,
            }

        except Exception as e:
            print(f"⚠ Error al obtener las propiedades del vídeo: {e}")
            return {
                "file_name": os.path.basename(file_path),
                "file_path": file_path,
                "file_creation_date": "unknown",
                "file_last_modified_date": "unknown",
                "file_scrap_date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                "file_size_mb": 0,
                "duration_hms": "00:00:00",
                "duration_ms": 0,
                "video_codec": "unknown",
                "video_bitrate_kbps": 0,
                "video_fps": 0,
                "video_resolution": "unknown",
                "audio_codec": "unknown",
                "audio_bitrate_kbps": 0,
                "audio_channels": 0,
                "audio_sample_rate_hz": 0,
            }

    print("Extrayendo propiedades de los videos:\n")
    property_rows = [extract_video_properties(video_path) for video_path in df_paths['video_file_path']]

    print("✔ Propiedades extraídas correctamente.\n")

    print("Generando el DataFrame final con todas las propiedades...\n")
    # Build the table in one go from the list of row dicts, with a fixed column order.
    df_paths_properties = pd.DataFrame(property_rows, columns=property_columns)

    # Keep a local copy in case the upload below fails.
    backup_csv_path = "video_files_backup.csv"
    df_paths_properties.to_csv(backup_csv_path, index=False)
    print(f"✔ Datos guardados localmente como respaldo en: {backup_csv_path}\n")

    # Statistics for the scan + probe stage.
    total_videos = len(df_paths_properties)
    print(f"\n==================== ESTADÍSTICAS ====================")
    print(f"✔ Número total de videos procesados: {total_videos}")

    # Elapsed time up to this point (the upload is not included).
    end_time = time()
    process_duration = end_time - start_time
    print(f"✔ Duración total del proceso: {process_duration:.2f} segundos\n")

    print("Inicializando cliente de Google Sheets...\n")
    client = gspread_initialize_client(credentials_folder_path_rel, credentials_file)

    if not client:
        raise Exception("No se pudo autenticar con la hoja de Google Sheets.")

    print("✔ Cliente autenticado correctamente.\n")

    print("Enviando datos a Google Sheets...\n")
    gspread_df_to_sheet({
        'client': client,
        'spreadsheet_url': spreadsheet_url,
        'worksheet_name': worksheet_name,
        'df': df_paths_properties,
        'row_filter_list': list(range(len(df_paths_properties))),
        'col_filter_list': '',
        'include_header': True
    })
    print("✔ Datos enviados exitosamente.\n")

    # Only reached when the upload call returned without raising: drop the backup.
    os.remove(backup_csv_path)
    print(f"✔ Respaldo local eliminado: {backup_csv_path}\n")

    print("==================== FIN DEL PROCESO ====================\n")


## orchestrate_transcription_to_GSpread()

In [None]:
# @title orchestrate_transcription_to_GSpread()
def orchestrate_transcription_to_GSpread(params: dict) -> None:
    """
    Transcribe every video listed in a source Google Sheets worksheet and write
    one row per video to a destination worksheet.

    For each row of the source worksheet:
      1) Read the video path from the column named by ``field_name_for_file_path``
         (rows with an empty path are skipped).
      2) Transcribe the file with the loaded Whisper model.
      3) Split the full transcription into chunks of at most 50,000 characters.
      4) Split the timestamped, per-segment transcription the same way.
      5) Append to the destination worksheet:
         - file_path
         - transcription_date
         - transcription_duration
         - whisper_model
         - GPU_model
         - transcription_part_1 ... transcription_part_10
         - transcription_seg_part_1 ... transcription_seg_part_10

    The destination worksheet is created if it does not exist and is CLEARED
    before processing starts. Files whose transcription raises are reported on
    stdout and skipped. Text beyond 10 chunks is dropped, with a warning.

    Args:
        params (dict): Dictionary with the keys:
            - google_credentials_file (str): Credentials JSON file.
            - google_credentials_folder_path_rel (str): Folder containing the JSON file.
            - source_files_path_table_spreadsheet_url (str): URL of the source spreadsheet.
            - source_files_path_table_spreadsheet_worksheet (str): Source worksheet name.
            - field_name_for_file_path (str): Column holding the video path (e.g. "file_path").
            - transcription_destination_spreadsheet_url (str): URL of the destination spreadsheet.
            - transcription_destination_spreadsheet_worksheet (str): Destination worksheet name.
            - whisper_model_size (str): Whisper model size ("small", "medium", ...).
            - whisper_language (str): Transcription language (e.g. "es"). Defaults to "en".
            - google_api_retries (int): Write attempts per row on API errors.
              Defaults to 3; missing/zero falls back to 3 and at least one attempt is made.

    Returns:
        None

    Raises:
        ValueError: If a mandatory parameter is missing, the source worksheet is
            empty, the path column is not in its header, or it has no data rows.
        gspread.exceptions.APIError: If a row still cannot be written after the
            configured number of attempts.
    """
    import os
    import time
    from datetime import datetime

    start_time = datetime.now()
    print(f"\n=== Iniciando proceso de transcripción y escritura en Google Sheets | {start_time} ===\n", flush=True)

    # Validate the essential parameters before importing the heavy dependencies.
    required_keys = [
        "google_credentials_file", "google_credentials_folder_path_rel",
        "source_files_path_table_spreadsheet_url", "source_files_path_table_spreadsheet_worksheet",
        "transcription_destination_spreadsheet_url", "transcription_destination_spreadsheet_worksheet",
        "field_name_for_file_path", "whisper_model_size"
    ]
    for key in required_keys:
        if not params.get(key):
            raise ValueError(f"El parámetro '{key}' es obligatorio.")

    # Third-party dependencies. torch is imported here as well so the function
    # does not depend on an earlier notebook cell having imported it.
    import gspread
    import torch
    import whisper
    from oauth2client.service_account import ServiceAccountCredentials

    # Chunking limits: characters per chunk and number of chunk columns per text.
    max_cell_chars = 50000
    max_parts = 10

    # At least one write attempt per row; a value of 0 would silently skip every write.
    google_api_retries = max(1, int(params.get("google_api_retries") or 3))
    field_name_for_file_path = params["field_name_for_file_path"]
    whisper_model_size = params["whisper_model_size"]
    whisper_language = params.get("whisper_language", "en")  # Default language: English

    # Authenticate against Google Sheets with the service-account credentials.
    print("Autenticando con Google Sheets...", flush=True)
    credentials_path = os.path.join(params["google_credentials_folder_path_rel"], params["google_credentials_file"])
    scope = ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive"]
    creds = ServiceAccountCredentials.from_json_keyfile_name(credentials_path, scope)
    client = gspread.authorize(creds)
    print("Autenticación exitosa.\n", flush=True)

    # Read the source worksheet.
    print(f"Accediendo a la hoja fuente: {params['source_files_path_table_spreadsheet_worksheet']} en {params['source_files_path_table_spreadsheet_url']}...", flush=True)
    source_sheet = client.open_by_url(params["source_files_path_table_spreadsheet_url"]).worksheet(params["source_files_path_table_spreadsheet_worksheet"])
    all_values = source_sheet.get_all_values()
    if not all_values:
        raise ValueError("La hoja fuente está vacía o no se ha podido leer.")

    # First row is the header; check the path column against it directly so the
    # error is accurate even when there are no data rows.
    header = [col_name.strip() for col_name in all_values[0]]
    if field_name_for_file_path not in header:
        raise ValueError(f"La columna '{field_name_for_file_path}' no existe en la hoja fuente.")

    # One dict per data row; cells missing at the end of a short row become "".
    source_data = [
        {col_name: (row[i] if i < len(row) else "") for i, col_name in enumerate(header)}
        for row in all_values[1:]
    ]
    if not source_data:
        raise ValueError("La hoja fuente no contiene filas de datos.")

    print(f"Hoja fuente cargada correctamente. {len(source_data)} filas leídas.\n", flush=True)

    # Prepare the destination worksheet (create it when missing).
    print(f"Preparando hoja destino: {params['transcription_destination_spreadsheet_worksheet']} en {params['transcription_destination_spreadsheet_url']}...", flush=True)
    spreadsheet_dest = client.open_by_url(params["transcription_destination_spreadsheet_url"])
    try:
        destination_sheet = spreadsheet_dest.worksheet(params["transcription_destination_spreadsheet_worksheet"])
    except gspread.WorksheetNotFound:
        print(f"La hoja destino no existe. Creándola...", flush=True)
        destination_sheet = spreadsheet_dest.add_worksheet(
            title=params["transcription_destination_spreadsheet_worksheet"], rows=1000, cols=30
        )

    # Clear the destination worksheet and write the header row.
    destination_sheet.clear()
    dest_header = [
        "file_path", "transcription_date", "transcription_duration", "whisper_model", "GPU_model"
    ] + [
        f"transcription_part_{i}" for i in range(1, max_parts + 1)
    ] + [
        f"transcription_seg_part_{i}" for i in range(1, max_parts + 1)
    ]
    destination_sheet.update(range_name="A1", values=[dest_header])
    print("Hoja destino preparada y encabezados definidos.\n", flush=True)

    # Load the Whisper model once and reuse it for every file.
    print(f"Cargando modelo Whisper '{whisper_model_size}'...", flush=True)
    model = whisper.load_model(whisper_model_size)
    gpu_model = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No"
    print("Modelo Whisper cargado.\n", flush=True)

    def trocear_texto(texto: str, max_chars: int = max_cell_chars, max_partes: int = max_parts) -> list:
        """
        Split a string into consecutive chunks of length <= max_chars.
        Returns exactly max_partes items: surplus chunks are dropped and
        missing ones are filled with "".
        """
        trozos = [texto[i:i + max_chars] for i in range(0, len(texto), max_chars)]
        trozos = trozos[:max_partes]
        if len(trozos) < max_partes:
            trozos += [""] * (max_partes - len(trozos))
        return trozos

    # Process the source rows one by one.
    filas_totales = len(source_data)
    capacity_chars = max_cell_chars * max_parts
    for idx, row_data in enumerate(source_data, start=1):
        video_path_value = row_data[field_name_for_file_path]

        if not video_path_value:
            continue

        print(f"({idx}/{filas_totales}) Transcribiendo archivo: {video_path_value} (idioma='{whisper_language}')...", flush=True)
        start_transcription_time = time.time()
        try:
            result = model.transcribe(video_path_value, language=whisper_language)
            transcription_full = result["text"] or ""
            transcription_segments_full = "".join(
                [f"[{seg['start']:.2f}s - {seg['end']:.2f}s]: {seg['text']}\n" for seg in result.get("segments", [])]
            )
        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"Error al transcribir {video_path_value}: {e}", flush=True)
            continue

        duration = round(time.time() - start_transcription_time, 2)

        # Make the truncation visible instead of losing text silently.
        if len(transcription_full) > capacity_chars or len(transcription_segments_full) > capacity_chars:
            print(
                f"Aviso: la transcripción de {video_path_value} supera {capacity_chars} caracteres y se truncará.",
                flush=True,
            )

        transcription_parts = trocear_texto(transcription_full)
        transcription_seg_parts = trocear_texto(transcription_segments_full)
        transcription_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        row_to_insert = [
            video_path_value, transcription_date, duration, whisper_model_size, gpu_model
        ] + transcription_parts + transcription_seg_parts

        # Append the row, retrying on API errors with a fixed 5-second pause.
        for attempt in range(google_api_retries):
            try:
                destination_sheet.append_row(row_to_insert, value_input_option="USER_ENTERED")
                break
            except gspread.exceptions.APIError as e:
                if attempt < google_api_retries - 1:
                    print(f"Error al escribir fila {idx}, reintentando... ({attempt + 1}/{google_api_retries})", flush=True)
                    time.sleep(5)
                else:
                    print(f"Error persistente al escribir fila {idx}: {e}", flush=True)
                    raise

    end_time = datetime.now()
    print(f"\n=== Proceso completado | {end_time} ===", flush=True)
    print(f"Duración total: {end_time - start_time}", flush=True)


# EJECUCIONES

## SCRAP DE ARCHIVOS DE VIDEO

In [6]:
%%time

# Ejemplo de uso de scrap_videofiles_local_dir_to_gspread()

# Run configuration for scrap_videofiles_local_dir_to_gspread():
# credentials, search scope and destination worksheet.
params = dict(
    google_credentials_file='animum-dev-apps-dbcd013fb088.json',
    google_credentials_folder_path_rel='api_keys',

    video_files_root_path=r"G:\Otros ordenadores\Mi PC\D",
    video_files_target_search_folder=[],  # e.g. ["CAPITULO EXPORTAR", "CAPITULOS EXPORTAR"]
    video_files_target_search_extension=[".mp4"],

    destination_files_path_table_spreadsheet_url="https://docs.google.com/spreadsheets/d/1dMut8Smy2DENEr6r9s7pP2U0_yquY9WgQwLuOSrh6iI",
    destination_files_path_table_spreadsheet_worksheet='ResponseAPI2',
)

# Launch the scan; failures are reported on stdout instead of raising, with
# parameter/"no files" problems (ValueError) told apart from anything else.
try:
    scrap_videofiles_local_dir_to_gspread(params)
except ValueError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"Error inesperado: {e}")
else:
    print("Datos enviados exitosamente a Google Sheets.")




✔ Parámetros validados correctamente.

Buscando archivos en 'G:\Otros ordenadores\Mi PC\D' con extensiones: ['.mp4']...

    ➤ Archivo encontrado: Helen Mirren Teaches Acting.mp4 (Ruta: G:\Otros ordenadores\Mi PC\D\Helen Mirren Teaches Acting\Helen Mirren Teaches Acting.mp4)
    ➤ Archivo encontrado: GMT20230421-090822_Recording_1920x1080.mp4 (Ruta: G:\Otros ordenadores\Mi PC\D\FORMACION INTERNA CANVAS\2 Cursos\1 Fundamentales de Cursos 1\GMT20230421-090822_Recording_1920x1080.mp4)
    ➤ Archivo encontrado: GMT20230421-090822_Recording_gallery_1920x1080.mp4 (Ruta: G:\Otros ordenadores\Mi PC\D\FORMACION INTERNA CANVAS\2 Cursos\1 Fundamentales de Cursos 1\GMT20230421-090822_Recording_gallery_1920x1080.mp4)
    ➤ Archivo encontrado: GMT20230412-133818_Recording_gallery_1920x1080.mp4 (Ruta: G:\Otros ordenadores\Mi PC\D\FORMACION INTERNA CANVAS\1 Administración\3 Configuración y ampliación de Canvas\GMT20230412-133818_Recording_gallery_1920x1080.mp4)
    ➤ Archivo encontrado: GMT20230412-

  df.fillna("", inplace=True)


Hoja de Google Sheets actualizada con éxito.
✔ Datos enviados exitosamente.

✔ Respaldo local eliminado: video_files_backup.csv


Datos enviados exitosamente a Google Sheets.
CPU times: total: 1.12 s
Wall time: 36min 21s


## TRANSCRIPCIONES

In [None]:
# Run configuration for orchestrate_transcription_to_GSpread():
# credentials, source/destination worksheets and Whisper settings.
params_template = dict(
    google_credentials_file='animum-dev-apps-dbcd013fb088.json',
    google_credentials_folder_path_rel='api_keys',
    google_api_retries=3,

    source_files_path_table_spreadsheet_url="https://docs.google.com/spreadsheets/d/1EEHwPEf6fWckLAEo37HYd7eOoaAp7siiZyDxy5MFaa8",
    source_files_path_table_spreadsheet_worksheet="SOURCE",
    field_name_for_file_path="file_path",

    transcription_destination_spreadsheet_url="https://docs.google.com/spreadsheets/d/1EEHwPEf6fWckLAEo37HYd7eOoaAp7siiZyDxy5MFaa8",
    transcription_destination_spreadsheet_worksheet="ResponseAPI",

    whisper_model_size="large",  # Whisper model
    whisper_language="en",
)

# Launch the transcription run; failures are reported on stdout instead of
# raising, with parameter problems (ValueError) told apart from anything else.
try:
    orchestrate_transcription_to_GSpread(params_template)
except ValueError as e:
    print(f"Error en los parámetros: {e}")
except Exception as e:
    print(f"Error inesperado: {e}")
else:
    print("Proceso completado con éxito.")



=== Iniciando proceso de transcripción y escritura en Google Sheets | 2025-02-13 11:13:12.801171 ===

Autenticando con Google Sheets...
Autenticación exitosa.

Accediendo a la hoja fuente: SOURCE en https://docs.google.com/spreadsheets/d/1EEHwPEf6fWckLAEo37HYd7eOoaAp7siiZyDxy5MFaa8...
Hoja fuente cargada correctamente. 248 filas leídas.

Preparando hoja destino: ResponseAPI en https://docs.google.com/spreadsheets/d/1EEHwPEf6fWckLAEo37HYd7eOoaAp7siiZyDxy5MFaa8...
Hoja destino preparada y encabezados definidos.

Cargando modelo Whisper 'large'...
