# Imports

In [1]:
!pip install transformers datasets torch langdetect huggingface_hub

import pandas as pd
import numpy as np
import datetime
import sqlite3
import re
import gc
import os
import zipfile
import json
import langdetect
import torch
import pyarrow.parquet as pq

from google.colab import userdata
from huggingface_hub import login
from IPython.display import display
from sklearn.preprocessing import LabelEncoder
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata

#Settings


In [2]:
data_lake_root = '/content/data_lake'
os.makedirs(data_lake_root, exist_ok=True)

database_root = f'{data_lake_root}/sqlite'
os.makedirs(database_root, exist_ok=True)

login(token=userdata.get('HF_TOKEN'))

# Utils

In [3]:
def log_operation(message):
    print(f"[{datetime.datetime.now()}] {message}")

def get_db_connection():
    return sqlite3.connect(f'{data_lake_root}/sqlite/db')

# Data Ingestion

In [4]:
os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')

!kaggle datasets download -d piyushjain16/amazon-product-data

file_path = '/content/dataset/train.csv'

if not os.path.exists(file_path):
    !unzip -o "amazon-product-data.zip"
else:
    print(f"File {file_path} already exists. Skipping unzip.")

dataset_csv_path = 'dataset/train.csv'

Dataset URL: https://www.kaggle.com/datasets/piyushjain16/amazon-product-data
License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
Downloading amazon-product-data.zip to /content
100% 645M/646M [00:33<00:00, 23.7MB/s]
100% 646M/646M [00:33<00:00, 20.3MB/s]
Archive:  amazon-product-data.zip
  inflating: dataset/train.csv       


# Catalogs

## Landing Zone

In [5]:
log_operation("Starting Amazon products dataset loading")

landing_zone = f'{data_lake_root}/landing_zone'
os.makedirs(landing_zone, exist_ok=True)

try:
  # Read the file with options to handle possible encoding and delimiter issues
  df = pd.read_csv(dataset_csv_path, encoding='utf-8', low_memory=False)
  df = df.head(100)  # Selects the first 500,000 rows

  # Check basic information
  log_operation(f"Dataset loaded successfully. Dimensions: {df.shape}")

  # Display dataset information
  print("\nDataset Information:")
  print(f"Number of records: {df.shape[0]}")
  print(f"Number of columns: {df.shape[1]}")
  print("\nColumns present in the dataset:")
  print(df.columns.tolist())

  # Check for missing values
  print("\nCount of missing values per column:")
  print(df.isnull().sum())

  # Count unique values per column
  print("\nUnique Values per Column:")
  print(df.nunique())

  # Show first rows for inspection
  print("\nFirst 5 rows of the dataset:")
  print(df.head())

  # Save the original file in the landing zone with a timestamp
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
  landing_file = f"{landing_zone}/amazon_products_raw_{timestamp}.parquet"

  # Save in parquet format for better compression and performance
  df.to_parquet(landing_file, index=False)
  log_operation(f"Dataset saved in the Landing Zone: {landing_file}")

  # Register basic metadata
  metadata = {
      'source_file': file_path,
      'landing_file': landing_file,
      'rows': df.shape[0],
      'columns': df.shape[1],
      'column_names': df.columns.tolist(),
      'timestamp': timestamp,
      'null_counts': df.isnull().sum().to_dict()
  }

  # Save metadata
  with open(f"{landing_zone}/metadata_{timestamp}.json", 'w') as f:
      json.dump(metadata, f, indent=2)

  log_operation("Loading process into the Landing Zone successfully completed")
  log_operation("Starting SQL Microtransformation")

  conn = get_db_connection()
  cursor = conn.cursor()

  cursor.execute('DROP TABLE IF EXISTS products')
  cursor.execute('''
      CREATE TABLE products (
          PRODUCT_ID INTEGER PRIMARY KEY,
          TITLE TEXT,
          BULLET_POINTS TEXT,
          DESCRIPTION TEXT,
          PRODUCT_TYPE_ID INTEGER,
          PRODUCT_LENGTH REAL
      )
  ''')
  conn.commit()

  df['PRODUCT_ID'] = df['PRODUCT_ID'].astype('int32')
  df['PRODUCT_TYPE_ID'] = df['PRODUCT_TYPE_ID'].astype('int32')
  df['PRODUCT_LENGTH'] = df['PRODUCT_LENGTH'].astype('float32')
  df['TITLE'] = df['TITLE'].astype('category')
  df['BULLET_POINTS'] = df['BULLET_POINTS'].astype('category')
  df['DESCRIPTION'] = df['DESCRIPTION'].astype('category')

  df.to_sql('products', conn, if_exists='replace', index=False)

  # Fill missing DESCRIPTION values with BULLET_POINTS (filled_description)
  query = '''
  SELECT
      PRODUCT_ID,
      TITLE,
      BULLET_POINTS,
      DESCRIPTION,
      PRODUCT_TYPE_ID,
      PRODUCT_LENGTH,
      COALESCE(DESCRIPTION, BULLET_POINTS) AS filled_description
  FROM products
  '''
  df_transformed = pd.read_sql_query(query, conn)

  print("\nFirst 5 records after transformations:")
  print(df_transformed.head())

  print("\nCount of missing values per column:")
  print(df_transformed.isnull().sum())

  conn.close()

  del df, df_transformed
  gc.collect()
  log_operation("Microtransformation process successfully completed")

except Exception as e:
    log_operation(f"Error during dataset loading: {str(e)}")


[2025-03-03 13:23:12.336821] Starting Amazon products dataset loading
[2025-03-03 13:23:47.775718] Dataset loaded successfully. Dimensions: (100, 6)

Dataset Information:
Number of records: 100
Number of columns: 6

Columns present in the dataset:
['PRODUCT_ID', 'TITLE', 'BULLET_POINTS', 'DESCRIPTION', 'PRODUCT_TYPE_ID', 'PRODUCT_LENGTH']

Count of missing values per column:
PRODUCT_ID          0
TITLE               0
BULLET_POINTS      44
DESCRIPTION        52
PRODUCT_TYPE_ID     0
PRODUCT_LENGTH      0
dtype: int64

Unique Values per Column:
PRODUCT_ID         100
TITLE              100
BULLET_POINTS       56
DESCRIPTION         48
PRODUCT_TYPE_ID     87
PRODUCT_LENGTH      69
dtype: int64

First 5 rows of the dataset:
   PRODUCT_ID                                              TITLE  \
0     1925202  ArtzFolio Tulip Flowers Blackout Curtain for D...   
1     2673191  Marks & Spencer Girls' Pyjama Sets T86_2561C_N...   
2     2765088  PRIKNIK Horn Red Electric Air Horn Compressor ... 

## Standardized Zone

In [6]:
standardized_zone = f'{data_lake_root}/standardized_zone'
os.makedirs(standardized_zone, exist_ok=True)

try:

  log_operation("Searching for the most recent file in the Landing Zone")

  landing_files = [f for f in os.listdir(landing_zone) if f.endswith('.parquet')]
  latest_file = sorted(landing_files)[-1]
  landing_file_path = f"{landing_zone}/{latest_file}"

  log_operation(f"Raw file found: {landing_file_path}")

  # Load the dataset
  df = pd.read_parquet(landing_file_path)
  log_operation(f"Dataset loaded successfully. Dimensions: {df.shape}")

  log_operation("Standardizing column names")
  df.columns = [col.lower().replace(' ', '_') for col in df.columns]

  log_operation("Starting data cleaning and standardization")

  # Function to remove special characters and HTML
  def clean_text(text):
      if pd.isna(text):
          return text

      # Convert to string
      text = str(text)

      # Remove HTML tags
      text = re.sub(r'<.*?>', '', text)

      # Remove special characters and keep only alphanumeric, spaces, and basic punctuation
      text = re.sub(r'[^\w\s.,;:!?-]', ' ', text)

      # Remove multiple spaces
      text = re.sub(r'\s+', ' ', text)

      return text.strip()

  # Apply cleaning to text columns
  text_columns = df.select_dtypes(include=['object']).columns.tolist()
  for col in text_columns:
      log_operation(f"Cleaning text column: {col}")
      df[col] = df[col].apply(clean_text)

  # 3. Handling missing values
  log_operation("Handling missing values")

  # For text columns, replace NaN with an empty string
  for col in text_columns:
      df[col] = df[col].fillna('')

  # For numeric columns, replace NaN with 0 or mean, depending on the context
  numeric_columns = df.select_dtypes(include=['number']).columns.tolist()

  for col in numeric_columns:
      df[col] = df[col].fillna(0)

  # 4. Extract features from description and bullet points
  log_operation("Extracting features from text")

  # Check if there is a description column
  if 'description' in df.columns:
      # Create feature for description length
      df['description_length'] = df['description'].apply(lambda x: len(str(x)))

      # Create feature for counting keywords
      keywords = ['quality', 'premium', 'best', 'new', 'improved']
      df['keyword_count'] = df['description'].apply(
          lambda x: sum(1 for keyword in keywords if keyword.lower() in str(x).lower())
      )

  # Check if there is a bullet points column
  if 'bullet_points' in df.columns:
      # Count number of bullet points
      df['bullet_count'] = df['bullet_points'].apply(
          lambda x: len(str(x).split('\n')) if pd.notna(x) else 0
      )

  # 6. Encoding categorical variables
  log_operation("Encoding categorical variables")

  # Identify categorical columns (with few unique values)
  categorical_columns = []
  for col in text_columns:
      if df[col].nunique() < 50:  # Arbitrary limit, adjust as needed
          categorical_columns.append(col)

  # Encode categorical variables
  label_encoders = {}
  for col in categorical_columns:
      le = LabelEncoder()
      not_null = df[col].notna()
      if not_null.any():
          df.loc[not_null, f'{col}_encoded'] = le.fit_transform(df.loc[not_null, col])
          label_encoders[col] = {label: idx for idx, label in enumerate(le.classes_)}
      else:
          df[f'{col}_encoded'] = np.nan

  # 7. Save the standardized dataset
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
  standardized_file = f"{standardized_zone}/amazon_products_standardized_{timestamp}.parquet"
  df.to_parquet(standardized_file, index=False)

  log_operation(f"Standardized dataset saved at: {standardized_file}")

  # Create data dictionary
  log_operation("Generating data dictionary")

  # Basic structure for the data dictionary
  data_dict = []
  for col in df.columns:
      col_info = {
          'column_name': col,
          'data_type': str(df[col].dtype),
          'description': '',
          'sample_values': str(df[col].head(3).tolist()),
          'null_count': int(df[col].isna().sum()),
          'unique_values': int(df[col].nunique()),
      }

      if 'product_id' in col:
          col_info['description'] = 'Unique ID for each product'
      elif 'title' in col:
          col_info['description'] = 'Name of the product'
      elif 'bullet_points' in col:
          col_info['description'] = 'Main features of the product in list format'
      elif 'description' == col:
          col_info['description'] = 'Full description of the product'
      elif 'product_length' in col:
          col_info['description'] = 'Physical dimensions of the product'

      data_dict.append(col_info)

  # Save the data dictionary
  import json
  with open(f"{standardized_zone}/data_dictionary_{timestamp}.json", 'w') as f:
      json.dump(data_dict, f, indent=2)

  log_operation("Standardization process completed successfully")

  # Display information about the standardized dataset
  print("\nStandardized Dataset Information:")
  print(f"Number of records: {df.shape[0]}")
  print(f"Number of columns: {df.shape[1]}")
  print("\nNew columns created:")
  original_cols = set(pd.read_parquet(landing_file_path).columns)
  new_cols = set(df.columns) - original_cols
  print(list(new_cols))
except Exception as e:
    log_operation(f"Error during creation of the Standardized Zone: {str(e)}")

[2025-03-03 13:23:48.971724] Searching for the most recent file in the Landing Zone
[2025-03-03 13:23:48.972013] Raw file found: /content/data_lake/landing_zone/amazon_products_raw_20250303_132347.parquet
[2025-03-03 13:23:49.089005] Dataset loaded successfully. Dimensions: (100, 6)
[2025-03-03 13:23:49.089055] Standardizing column names
[2025-03-03 13:23:49.089291] Starting data cleaning and standardization
[2025-03-03 13:23:49.089821] Cleaning text column: title
[2025-03-03 13:23:49.092802] Cleaning text column: bullet_points
[2025-03-03 13:23:49.098930] Cleaning text column: description
[2025-03-03 13:23:49.111136] Handling missing values
[2025-03-03 13:23:49.113339] Extracting features from text
[2025-03-03 13:23:49.116127] Encoding categorical variables
[2025-03-03 13:23:49.150973] Standardized dataset saved at: /content/data_lake/standardized_zone/amazon_products_standardized_20250303_132349.parquet
[2025-03-03 13:23:49.151063] Generating data dictionary
[2025-03-03 13:23:49.1576

## Staging Zone

In [7]:
staging_zone = f'{data_lake_root}/staging_zone'
os.makedirs(staging_zone, exist_ok=True)

# Find the most recent file in the standardized zone
log_operation("Finding the most recent file in the Standardized Zone")

standardized_files = [f for f in os.listdir(standardized_zone) if f.endswith('.parquet')]
latest_file = sorted(standardized_files)[-1]
standardized_file_path = f"{standardized_zone}/{latest_file}"

log_operation(f"File found: {standardized_file_path}")

# Load the dataset
df = pd.read_parquet(standardized_file_path)
log_operation(f"Dataset loaded successfully. Dimensions: {df.shape}")

try:
  # 1. Create a column that tracks which columns are null
  log_operation("Creating null tracking column")

  # Check for 'title', 'bullet_points', and 'description' columns
  # If they don't exist but similar columns do, map them
  column_mapping = {}

  # Check for title column
  title_candidates = ['title']
  for col in title_candidates:
      if col in df.columns:
          column_mapping['title'] = col
          break

  # Check for bullet_points column
  bullet_candidates = ['bullet_points']
  for col in bullet_candidates:
      if col in df.columns:
          column_mapping['bullet_points'] = col
          break

  # Check for description column
  desc_candidates = ['description']
  for col in desc_candidates:
      if col in df.columns:
          column_mapping['description'] = col
          break

  log_operation(f"Column mapping established: {column_mapping}")

  # Create working copies with standardized column names
  working_df = df.copy()

  # Create standardized column names in working dataframe
  for std_col, actual_col in column_mapping.items():
      if std_col != actual_col:
          working_df[std_col] = df[actual_col]

  # Ensure all three columns exist, create them if they don't
  for col in ['title', 'bullet_points', 'description']:
      if col not in working_df.columns:
          log_operation(f"Column {col} not found, creating empty column")
          working_df[col] = np.nan

  # Create the null tracking column
  working_df['null_columns'] = ''

  # Check which columns are null and update the null_columns
  for col in ['title', 'bullet_points', 'description']:
      # Add column name to null_columns if the value is null
      working_df.loc[working_df[col].isna() | (working_df[col] == ''), 'null_columns'] = \
        working_df.loc[working_df[col].isna() | (working_df[col] == ''), 'null_columns'] + col + ','

  # Remove trailing comma
  working_df['null_columns'] = working_df['null_columns'].str.rstrip(',')

  # 2. Copy content between columns when values are missing
  log_operation("Copying content between columns to fill missing values")

  # Copy bullet_points to description and vice versa when one is missing
  # First, convert all to string to avoid type issues
  for col in ['title', 'bullet_points', 'description']:
      working_df[col] = working_df[col].astype(str).replace('nan', '')

  # Copy bullet_points to description when description is empty
  working_df.loc[working_df['description'] == '', 'description'] = working_df.loc[working_df['description'] == '', 'bullet_points']

  # Copy description to bullet_points when bullet_points is empty
  working_df.loc[working_df['bullet_points'] == '', 'bullet_points'] = working_df.loc[working_df['bullet_points'] == '', 'description']

  # If both description and bullet_points are empty, copy title to both
  working_df.loc[(working_df['description'] == '') & (working_df['bullet_points'] == ''), 'description'] = \
      working_df.loc[(working_df['description'] == '') & (working_df['bullet_points'] == ''), 'title']

  working_df.loc[(working_df['description'] == '') & (working_df['bullet_points'] == ''), 'bullet_points'] = \
      working_df.loc[(working_df['description'] == '') & (working_df['bullet_points'] == ''), 'title']

  # 3. Detect language of the title
  log_operation("Detecting language of the title")

  # Function to detect language with error handling
  def detect_language(text):
      if not isinstance(text, str) or text.strip() == '':
          return 'unknown'

      try:
          return detect(text)
      except LangDetectException:
          return 'unknown'
      except Exception as e:
          log_operation(f"Error detecting language: {str(e)}")
          return 'unknown'

  # Apply language detection to title column
  log_operation("This might take a while for large datasets...")
  # working_df['title_language'] = working_df['title'].apply(detect_language)
  working_df['title_language'] = 'en'

  # 4. Calculate statistics on the enhancements
  log_operation("Calculating statistics on enhancements")

  # Count how many rows had content copied between columns
  description_filled = (working_df['description'] != '') & (working_df['null_columns'].str.contains('description'))
  bullet_points_filled = (working_df['bullet_points'] != '') & (working_df['null_columns'].str.contains('bullet_points'))

  stats = {
      'rows_processed': len(working_df),
      'rows_with_null_columns': (working_df['null_columns'] != '').sum(),
      'description_filled_from_other_columns': int(description_filled.sum()),
      'bullet_points_filled_from_other_columns': int(bullet_points_filled.sum()),
      'language_distribution': working_df['title_language'].value_counts().to_dict()
  }

  log_operation("Enhancement statistics:")
  for key, value in stats.items():
      if key != 'language_distribution':
          log_operation(f"- {key}: {value}")

  log_operation("Top 5 detected languages:")
  for lang, count in sorted(stats['language_distribution'].items(), key=lambda x: x[1], reverse=True)[:5]:
      log_operation(f"- {lang}: {count}")

  # 5. Save the enhanced dataset
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
  staging_file = f"{staging_zone}/amazon_products_staging_{timestamp}.parquet"
  working_df.to_parquet(staging_file, index=False)

  log_operation(f"Enhanced dataset saved to: {staging_file}")

  # 6. Save enhancement metadata and statistics
  metadata = {
      'source_file': standardized_file_path,
      'staging_file': staging_file,
      'timestamp': timestamp,
      'enhancements': [
          'null_columns_tracking',
          'cross_column_content_copying',
          'language_detection'
      ],
      'stats': {k: int(v) if isinstance(v, np.integer) else v for k, v in stats.items()},
      'column_mapping': column_mapping
  }

  with open(f"{staging_zone}/staging_metadata_{timestamp}.json", 'w') as f:
      json.dump(metadata, f, indent=2)

  log_operation("Enhancement process completed successfully")

  # Display sample rows to verify the enhancements
  print("\nSample rows from enhanced dataset:")
  display_cols = ['title', 'bullet_points', 'description', 'null_columns', 'title_language']
  print(working_df[display_cols].head().to_string())

  staging_rows_pct = ((description_filled | bullet_points_filled).sum() / len(working_df)) * 100
  print(f"\nPercentage of rows that received content enhancements: {staging_rows_pct:.2f}%")
except Exception as e:
  log_operation(f"Error during creation of the Staging Zone: {str(e)}")

[2025-03-03 13:23:49.218113] Finding the most recent file in the Standardized Zone
[2025-03-03 13:23:49.218783] File found: /content/data_lake/standardized_zone/amazon_products_standardized_20250303_132349.parquet
[2025-03-03 13:23:49.229941] Dataset loaded successfully. Dimensions: (100, 10)
[2025-03-03 13:23:49.231555] Creating null tracking column
[2025-03-03 13:23:49.231650] Column mapping established: {'title': 'title', 'bullet_points': 'bullet_points', 'description': 'description'}
[2025-03-03 13:23:49.245566] Copying content between columns to fill missing values
[2025-03-03 13:23:49.263824] Detecting language of the title
[2025-03-03 13:23:49.263858] This might take a while for large datasets...
[2025-03-03 13:23:49.264214] Calculating statistics on enhancements
[2025-03-03 13:23:49.275313] Enhancement statistics:
[2025-03-03 13:23:49.275350] - rows_processed: 100
[2025-03-03 13:23:49.275366] - rows_with_null_columns: 55
[2025-03-03 13:23:49.275376] - description_filled_from_ot

## Curated Zone

In [8]:
curated_zone = f'{data_lake_root}/curated_zone'
os.makedirs(curated_zone, exist_ok=True)

# Find the most recent file in the staging zone
log_operation("Finding the most recent file in the Staging Zone")

staging_files = [f for f in os.listdir(staging_zone) if f.endswith('.parquet')]
latest_file = sorted(staging_files)[-1]
staging_file_path = f"{staging_zone}/{latest_file}"

# Load the LLM model (optimized for Colab)
# pipe = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2", device_map="auto")

# Path to Parquet dataset
parquet_path = staging_file_path
output_file = f"{curated_zone}/test.parquet"

[2025-03-03 13:23:49.303234] Finding the most recent file in the Staging Zone


In [None]:
def quick_test_extract(title, description, max_desc_length=200):
    print(f"cuda is available: {torch.cuda.is_available()}")
    print(f"tiny pipe: {tiny_pipe}")
    # Truncate description to prevent exceeding token limits
    truncated_description = description[:max_desc_length] + "..." if len(description) > max_desc_length else description

    prompt = f"""
    Extract structured features from the following product:

    Title: {title}
    Description: {truncated_description}

    Return JSON format:
    {{
      "category": "...",
      "material": "...",
      "features": {{
        "main_feature": "...",
        "additional_features": "..."
      }},
      "compatibility": "..."
    }}
    """

    # Set proper parameters to avoid warnings
    result = tiny_pipe(
        prompt,
        max_length=500,  # Reduced from 5000
        do_sample=False,
        truncation=True,  # Explicitly enable truncation
    )[0]["generated_text"]

    # Extract just the JSON part of the response
    try:
        # Try to find the JSON part (starts after the prompt)
        json_start = result.find('{\n  "category"')
        if json_start != -1:
            return result[json_start:]
        else:
            return result
    except Exception as e:
        return f"Error processing result: {e}\nRaw output: {result}"

# Initialize the pipeline with proper settings
def initialize_pipeline():
    model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # Change to a better model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

    # Set the pad token to be the eos token if not set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto"
    )

# Test with one row
def test_extraction(parquet_path):
    global tiny_pipe
    tiny_pipe = initialize_pipeline()

    test_df = pd.read_parquet(parquet_path)
    row = test_df.iloc[0]

    print("Processing item:", row["title"])
    features = quick_test_extract(row["title"], row["description"])
    print(f"Test complete! Extracted: {features}")
    return features

test_extraction(parquet_path)

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


Processing item: ArtzFolio Tulip Flowers Blackout Curtain for Door, Window Room Eyelets Tie Back Canvas Fabric Width 4.5feet 54inch Height 5 feet 60 inch ; Set of 2 PCS
cuda is available: True
tiny pipe: <transformers.pipelines.text_generation.TextGenerationPipeline object at 0x78836e924610>
