In [None]:
# Install PaddlePaddle and PaddleOCR for the OCR stage of this notebook.
# NOTE(review): the first line installs the `paddlepaddle` build and the second the
# `paddlepaddle-gpu` build; presumably only one of them is needed -- confirm which
# build the runtime requires.
# NOTE(review): none of these packages is version-pinned, so reruns may resolve to
# different releases.
!pip install paddlepaddle paddleocr
!python3 -m pip install paddlepaddle-gpu

In [None]:
import paddle
# run_check() prints its own installation report and is called only for that
# side effect; wrapping it in print() would just add a stray "None" line.
paddle.utils.run_check()

In [None]:
# Install torch_xla (pinned to 2.5.0), which the TPU setup cell imports.
!pip install torch_xla==2.5.0


In [None]:
import torch
import torch_xla
import torch_xla.core.xla_model as xm

# Ask torch_xla for an XLA device handle and report which one was selected.
# NOTE(review): `device` is reassigned to a CUDA/CPU device by later cells and is not
# passed to any model in the visible notebook -- confirm this TPU setup is still needed.
device = xm.xla_device()  # XLA device handle (presumably a TPU core on a TPU runtime -- TODO confirm)
print(f"Using device: {device}")


In [None]:
# Importing necessary libraries
import pandas as pd
import requests
import cv2
import numpy as np
from io import BytesIO
from paddleocr import PaddleOCR

# Initialize PaddleOCR once; the same `ocr` engine is reused by extract_text_from_image.
# NOTE(review): use_gpu=True presumably requires a CUDA-enabled paddle build -- confirm
# against the installed runtime.
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)

# Load the training CSV; its `image_link` column is read by the OCR loop in this cell.
file_path = '/kaggle/input/amazon-train/train.csv'
df = pd.read_csv(file_path)

# Function to download image and perform OCR using OpenCV
def extract_text_from_image(url):
    """Download the image at ``url`` and return the text PaddleOCR finds in it.

    Args:
        url: Address of the image to fetch.

    Returns:
        str: The recognized text fragments joined with single spaces, or an
            empty string when OCR ran successfully but detected no text.
        None: When the download fails, the payload cannot be decoded as an
            image, or any other error occurs. Failures are printed rather
            than raised so one bad image does not abort a long batch run.
    """
    try:
        # Download the image
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions

        # Convert image content to NumPy array for OpenCV
        img_array = np.asarray(bytearray(response.content), dtype=np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)  # Load the image using OpenCV

        # imdecode returns None when the bytes are not a decodable image
        if img is None:
            print(f"Error loading image: {url}")
            return None

        # Use PaddleOCR to extract text
        result = ocr.ocr(img)

        # The first entry holds the detections for this image. It is None (or
        # empty) when no text was found; report that as "no text" instead of
        # letting the iteration below raise and be logged as an unexpected error.
        detections = result[0] if result else None
        if not detections:
            return ""

        # Each detection is indexed as line[1][0] to reach its recognized text
        return " ".join(line[1][0] for line in detections)

    except requests.exceptions.RequestException as req_err:
        print(f"Request error processing {url}: {req_err}")
        return None

    except Exception as e:
        # Deliberate best-effort: log and continue with the next image
        print(f"Unexpected error processing {url}: {e}")
        return None

# Run OCR over a window of DataFrame rows in batches, checkpointing to CSV after each batch
batch_size = 1000  # Rows per batch; progress is saved after every batch
max_images = 50000  # Upper bound on the number of rows processed in this run
start_index = 100001  # First row position (inclusive) to process
end_index = 105001  # Row position (exclusive) at which to stop
processed_images = 0  # Track the number of processed images

# Clamp the window so it never exceeds max_images rows or runs past the end of df
stop_index = min(end_index, start_index + max_images, len(df))
output_path = 'test_temp.csv'
is_first_batch = True

for i in range(start_index, stop_index, batch_size):
    # .copy() yields an independent frame, so adding a column below does not
    # trigger pandas' SettingWithCopyWarning; min() keeps the final batch
    # inside the window when its size is not a multiple of batch_size.
    batch_df = df.iloc[i:min(i + batch_size, stop_index)].copy()
    batch_df['extracted_text'] = batch_df['image_link'].apply(extract_text_from_image)

    # The first batch creates the file with a header; later batches append rows only
    batch_df.to_csv(
        output_path,
        mode='w' if is_first_batch else 'a',
        index=False,
        header=is_first_batch,
    )
    is_first_batch = False

    processed_images += len(batch_df)  # Update the number of processed images
    print(f"Processed {processed_images} entries")

print("Processing complete!")


In [None]:
# NOTE(review): stand-alone sanity-check print; nothing else in the visible notebook
# depends on it, so this cell looks safe to delete.
print("hello world")

In [None]:
import pandas as pd

# Load the CSV file with the extracted text
csv_file = '/kaggle/working/test_temp.csv'
df = pd.read_csv(csv_file)

# Keep only rows whose 'extracted_text' contains visible characters.
# The column is normalized to str before stripping because read_csv can infer a
# non-string dtype (e.g. float64 when every value is missing or numeric), and the
# .str accessor raises AttributeError on such columns.
text_stripped = df['extracted_text'].fillna('').astype(str).str.strip()
df_cleaned = df[text_stripped != '']

# Save the cleaned DataFrame to a new CSV file
df_cleaned.to_csv('cleaned_train_with_text.csv', index=False)

print(f"Rows with non-empty 'extracted_text' field saved to 'cleaned_train_with_text.csv'.")

In [None]:
# Install the libraries imported by the embedding cell that follows.
# NOTE(review): versions are not pinned, so reruns may resolve to different releases.
!pip install transformers torch pandas


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

# Load the cleaned OCR output.
# NOTE(review): presumably this is the file the cleaning cell saves via a relative
# path -- confirm the working directory is /kaggle/working.
csv_file = '/kaggle/working/cleaned_train_with_text.csv'
df = pd.read_csv(csv_file)

# Load the pretrained tokenizer and encoder identified by model_name.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Report which device was selected.
print(device)
def get_bert_embedding(text):
    """Encode ``text`` with the module-level BERT model and mean-pool the result.

    The text is tokenized (truncated to at most 512 tokens), run through
    ``model`` on ``device`` without gradient tracking, and the final hidden
    states are averaged over dimension 1.

    Returns:
        numpy.ndarray: The pooled embedding, copied back to the CPU.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )

    # Move every tokenizer output tensor onto the same device as the model
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(device)

    with torch.no_grad():
        hidden_states = model(**model_inputs).last_hidden_state

    # Mean over dim=1 -- this is an average of the hidden states, not the [CLS] vector
    mean_pooled = hidden_states.mean(dim=1)
    return mean_pooled.cpu().numpy()

# Embed each row's OCR text; cells that are not strings get None instead.
df['bert_embedding'] = df['extracted_text'].apply(lambda x: get_bert_embedding(x) if isinstance(x, str) else None)

# NOTE(review): each embedding is a NumPy array, and to_csv presumably stores it as
# its printed text form rather than as numeric values -- confirm the file can be
# parsed back before relying on it (a binary format may be safer).
df.to_csv('bert_embeddings_train_with_text.csv', index=False)

print(f"BERT embeddings added and saved to 'bert_embeddings_train_with_text.csv'.")


In [None]:
import pandas as pd

# Input produced by the embedding cell, and the destination for the augmented copy
csv_file = '/kaggle/working/bert_embeddings_train_with_text.csv'
output_file = '/kaggle/working/train_with_image_name.csv'  # Written to the working directory

df = pd.read_csv(csv_file)

# The image name is whatever follows the final '/' of the link; non-string cells map to None
df['image_name'] = df['image_link'].apply(
    lambda link: link.rsplit('/', 1)[-1] if isinstance(link, str) else None
)

# Persist the frame, now including the image_name column
df.to_csv(output_file, index=False)

print(f"Image names added and saved to '{output_file}'.")

# After running the code, you can download the CSV file from the 'Output' section of the Kaggle notebook interface.


In [None]:
# Install transformers if not already installed
# NOTE(review): an earlier cell already installs transformers, so this looks redundant;
# the version is also unpinned.
!pip install transformers
import torch
from transformers import BertTokenizer, BertForSequenceClassification


In [None]:
from transformers import BartTokenizer, BartForSequenceClassification

# Specify the path where the model is saved
model_path = '/kaggle/input/bart-base/pytorch/default/1/junu'

# Load the BART tokenizer from that path
tokenizer = BartTokenizer.from_pretrained(model_path)

# Load the BART model for sequence classification (or adjust for your task)
# NOTE(review): `tokenizer` and `model` are reassigned in a later cell to
# 'facebook/bart-base' with BartForConditionalGeneration, so this classification model
# is not the one trained or evaluated further down -- confirm this cell is still needed.
model = BartForSequenceClassification.from_pretrained(model_path)
model.eval()  # Set the model to evaluation mode


In [None]:
def preprocess(text, tokenizer, max_length=128):
    """Tokenize ``text`` with ``tokenizer`` using fixed-length settings.

    Args:
        text: The input handed to the tokenizer unchanged.
        tokenizer: Callable tokenizer; it is invoked with the padding,
            truncation, length and tensor-format options set below.
        max_length: Length used for both padding and truncation (default 128).

    Returns:
        Whatever ``tokenizer`` returns for these options.
    """
    encode_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(text, **encode_options)


In [None]:
import pandas as pd

# Load the dataset used for BART fine-tuning; later cells read its
# 'combined_text' and 'entity_value' columns.
csv_file = '/kaggle/input/com-with-name/combined_train_with_image_name.csv'
df = pd.read_csv(csv_file)



In [None]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
from sklearn.model_selection import train_test_split

# Initialize the tokenizer and model from the 'facebook/bart-base' checkpoint
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

# Model inputs come from 'combined_text'; generation targets come from 'entity_value'
X = df['combined_text']
y = df['entity_value']

# Hold out 25% of the rows for testing; the fixed random_state keeps the shuffled split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=104, test_size=0.25, shuffle=True)

def tokenize_function(text_list):
    """Tokenize ``text_list`` with the module-level ``tokenizer``.

    Padding and truncation are enabled with a 512-token cap, and the result is
    requested in PyTorch tensor format.
    """
    tokenizer_options = dict(
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    return tokenizer(text_list, **tokenizer_options)

# Tokenize each training example on its own and squeeze out dimension 0,
# giving one 'input_ids' tensor per source text and per target text.
input_tokens = [
    tokenize_function(source_text)['input_ids'].squeeze(0)
    for source_text in X_train
]
target_tokens = [
    tokenize_function(target_text)['input_ids'].squeeze(0)
    for target_text in y_train
]

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# Set the model to train mode
model.train()

# Send model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)  # Adjust learning rate if necessary

# Move the tokenized examples to the training device
input_tokens = [x.to(device) for x in input_tokens]
target_tokens = [y.to(device) for y in target_tokens]

# Define a basic training loop
epochs = 20  # Adjust based on your needs
batch_size = 8  # Adjust batch size for performance

# Label positions holding this value are skipped by the model's cross-entropy
# loss, so padded target positions do not contribute to training.
LABEL_IGNORE_INDEX = -100

for epoch in range(epochs):
    total_loss = 0.0
    num_batches = 0
    print(f'Epoch {epoch + 1}/{epochs}')

    # Create a tqdm progress bar for the batch loop
    for i in tqdm(range(0, len(input_tokens), batch_size), desc="Training Progress", unit="batch"):
        # Pad the inputs to a common length and mark the real (non-pad) tokens
        # so the encoder does not attend to padding.
        inputs_batch = pad_sequence(input_tokens[i:i + batch_size], batch_first=True, padding_value=tokenizer.pad_token_id)
        attention_mask = (inputs_batch != tokenizer.pad_token_id).long()

        # Pad the targets with the ignore index rather than the pad token id;
        # otherwise the loss would also be computed on padding positions.
        targets_batch = pad_sequence(target_tokens[i:i + batch_size], batch_first=True, padding_value=LABEL_IGNORE_INDEX)

        optimizer.zero_grad()

        # Forward pass: passing labels makes the model compute the loss itself
        outputs = model(input_ids=inputs_batch, attention_mask=attention_mask, labels=targets_batch)

        loss = outputs.loss
        total_loss += loss.item()
        num_batches += 1

        # Backward pass: compute gradients and update weights
        loss.backward()
        optimizer.step()

    # Each accumulated loss is one batch's loss, so the average is taken over
    # batches (not samples); max() guards against an empty training set.
    average_loss = total_loss / max(num_batches, 1)
    print(f'Epoch {epoch + 1} completed. Average Loss: {average_loss}')

print("Training complete")

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Set the model to evaluation mode
model.eval()

# Prepare test inputs: one 'input_ids' tensor per test text, already on the device
input_tokens_test = [tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)['input_ids'].squeeze(0).to(device) for text in X_test]

# Initialize lists to store predicted and actual values
predicted_values = []
actual_values = list(y_test)  # Actual alphanumeric values for comparison
bleu_scores = []  # Per-sample BLEU, kept so the example printout can reuse it

# Batch size for evaluation
batch_size = 8

# BLEU smoothing function
smooth_fn = SmoothingFunction().method4  # Handles cases where there is no overlap in n-grams

# Initialize variables for BLEU and accuracy
total_bleu_score = 0
correct_predictions = 0

# Loop through test data in batches
for i in tqdm(range(0, len(input_tokens_test), batch_size), desc="Evaluating"):
    inputs_batch = pad_sequence(input_tokens_test[i:i + batch_size], batch_first=True, padding_value=tokenizer.pad_token_id)
    # Mark the real tokens so generation does not attend to the padding that
    # pad_sequence added to the shorter sequences in the batch.
    attention_mask = (inputs_batch != tokenizer.pad_token_id).long()

    # Generate predictions
    with torch.no_grad():
        generated_ids = model.generate(inputs_batch, attention_mask=attention_mask, max_length=20)

    # Decode predictions to text
    for idx, generated_id in enumerate(generated_ids):
        generated_text = tokenizer.decode(generated_id, skip_special_tokens=True)
        predicted_values.append(generated_text)

        # Calculate BLEU score for this prediction (reference is the actual value from y_test)
        actual_text = actual_values[i + idx]
        reference = [actual_text.split()]  # BLEU expects a list of reference sentences (each as a list of words)
        candidate = generated_text.split()  # Candidate translation

        bleu_score = sentence_bleu(reference, candidate, smoothing_function=smooth_fn)
        bleu_scores.append(bleu_score)
        total_bleu_score += bleu_score

        # Simple exact match comparison
        if generated_text == actual_text:
            correct_predictions += 1

# Compute average BLEU score and accuracy; an empty test set yields 0.0 instead
# of a ZeroDivisionError.
num_predictions = len(predicted_values)
average_bleu_score = total_bleu_score / num_predictions if num_predictions else 0.0
accuracy = correct_predictions / num_predictions if num_predictions else 0.0

# Print up to ten examples with their BLEU scores (fewer if the test set is smaller)
for i in range(min(10, num_predictions)):
    print(f"Predicted: {predicted_values[i]}, Actual: {actual_values[i]}")
    print(f"BLEU score for sample {i}: {bleu_scores[i]:.4f}")

# Print final evaluation metrics
print(f"\nExact Match Accuracy: {accuracy * 100:.2f}%")
print(f"Average BLEU Score: {average_bleu_score:.4f}")
