In [None]:
import numpy as np
import pandas as pd

import os
# Walk the Kaggle input mount and echo the full path of every file found,
# so the available datasets are visible in the cell output.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
pip install paddlepaddle-gpu paddleocr

In [None]:
!pip install pandas requests pillow

In [None]:
# Build the OCR engine used by the notebook: English model, angle classifier
# enabled, running on the GPU.
from paddleocr import PaddleOCR

ocr = PaddleOCR(
    lang='en',
    use_angle_cls=True,
    use_gpu=True,
)

In [None]:
import concurrent.futures
import os
import threading

import pandas as pd
import requests
from paddleocr import PaddleOCR
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Initialize PaddleOCR with GPU support
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)

# One shared HTTP session for all downloads. Requests that end in one of the listed
# status codes are retried (at most 5 retries in total) with a backoff factor of 0.5.
session = requests.Session()
retry = Retry(total=5, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
# Mount the adapter for both schemes; with only 'https://' mounted, any plain
# 'http://' image link would silently fall back to the default no-retry adapter.
session.mount('https://', adapter)
session.mount('http://', adapter)

def download_image(url, save_path):
    """Fetch ``url`` through the shared session and store the body at ``save_path``.

    Args:
        url: Address of the image to download.
        save_path: Destination file path; opened in binary write mode.

    Returns:
        ``save_path`` when the download and write succeeded, otherwise ``None``
        (the failure is printed, never raised).
    """
    try:
        reply = session.get(url, timeout=10)
        # Turn 4xx/5xx responses into exceptions so they take the failure path.
        reply.raise_for_status()
        with open(save_path, 'wb') as sink:
            sink.write(reply.content)
    except Exception as err:
        print(f"Failed to download {url}: {err}")
        return None
    else:
        return save_path


# The single `ocr` engine is shared by every worker thread of the batch driver.
# NOTE(review): PaddleOCR / Paddle predictors are not documented as safe for concurrent
# calls on one instance, so inference is serialized here; downloads still run in parallel.
_OCR_LOCK = threading.Lock()


def extract_text_from_image(image_path):
    """Run OCR on one image file and return its text as a single string.

    Args:
        image_path: Path of the image passed to ``ocr.ocr``.

    Returns:
        The recognized text fragments joined with single spaces. An empty string is
        returned when nothing was recognized or when OCR failed (failures are printed,
        never raised).
    """
    try:
        with _OCR_LOCK:
            result = ocr.ocr(image_path, cls=True)

        # An image without any detected text comes back as an empty/None page
        # (presumably ``[None]`` on PaddleOCR 2.x — confirm for the installed version).
        # Treat that as "no text" instead of letting it surface as a bogus failure message.
        lines = result[0] if result else None
        if not lines:
            return ""

        # Each entry is indexed as line[1][0] for its text, exactly as before.
        return ' '.join(line[1][0] for line in lines)
    except Exception as e:
        print(f"Failed to extract text from {image_path}: {e}")
        return ""

def process_row(index, row):
    """Download, OCR, and clean up the image referenced by one dataframe row.

    Args:
        index: Row label; used to build the temporary file name and returned unchanged.
        row: Mapping-like row that provides the ``'image_link'`` URL.

    Returns:
        ``(index, extracted_text)``. The text is ``""`` when the download failed.
    """
    image_url = row['image_link']
    image_name = f"images/image_{index}.jpg"

    # Download the image
    downloaded_image_path = download_image(image_url, image_name)
    if not downloaded_image_path:
        return (index, "")

    try:
        # Extract text using OCR
        extracted_text = extract_text_from_image(downloaded_image_path)
    finally:
        # Always delete the temporary image, even if OCR raised, so files do not
        # pile up on disk. A failed delete is reported but must not discard the
        # OCR result or abort the surrounding batch.
        try:
            os.remove(downloaded_image_path)
        except OSError as e:
            print(f"Failed to delete {downloaded_image_path}: {e}")

    return (index, extracted_text)

# ---------------------------------------------------------------------------
# Batch driver: OCR every image referenced from `start_index` onwards and write
# the extracted text back to CSV, with periodic checkpoints.
# ---------------------------------------------------------------------------
csv_file = '/kaggle/input/amazon-test/test.csv'

df = pd.read_csv(csv_file)

# Positional offset of the first row handled by this run; earlier rows are skipped
# (presumably processed by a previous run — TODO confirm).
start_index = 30001

# Work on a copy so the frame read from disk stays untouched.
df_subset = df.iloc[start_index:].copy()

df_subset['extracted_text'] = ''

os.makedirs('images', exist_ok=True)

MAX_THREADS = 8
SAVE_EVERY = 10000  # write a checkpoint CSV after this many completed rows

processed_rows = 0

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    # Submit tasks for each row in the subset dataframe
    futures = {executor.submit(process_row, index, row): index for index, row in df_subset.iterrows()}

    # As futures complete, update the dataframe with extracted text
    for future in concurrent.futures.as_completed(futures):
        # Take the row label from the submission map so it is known even if the task raised.
        index = futures[future]
        try:
            _, extracted_text = future.result()
        except Exception as e:
            # A single bad row must not abort the batch and lose all unsaved progress.
            print(f"Failed to process row {index}: {e}")
            extracted_text = ""
        df_subset.at[index, 'extracted_text'] = extracted_text
        processed_rows += 1

        # Save the DataFrame to a CSV file every SAVE_EVERY rows
        if processed_rows % SAVE_EVERY == 0:
            updated_file_name = f'updated_file_{start_index + processed_rows}.csv'
            df_subset.to_csv(updated_file_name, index=False)
            print(f"Saved progress after {processed_rows} rows")


final_file_name = 'updated_file_Final.csv'
df_subset.to_csv(final_file_name, index=False)
print(f"Text extraction, image deletion, and CSV update complete! Final file saved as {final_file_name}")
