In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import requests
from PIL import Image, ImageFile, UnidentifiedImageError
from io import BytesIO
import torch
from torchvision import models, transforms
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import random  # For random selection of units

# Allow loading of truncated images.
# This sets a module-level flag on Pillow's ImageFile, so it applies to every
# image opened anywhere in this process, not just the ones downloaded below.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Candidate unit names; one of these is attached to each formatted prediction.
UNITS = [
    "kilogram",
    "pound",
    "watt",
    "kilovolt",
    "inch",
    "meter",
    "yard",
    "foot",
    "ton",
    "centimeter",
]

# Step 1: Download Image with a Counter for 1000 images
def download_image(image_url, counter, max_images=1000):
    """Download one image and return it as a fully decoded PIL image.

    Args:
        image_url: URL of the image to fetch.
        counter: Single-element list used as a mutable attempt counter shared
            with the caller. ``counter[0]`` is incremented once per download
            attempt, whether or not the attempt succeeds.
        max_images: Maximum number of attempts. Once ``counter[0]`` has
            reached this value no request is made.

    Returns:
        The decoded image, or ``None`` when the limit has been reached, the
        request failed, the response status was not 200, the body was empty,
        or the payload could not be decoded as an image.
    """
    if counter[0] >= max_images:
        return None  # Stop downloading if the limit is reached

    # Count the attempt before issuing the request so that timeouts and
    # connection errors consume budget too; otherwise dead URLs would never
    # count toward the limit.
    counter[0] += 1

    try:
        response = requests.get(image_url, timeout=10)  # Timeout avoids hanging requests

        if response.status_code != 200 or not response.content:
            print(f"Failed to download image from {image_url}, status code: {response.status_code}, content size: {len(response.content)}")
            return None

        img = Image.open(BytesIO(response.content))
        # Image.open only reads the header; force the pixel data to be decoded
        # here so that corrupt payloads are caught by the handler below
        # instead of surfacing later during preprocessing.
        img.load()
        return img
    except (requests.exceptions.RequestException, UnidentifiedImageError, OSError) as e:
        print(f"Error downloading or opening image from {image_url}: {e}")
        return None

# Step 2: Preprocess the Image (convert any non-RGB mode to RGB first)
def preprocess_image(image, image_size=(224, 224)):
    """Convert a PIL image into a normalized tensor with a leading batch axis.

    The image is converted to RGB unless its mode already is RGB, resized to
    ``image_size``, turned into a tensor, normalized per channel with the
    mean/std constants below, and finally given an extra leading dimension
    via ``unsqueeze(0)``.
    """
    rgb_image = image if image.mode == 'RGB' else image.convert('RGB')

    steps = [
        transforms.Resize(image_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    pipeline = transforms.Compose(steps)

    tensor = pipeline(rgb_image)
    return tensor.unsqueeze(0)

# Step 3: Extract Image Features
def extract_image_features(img_tensor, model):
    """
    Run ``model`` on ``img_tensor`` with gradient tracking disabled and return
    its output as a NumPy array with all size-1 dimensions removed.
    """
    with torch.no_grad():
        output = model(img_tensor)

    squeezed = torch.squeeze(output)  # Drop every dimension of size 1
    return squeezed.numpy()

# Step 4: Prepare the Data for Training with a 1000 Image Limit
def prepare_training_data(train_df, model, image_size=(224, 224), max_images=1000):
    """Build the feature matrix and target vector from the training rows.

    For each row, the leading whitespace-separated token of ``entity_value``
    is parsed as a float target, the image at ``image_link`` is downloaded,
    preprocessed and passed through ``model`` to obtain its feature vector.

    Args:
        train_df: DataFrame with ``entity_value`` and ``image_link`` columns.
        model: Feature extractor handed to ``extract_image_features``.
        image_size: Target size handed to ``preprocess_image``.
        max_images: Download-attempt budget; iteration stops once it is used up.

    Returns:
        ``(X, y)`` as NumPy arrays: one feature vector and one numeric target
        per successfully processed row. Both are empty if no row was usable.
    """
    X, y = [], []
    counter = [0]  # A mutable counter to track the number of images processed

    for idx, row in train_df.iterrows():
        if counter[0] >= max_images:
            break  # Stop processing once the download budget is exhausted

        raw_value = row['entity_value']
        # Missing values (None/NaN) are never str instances, so this single
        # check also filters them out.
        if not isinstance(raw_value, str):
            continue

        # Parse the numeric part before spending a download on the row.
        try:
            entity_value = float(raw_value.split()[0])
        except (ValueError, IndexError) as e:
            print(f"Skipping row {idx} due to error: {e}")
            continue

        # float() accepts "nan" and "inf"; such targets are not usable for
        # regression, so drop them here rather than failing at fit time.
        if not np.isfinite(entity_value):
            print(f"Skipping row {idx} due to error: non-finite entity_value {raw_value!r}")
            continue

        img = download_image(row['image_link'], counter, max_images)
        if img is None:
            continue  # Skip if image could not be downloaded or opened

        # Image decoding can still fail at this point (reported as OSError),
        # so one bad image must skip the row instead of aborting the run.
        try:
            img_tensor = preprocess_image(img, image_size)
            features = extract_image_features(img_tensor, model)
        except (OSError, ValueError, IndexError) as e:
            print(f"Skipping row {idx} due to error: {e}")
            continue

        X.append(features)
        y.append(entity_value)

    return np.array(X), np.array(y)

# Step 5: Train the Model
def train_model(X_train, y_train):
    """Fit a 100-tree random forest regressor (random_state=42) and return it."""
    forest = RandomForestRegressor(random_state=42, n_estimators=100)
    return forest.fit(X_train, y_train)  # fit() returns the fitted estimator itself

# Step 6: Make Predictions on Test Data with 1000 Image Limit
def make_predictions(test_df, model, feature_extractor, image_size=(224, 224), max_images=1000):
    """Produce one formatted prediction string per row of ``test_df``.

    Args:
        test_df: DataFrame with an ``image_link`` column.
        model: Fitted regressor exposing ``predict``.
        feature_extractor: Model handed to ``extract_image_features``.
        image_size: Target size handed to ``preprocess_image``.
        max_images: Download-attempt budget; rows after it is used up get ``''``.

    Returns:
        A list with exactly ``len(test_df)`` entries, in row order. Each entry
        is ``"<value with 2 decimals> <unit>"``, or ``''`` when the budget was
        exhausted or the image could not be downloaded or decoded.
    """
    predictions = []
    counter = [0]  # A mutable counter to track the number of images processed

    # Every iteration appends exactly one entry, so the result always lines up
    # with test_df and needs no padding afterwards.
    for idx, row in test_df.iterrows():
        if counter[0] >= max_images:
            predictions.append('')  # Budget exhausted: leave the prediction empty
            continue

        img = download_image(row['image_link'], counter, max_images)
        if img is None:
            predictions.append('')  # Append empty value for failed downloads
            continue

        # A corrupt image must cost only its own row, not the whole run.
        try:
            img_tensor = preprocess_image(img, image_size)
            features = extract_image_features(img_tensor, feature_extractor)
        except (OSError, ValueError) as e:
            print(f"Skipping row {idx} due to error: {e}")
            predictions.append('')
            continue

        prediction = model.predict([features])[0]

        # NOTE(review): the unit is drawn at random rather than predicted, so
        # the output is non-deterministic unless the caller seeds `random`.
        unit = random.choice(UNITS)
        predictions.append(f"{prediction:.2f} {unit}")

    return predictions

# Step 7: Save Output in the Required Format
def save_predictions(test_df, predictions, output_file='test_out.csv'):
    """Write the ``index`` column of ``test_df`` next to the predictions as CSV.

    Every prediction other than the empty string is converted with ``str`` and
    stripped of surrounding whitespace; empty strings are written unchanged.
    The file gets two columns, ``index`` and ``prediction``, without the
    DataFrame's own row index.
    """
    cleaned = []
    for value in predictions:
        if value != '':
            cleaned.append(str(value).strip())
        else:
            cleaned.append('')

    # pandas requires both columns to have the same length.
    columns = {'index': test_df['index'], 'prediction': cleaned}
    output_df = pd.DataFrame(columns)

    output_df.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

# Main Pipeline
if __name__ == '__main__':
    # Load training and test data
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')

    # Load a pre-trained model (ResNet50 for feature extraction)
    # NOTE(review): newer torchvision releases deprecate `pretrained=True` in
    # favour of a `weights=` argument; confirm against the installed version.
    feature_extractor = models.resnet50(pretrained=True)
    feature_extractor = torch.nn.Sequential(*(list(feature_extractor.children())[:-1]))  # Remove final classification layer

    # Switch to evaluation mode. Modules start in training mode, where
    # BatchNorm normalizes with per-batch statistics and updates its running
    # averages on every forward pass, so the same image would yield different
    # features over time.
    feature_extractor.eval()

    # Prepare training data (limit to 1000 images)
    X, y = prepare_training_data(train_df, feature_extractor)

    # Fail with a clear message instead of an opaque split error when
    # (almost) nothing could be downloaded or parsed.
    if len(X) < 2:
        raise ValueError(
            f"Only {len(X)} usable training sample(s) were collected; "
            "at least 2 are required for the train/validation split."
        )

    # Train-test split for validation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a RandomForest model
    model = train_model(X_train, y_train)

    # Validate the model (optional)
    y_pred = model.predict(X_val)
    print(f"Validation MSE: {mean_squared_error(y_val, y_pred)}")

    # Make predictions on test data (limit to 1000 images)
    predictions = make_predictions(test_df, model, feature_extractor)

    # Save the predictions
    save_predictions(test_df, predictions)
