In [None]:
import cv2
import numpy as np
import pandas as pd
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.resnet50 import preprocess_input
import os
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import joblib

In [None]:
# Read the pickled, already-cleaned tabular train and test frames
train_data_cleaned = pd.read_pickle(
    'Fast_Furious_Insured/processed_data/train_data_cleaned.pkl'
)
test_data_cleaned = pd.read_pickle(
    'Fast_Furious_Insured/processed_data/test_data_cleaned.pkl'
)

# Preview the leading rows: train first, then test
print(train_data_cleaned.head(), test_data_cleaned.head(), sep='\n')

# Row/column counts: train first, then test
print(train_data_cleaned.shape, test_data_cleaned.shape, sep='\n')

# Column dtypes: train first, then test
print(train_data_cleaned.dtypes, test_data_cleaned.dtypes, sep='\n')

### Converting `Expiry_date` from an object column to a numeric value

In [None]:
# Parse 'Expiry_date' into datetimes; errors='coerce' turns unparseable entries into NaT
train_data_cleaned['Expiry_date'] = train_data_cleaned['Expiry_date'].pipe(
    pd.to_datetime, errors='coerce'
)
test_data_cleaned['Expiry_date'] = test_data_cleaned['Expiry_date'].pipe(
    pd.to_datetime, errors='coerce'
)

In [None]:
# Express each expiry date as the whole number of days elapsed since 2000-01-01
reference_date = pd.to_datetime('2000-01-01')

train_data_cleaned['Expiry_date'] = (
    train_data_cleaned['Expiry_date'].sub(reference_date).dt.days
)
test_data_cleaned['Expiry_date'] = (
    test_data_cleaned['Expiry_date'].sub(reference_date).dt.days
)

### Feature Scaling

In [None]:
# Numeric predictor columns to standardise ('Amount' is handled by its own scaler below)
feature_columns = ['Cost_of_vehicle', 'Min_coverage', 'Max_coverage', 'Expiry_date']

# Learn the scaling statistics from the training predictors, then apply them in place
feature_scaler = StandardScaler().fit(train_data_cleaned[feature_columns])
train_data_cleaned[feature_columns] = feature_scaler.transform(
    train_data_cleaned[feature_columns]
)

# 'Amount' gets a dedicated scaler, kept separate from the predictor scaler
amount_scaler = StandardScaler().fit(train_data_cleaned[['Amount']])
train_data_cleaned['Amount'] = amount_scaler.transform(train_data_cleaned[['Amount']])

# Persist both fitted scalers to the working directory for later reuse
joblib.dump(feature_scaler, 'feature_scaler.pkl')
joblib.dump(amount_scaler, 'amount_scaler.pkl')

print(train_data_cleaned)

In [None]:
# Standardise the test predictors with the scaler fitted on the training data
# ('Amount' is skipped here because it is not available for the test set)
test_data_cleaned[feature_columns] = feature_scaler.transform(
    test_data_cleaned[feature_columns]
)
print(test_data_cleaned)

### One-Hot Encoding Insurance company

In [None]:
# Initialize the encoder with dense output; categories unseen at fit time are
# encoded as all-zero rows (handle_unknown='ignore').
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed `sparse` in 1.4,
# so try the current keyword first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # scikit-learn < 1.2 only knows the old keyword
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit the encoder on the training data and transform the 'Insurance_company' column in the train dataset
train_data_encoded = encoder.fit_transform(train_data_cleaned[['Insurance_company']])

In [None]:
# Convert the encoded array back to a DataFrame.
# Reuse train_data_cleaned's index: the encoded rows are in the same order as that frame,
# and an axis=1 concat aligns on index labels. A default 0..n-1 index here would
# misalign (NaN-padded extra rows) whenever train_data_cleaned's index is not 0..n-1.
train_data_encoded = pd.DataFrame(
    train_data_encoded,
    columns=encoder.get_feature_names_out(['Insurance_company']),
    index=train_data_cleaned.index,
)

In [None]:
# Replace the raw 'Insurance_company' column with its one-hot columns.
# NOTE(review): the column-wise concat aligns rows by index label, so both frames
# are assumed to share the same index - confirm.
train_data_cleaned = pd.concat(
    objs=[
        train_data_cleaned.drop('Insurance_company', axis=1),
        train_data_encoded,
    ],
    axis='columns',
)

In [None]:
# Transform the 'Insurance_company' column in the test dataset using the same encoder
# (the encoder is only applied here, never re-fitted on the test data)
test_data_encoded = encoder.transform(test_data_cleaned[['Insurance_company']])

In [None]:
# Convert the transformed array back to a DataFrame.
# Reuse test_data_cleaned's index: the encoded rows are in the same order as that frame,
# and an axis=1 concat aligns on index labels. A default 0..n-1 index here would
# misalign (NaN-padded extra rows) whenever test_data_cleaned's index is not 0..n-1.
test_data_encoded = pd.DataFrame(
    test_data_encoded,
    columns=encoder.get_feature_names_out(['Insurance_company']),
    index=test_data_cleaned.index,
)

In [None]:
# Replace the raw 'Insurance_company' column with its one-hot columns.
# NOTE(review): the column-wise concat aligns rows by index label, so both frames
# are assumed to share the same index - confirm.
test_data_cleaned = pd.concat(
    objs=[
        test_data_cleaned.drop('Insurance_company', axis=1),
        test_data_encoded,
    ],
    axis='columns',
)

In [None]:
print(train_data_cleaned.shape)
print(test_data_cleaned.shape)

### Image Preprocessing

In [None]:
# import os

# # Folder containing all images
# image_folder = "Fast_Furious_Insured/images/train_images"

# # Get the list of images that are still relevant
# valid_images = set(train_data_cleaned["Image_path"].apply(os.path.basename))  # Extract filenames only

# # List all files in the folder
# all_images = set(os.listdir(image_folder))

# # Find extra images that need to be deleted
# extra_images = all_images - valid_images

# # Delete extra images
# for img in extra_images:
#     img_path = os.path.join(image_folder, img)
#     os.remove(img_path)  # Deletes the file
#     print(f"Deleted: {img_path}")

# print(f"✅ Removed {len(extra_images)} unnecessary images.")


In [None]:
# Directories holding the training and test images
train_images_path = "Fast_Furious_Insured/images/train_images"
test_images_path = "Fast_Furious_Insured/images/test_images"

# Count the directory entries in each folder (every entry is counted, whatever its type)
num_train_images = len(os.listdir(train_images_path))
num_test_images = len(os.listdir(test_images_path))

print("Number of training images:", num_train_images)
print("Number of test images:", num_test_images)

In [None]:
IMG_SIZE = (224, 224)  # 224x224 is the standard ImageNet input size for ResNet50

def preprocess_image(image_path):
    """Load an image from disk and prepare it as ResNet50 input.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (224, 224, 3), transformed by the ResNet50
        ``preprocess_input`` function.

    Raises
    ------
    FileNotFoundError
        If ``image_path`` does not point to an existing file.
    ValueError
        If the file exists but OpenCV cannot decode it as an image.
    """
    # Fail early with a clear message instead of an opaque OpenCV resize error
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # cv2.imread signals failure by returning None rather than raising
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not decode image: {image_path}")

    # OpenCV loads channels as BGR; ResNet50's preprocess_input expects RGB input
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Resize the image to the target size (224x224 for ResNet50)
    image_resized = cv2.resize(image_rgb, IMG_SIZE)

    # Keep the 0-255 value range: preprocess_input subtracts the ImageNet channel
    # means itself (no scaling), so dividing by 255 first would flatten the image
    # to a near-constant input.
    image_float = image_resized.astype('float32')

    return preprocess_input(image_float)


In [None]:
# Load the pre-trained ResNet50 model once (ImageNet weights, classification head removed,
# global average pooling so each image yields a single flat feature vector)
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

def extract_image_features(image_path):
    """Return the ResNet50 feature vector for a single image.

    Parameters
    ----------
    image_path : str
        Path of the image file; it is loaded and prepared by ``preprocess_image``.

    Returns
    -------
    numpy.ndarray
        1-D feature vector produced by ``base_model`` (2048 values for the
        stock ResNet50 with average pooling).
    """
    # Preprocess the image
    image_processed = preprocess_image(image_path)

    # The model works on batches, so wrap the single image in a batch of one
    image_batch = np.expand_dims(image_processed, axis=0)

    # verbose=0: this runs once per image, and the default progress bar would
    # otherwise print a line for every image in the dataset
    features = base_model.predict(image_batch, verbose=0)

    # Drop the batch dimension
    return features.flatten()


In [None]:
def combine_features(image_paths, structured_data, extractor=None):
    """Append per-image feature columns to the structured (tabular) data.

    Row ``i`` of ``structured_data`` is paired with ``image_paths[i]``, so the
    two inputs must be in the same order and of the same length.

    Parameters
    ----------
    image_paths : iterable of str
        Image path for each row of ``structured_data``, in row order.
    structured_data : pandas.DataFrame
        Tabular features; its existing index is discarded in the result.
    extractor : callable, optional
        Function mapping one image path to a 1-D feature vector. Defaults to
        ``extract_image_features``.

    Returns
    -------
    pandas.DataFrame
        ``structured_data`` (index reset to 0..n-1) followed by one column per
        image feature, labelled 0..k-1.

    Raises
    ------
    ValueError
        If the number of image paths differs from the number of rows.
    """
    if extractor is None:
        extractor = extract_image_features

    image_paths = list(image_paths)
    if len(image_paths) != len(structured_data):
        raise ValueError(
            f"Got {len(image_paths)} image paths for {len(structured_data)} rows"
        )

    # Extract one feature vector per image, in row order
    image_features = [extractor(path) for path in image_paths]

    # Use a positional 0..n-1 index on BOTH frames. The axis=1 concat below aligns
    # on index labels; giving the image features structured_data's original index
    # while resetting structured_data would misalign rows (NaN-padded extra rows)
    # whenever that index is not already 0..n-1.
    image_features_df = pd.DataFrame(image_features)
    structured_reset = structured_data.reset_index(drop=True)

    # Print to check before merging
    print("Image Features Shape:", image_features_df.shape)
    print("Structured Data Shape:", structured_data.shape)

    # Concatenate image features with structured data, row i with row i
    combined_data = pd.concat([structured_reset, image_features_df], axis=1)

    return combined_data


In [None]:
# Pull the 'Image_path' column of each frame out as a plain Python list
train_image_paths = train_data_cleaned['Image_path'].to_list()
test_image_paths = test_data_cleaned['Image_path'].to_list()

In [None]:
# Extract image features for every training image and attach them to the tabular data
train_combined = combine_features(
    image_paths=train_image_paths,
    structured_data=train_data_cleaned,
)



In [None]:
# Extract image features for every test image and attach them to the tabular data
test_combined = combine_features(
    image_paths=test_image_paths,
    structured_data=test_data_cleaned,
)

In [None]:
# (rows, columns) of the combined training frame
train_combined.shape

In [None]:
# (rows, columns) of the combined test frame
test_combined.shape

In [None]:
# # Extract image filenames from DataFrame
# train_images_used = set(train_data_cleaned["Image_path"].apply(lambda x: x.split("/")[-1]))

# # List actual image files in the folder
# import os
# image_folder = "Fast_Furious_Insured/images/train_images"
# all_image_files = set(os.listdir(image_folder))

# # Find extra images that are not in train_data_cleaned
# extra_images = all_image_files - train_images_used

# print(f"Extra images processed: {len(extra_images)}")
# print(f"Sample extra images: {list(extra_images)[:5]}")

In [None]:
# Total number of training paths vs. number of distinct paths; equal values mean no path repeats
print(len(train_image_paths), len(set(train_image_paths)))  # Author's note: both should be 1310

In [None]:
# Compare the image files on disk with the filenames referenced by the training data.
# (`os` comes from the notebook's import cell, so it is not re-imported here.)
image_folder = "Fast_Furious_Insured/images/train_images"
all_image_files = set(os.listdir(image_folder))

# Reduce each 'Image_path' to its bare filename so it can be compared with the listing
train_image_filenames = set(train_data_cleaned["Image_path"].map(os.path.basename))

# Extra: present in the folder but not referenced by the dataset.
# Missing: referenced by the dataset but absent from the folder.
extra_images = all_image_files - train_image_filenames
missing_images = train_image_filenames - all_image_files

print(f"Extra images in the folder: {len(extra_images)}")
print(f"Missing images: {len(missing_images)}")


In [None]:
# Bare filenames referenced by the training data, and the filenames present on disk
# NOTE(review): valid_image_paths is not referenced again in the visible cells.
valid_image_paths = {
    os.path.basename(image_path) for image_path in train_data_cleaned["Image_path"]
}
all_images_in_folder = set(os.listdir(image_folder))

# Retain only the rows whose image filename exists in the folder
# NOTE(review): train_data_cleaned_filtered is not used by later visible cells.
train_data_cleaned_filtered = train_data_cleaned.loc[
    train_data_cleaned["Image_path"].map(os.path.basename).isin(all_images_in_folder)
]

print("Filtered train data shape:", train_data_cleaned_filtered.shape)


In [None]:
# Compare the count of distinct image paths before and after feature extraction
print("Unique image paths in train_data_cleaned:", train_data_cleaned['Image_path'].nunique())
print("Unique image paths in processed train_combined:", train_combined['Image_path'].nunique())


In [None]:
# Number of rows in test_combined whose 'Image_path' is missing
print(test_combined['Image_path'].isna().sum())


In [None]:
# Show every row whose 'Image_path' appears more than once
# (keep=False flags all occurrences, not just the repeats after the first)
duplicates = train_combined.loc[
    train_combined.duplicated(subset='Image_path', keep=False)
]
print(duplicates)


In [None]:
# Index labels of all rows that share their 'Image_path' with another row
duplicate_indexes = train_combined.loc[
    train_combined.duplicated(subset='Image_path', keep=False)
].index
print("Duplicate indexes:", duplicate_indexes)

In [None]:
# Put the duplicate count next to the input and output sizes
print("Length of duplicate_indexes:", len(duplicate_indexes))
print("Length of train_image_paths:", len(train_image_paths))
print("Shape of train_combined:", train_combined.shape)

In [None]:
# Remove rows whose 'Image_path' occurs more than once in train_combined.
# NOTE(review): keep=False drops *every* row that shares a duplicated path (no
# representative row is kept), and rows with a missing 'Image_path' count as
# duplicates of one another - confirm this is intended rather than keep='first'.
train_combined = train_combined.drop_duplicates(subset='Image_path',keep=False)

# Verify the new shape
print(f"New shape of train_combined: {train_combined.shape}")


In [None]:
# Write the combined training frame to CSV in the working directory, without the index column
train_combined.to_csv('train_combined.csv', index=False)

In [None]:
# (rows, columns) of the combined test frame, shown again before saving
test_combined.shape

In [None]:
# Persist the final combined train and test frames as pickle files
train_combined.to_pickle(
    path='Fast_Furious_Insured/processed_data/final_train_data.pkl'
)
test_combined.to_pickle(
    path='Fast_Furious_Insured/processed_data/final_test_data.pkl'
)

print("Pickle files saved successfully!")

In [None]:
# Summary of the final training frame (index, columns, dtypes, memory usage)
train_combined.info()