In [1]:
import cv2
import numpy as np
import pandas as pd
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.resnet50 import preprocess_input
import os

In [2]:
# Load the cleaned structured data
train_data_cleaned = pd.read_pickle('Fast_Furious_Insured/processed_data/train_data_cleaned.pkl')
test_data_cleaned = pd.read_pickle('Fast_Furious_Insured/processed_data/test_data_cleaned.pkl')

# Display the first few rows to confirm
print(train_data_cleaned.head())
print(test_data_cleaned.head())

print(train_data_cleaned.shape)
print(test_data_cleaned.shape)


                                          Image_path Insurance_company  \
0  Fast_Furious_Insured/images/train_images/img_4...                BQ   
1  Fast_Furious_Insured/images/train_images/img_7...                BQ   
2  Fast_Furious_Insured/images/train_images/img_4...                 A   
3  Fast_Furious_Insured/images/train_images/img_7...                 A   
4  Fast_Furious_Insured/images/train_images/img_7...                AC   

   Cost_of_vehicle  Min_coverage Expiry_date  Max_coverage  Condition  Amount  
0          41500.0        1037.5  03-12-2026      36142.68          0     0.0  
1          50700.0        1267.5  10-07-2025      12753.00          1  6194.0  
2          49500.0        1237.5  11-08-2022      43102.68          0     0.0  
3          33500.0         837.5  02-08-2022       8453.00          1  7699.0  
4          27600.0         690.0  01-05-2026       6978.00          1  8849.0  
                                          Image_path Insurance_company  \
0

In [3]:
# import os

# # Folder containing all images
# image_folder = "Fast_Furious_Insured/images/train_images"

# # Get the list of images that are still relevant
# valid_images = set(train_data_cleaned["Image_path"].apply(os.path.basename))  # Extract filenames only

# # List all files in the folder
# all_images = set(os.listdir(image_folder))

# # Find extra images that need to be deleted
# extra_images = all_images - valid_images

# # Delete extra images
# for img in extra_images:
#     img_path = os.path.join(image_folder, img)
#     os.remove(img_path)  # Deletes the file
#     print(f"Deleted: {img_path}")

# print(f"✅ Removed {len(extra_images)} unnecessary images.")


In [4]:
#No. of Images
train_images_path = "Fast_Furious_Insured/images/train_images"
test_images_path = "Fast_Furious_Insured/images/test_images"

num_train_images = len(os.listdir(train_images_path))
num_test_images = len(os.listdir(test_images_path))

print(f"Number of training images: {num_train_images}")
print(f"Number of test images: {num_test_images}")

Number of training images: 1310
Number of test images: 600


In [5]:
IMG_SIZE = (224, 224)  # ResNet50 requires images to be 224x224

def preprocess_image(image_path):
    # Load the image using OpenCV
    image = cv2.imread(image_path)
    
    # Resize the image to the target size (224x224 for ResNet50)
    image_resized = cv2.resize(image, IMG_SIZE)
    
    # Convert image to float32 for normalization
    image_normalized = image_resized.astype('float32') / 255.0
    
    # Preprocess the image using ResNet50 preprocessing (mean subtraction, etc.)
    image_processed = preprocess_input(image_normalized)
    
    return image_processed


In [6]:
# Load the pre-trained ResNet50 model (exclude top layers for feature extraction)
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

def extract_image_features(image_path):
    # Preprocess the image
    image_processed = preprocess_image(image_path)
    
    # Expand dimensions for batch processing (ResNet50 expects a batch)
    image_batch = np.expand_dims(image_processed, axis=0)
    
    # Extract features using the ResNet50 model
    features = base_model.predict(image_batch)
    
    # Flatten the feature vector
    return features.flatten()


In [7]:
def combine_features(image_paths, structured_data):
    image_features = []
    
    # Iterate over all image paths and extract features
    for path in image_paths:
        features = extract_image_features(path)
        image_features.append(features)
    
    # Convert image features into a DataFrame
    image_features_df = pd.DataFrame(image_features, index=structured_data.index)

     # Print to check before merging
    print("Image Features Shape:", image_features_df.shape)
    print("Structured Data Shape:", structured_data.shape)
    
    # Concatenate image features with structured data (merge on index)
    combined_data = pd.concat([structured_data.reset_index(drop=True), image_features_df], axis=1)
    
    return combined_data


In [8]:
# Get the image paths from the 'Image_Path' column
train_image_paths = train_data_cleaned['Image_path'].tolist()
test_image_paths = test_data_cleaned['Image_path'].tolist()

In [38]:
# Combine image features with the structured data
train_combined = combine_features(train_image_paths, train_data_cleaned)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 713ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 209ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 242ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 247ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 208ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 239ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [10]:
test_combined = combine_features(test_image_paths, test_data_cleaned)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 272ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 264ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 290ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 271ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 265ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 263ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 277ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 260ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 260ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 252ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 268ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 259ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [39]:
train_combined.shape

(1396, 2056)

In [40]:
test_combined.shape

(600, 2054)

In [41]:
# # Extract image filenames from DataFrame
# train_images_used = set(train_data_cleaned["Image_path"].apply(lambda x: x.split("/")[-1]))

# # List actual image files in the folder
# import os
# image_folder = "Fast_Furious_Insured/images/train_images"
# all_image_files = set(os.listdir(image_folder))

# # Find extra images that are not in train_data_cleaned
# extra_images = all_image_files - train_images_used

# print(f"Extra images processed: {len(extra_images)}")
# print(f"Sample extra images: {list(extra_images)[:5]}")

In [42]:
print(len(train_image_paths), len(set(train_image_paths)))  # Both should be 1310

1310 1310


In [15]:
import os

# Get the list of image filenames in the folder
image_folder = "Fast_Furious_Insured/images/train_images"
all_image_files = set(os.listdir(image_folder))

# Extract filenames from the train dataset
train_image_filenames = set(train_data_cleaned["Image_path"].apply(lambda x: os.path.basename(x)))

# Identify extra images (those in the folder but not in the dataset)
extra_images = all_image_files - train_image_filenames
missing_images = train_image_filenames - all_image_files

print(f"Extra images in the folder: {len(extra_images)}")
print(f"Missing images: {len(missing_images)}")


Extra images in the folder: 0
Missing images: 0


In [16]:
# Filter out rows where the image file doesn't exist
valid_image_paths = set(train_data_cleaned["Image_path"].apply(lambda x: os.path.basename(x)))
all_images_in_folder = set(os.listdir(image_folder))

# Keep only records with matching image files
train_data_cleaned_filtered = train_data_cleaned[train_data_cleaned["Image_path"].apply(
    lambda x: os.path.basename(x) in all_images_in_folder
)]

print(f"Filtered train data shape: {train_data_cleaned_filtered.shape}")


Filtered train data shape: (1310, 8)


In [43]:
# Check the number of unique image paths in the cleaned dataset and processed data
print(f"Unique image paths in train_data_cleaned: {train_data_cleaned['Image_path'].nunique()}")
print(f"Unique image paths in processed train_combined: {train_combined['Image_path'].nunique()}")


Unique image paths in train_data_cleaned: 1310
Unique image paths in processed train_combined: 1310


In [23]:
print(test_combined['Image_path'].isnull().sum())  # To check if there are any missing paths


0


In [44]:
# Check for duplicates based on all columns in the combined dataset
duplicates = train_combined[train_combined.duplicated(subset='Image_path', keep=False)]  # Keep=False to mark all duplicates
print(duplicates)


     Image_path Insurance_company  Cost_of_vehicle  Min_coverage Expiry_date  \
1310        NaN               NaN              NaN           NaN         NaN   
1311        NaN               NaN              NaN           NaN         NaN   
1312        NaN               NaN              NaN           NaN         NaN   
1313        NaN               NaN              NaN           NaN         NaN   
1314        NaN               NaN              NaN           NaN         NaN   
...         ...               ...              ...           ...         ...   
1393        NaN               NaN              NaN           NaN         NaN   
1394        NaN               NaN              NaN           NaN         NaN   
1395        NaN               NaN              NaN           NaN         NaN   
1396        NaN               NaN              NaN           NaN         NaN   
1397        NaN               NaN              NaN           NaN         NaN   

      Max_coverage  Condition  Amount  

In [45]:
# Get the indexes of duplicated rows
duplicate_indexes = train_combined[train_combined.duplicated(subset='Image_path',keep=False)].index
print("Duplicate indexes:", duplicate_indexes)

Duplicate indexes: Index([1310, 1311, 1312, 1313, 1314, 1315, 1316, 1317, 1318, 1319, 1320, 1322,
       1323, 1324, 1325, 1326, 1327, 1328, 1329, 1330, 1331, 1332, 1333, 1334,
       1335, 1336, 1337, 1338, 1339, 1340, 1341, 1342, 1343, 1344, 1345, 1346,
       1347, 1348, 1349, 1350, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358,
       1359, 1360, 1361, 1362, 1363, 1364, 1365, 1366, 1367, 1368, 1369, 1370,
       1371, 1373, 1374, 1375, 1376, 1377, 1378, 1379, 1380, 1381, 1382, 1383,
       1384, 1385, 1386, 1387, 1388, 1389, 1390, 1391, 1392, 1393, 1394, 1395,
       1396, 1397],
      dtype='int64')


In [46]:
print(f"Length of duplicate_indexes: {len(duplicate_indexes)}")
print(f"Length of train_image_paths: {len(train_image_paths)}")
print(f"Shape of train_combined: {train_combined.shape}")

Length of duplicate_indexes: 86
Length of train_image_paths: 1310
Shape of train_combined: (1396, 2056)


In [52]:
# Drop duplicates from train_combined based on the image paths (or any other criteria)
train_combined = train_combined.drop_duplicates(keep=False)

# Verify the new shape
print(f"New shape of train_combined: {train_combined.shape}")


New shape of train_combined: (1311, 2056)


In [50]:
# Check for duplicates based on all columns in the combined dataset
duplicates = train_combined[train_combined.duplicated(keep=False)]  # Keep=False to mark all duplicates
print(duplicates)


Empty DataFrame
Columns: [Image_path, Insurance_company, Cost_of_vehicle, Min_coverage, Expiry_date, Max_coverage, Condition, Amount, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, ...]
Index: []

[0 rows x 2056 columns]


In [53]:
# Save the dataframe to a CSV file
train_combined.to_csv('train_combined.csv', index=False)


In [54]:
print(train_combined.tail(1))  # View the last row to confirm the issue

     Image_path Insurance_company  Cost_of_vehicle  Min_coverage Expiry_date  \
1310        NaN               NaN              NaN           NaN         NaN   

      Max_coverage  Condition  Amount    0    1  ...  2038  2039      2040  \
1310           NaN        NaN     NaN  0.0  0.0  ...   0.0   0.0  0.262955   

      2041  2042  2043  2044  2045     2046  2047  
1310   0.0   0.0   0.0   0.0   0.0  0.00072   0.0  

[1 rows x 2056 columns]


In [55]:
train_combined = train_combined.drop(train_combined.index[-1])

In [56]:
train_combined.shape

(1310, 2056)

In [57]:
print(len(train_image_paths))  # Should be 1310
print(train_combined.shape[0])  # Should also be 1310

1310
1310


In [58]:
import os

# Check if all image paths in 'train_combined' exist
invalid_image_paths = [path for path in train_combined['Image_path'] if not os.path.exists(path)]

# Print out any invalid image paths
if invalid_image_paths:
    print("Invalid image paths:")
    for path in invalid_image_paths:
        print(path)
else:
    print("All image paths are valid.")


All image paths are valid.


In [60]:
test_combined.shape

(600, 2054)

In [62]:
# Save train_combined as pickle file
train_combined.to_pickle('Fast_Furious_Insured/processed_data/train_combined.pkl')

# Save test_combined as pickle file
test_combined.to_pickle('Fast_Furious_Insured/processed_data/test_combined.pkl')

print("Pickle files saved successfully!")

Pickle files saved successfully!
