In [1]:
import cv2
import numpy as np
import pandas as pd
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.resnet50 import preprocess_input
import os
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder
import joblib
from datetime import datetime

KeyboardInterrupt: 

In [None]:
# Load the cleaned structured data
# NOTE: pd.read_pickle unpickles arbitrary objects; only load pickle files from a trusted source.
train_data_cleaned = pd.read_pickle('processed_data/cleaned_data/train_data_cleaned.pkl')
test_data_cleaned = pd.read_pickle('processed_data/cleaned_data/test_data_cleaned.pkl')

# Display the first few rows to confirm
print(train_data_cleaned.head())
print(test_data_cleaned.head())

# Row/column counts of each split
print(train_data_cleaned.shape)
print(test_data_cleaned.shape)

# Column dtypes of each split
print(train_data_cleaned.dtypes)
print(test_data_cleaned.dtypes)

In [None]:
print(train_data_cleaned['Expiry_date'].isnull().sum())  # Count missing 'Expiry_date' values in the training data

In [None]:
print(test_data_cleaned['Expiry_date'].isnull().sum())  # Count missing 'Expiry_date' values in the test data

In [None]:
print(train_data_cleaned['Insurance_company'].isnull().sum())  # Count missing 'Insurance_company' values in the training data

In [None]:
# Summary statistics of the training columns (describe must be called, not just referenced)
train_data_cleaned.describe()

### Converting Expiry_date from an object column to a numeric value

In [None]:
# Inspect the raw 'Expiry_date' column of the test split, then the train split:
# first its dtype, then its first few values.
for split_frame in (test_data_cleaned, train_data_cleaned):
    expiry_column = split_frame['Expiry_date']
    print(expiry_column.dtype)
    print(expiry_column.head())

In [None]:
def convert_expiry_date(df, date_column='Expiry_date'):
    """
    Converts a date column to numeric Unix epoch time (seconds), in place.

    Each value is parsed as either YYYY-MM-DD or DD-MM-YYYY. The layout is decided per
    value, so a column mixing both layouts is handled. Missing or unparseable values
    become NaN instead of a bogus integer sentinel. A column that is already numeric is
    left untouched, so re-running the conversion is harmless.

    Parameters:
    df (DataFrame): The dataframe containing the date column. It is modified in place.
    date_column (str): The column name containing the expiry date.

    Returns:
    DataFrame: The same dataframe, with `date_column` holding epoch seconds. The column
    is int64 when every value parsed, float64 (with NaN) otherwise.
    """
    raw_dates = df[date_column]

    # Idempotence guard: an already converted column would otherwise be re-parsed as text
    # and wiped out.
    if pd.api.types.is_numeric_dtype(raw_dates):
        print(f"{date_column} is already numeric. No conversion needed.")
        return df

    # astype(str) alone turns NaN/None into the literal text 'nan'/'None'; mask those
    # entries back to missing so they are not mistaken for date strings.
    date_text = raw_dates.astype(str).str.strip().where(raw_dates.notna())

    # Try both supported layouts; a value that fails one is NaT for that attempt.
    iso_values = pd.to_datetime(date_text, format='%Y-%m-%d', errors='coerce').to_numpy()
    dmy_values = pd.to_datetime(date_text, format='%d-%m-%Y', errors='coerce').to_numpy()
    parsed = np.where(np.isnat(iso_values), dmy_values, iso_values)

    # Convert only the successfully parsed entries; casting NaT to int64 would yield a
    # huge negative number rather than a missing value.
    parsed_ok = ~np.isnat(parsed)
    epoch_seconds = np.full(len(parsed), np.nan, dtype='float64')
    epoch_seconds[parsed_ok] = parsed[parsed_ok].astype('datetime64[s]').astype('int64')
    if parsed_ok.all():
        # No gaps, so the column can stay integer-typed.
        epoch_seconds = epoch_seconds.astype('int64')

    df[date_column] = epoch_seconds

    unparsed_count = int((~parsed_ok).sum())
    if unparsed_count:
        print(f"{unparsed_count} value(s) in {date_column} were missing or unparseable and set to NaN.")
    print(f"Converted {date_column} to epoch time successfully.")
    return df


In [None]:
# Convert 'Expiry_date' in both splits (convert_expiry_date also modifies the frame it is given)
train_data_cleaned = convert_expiry_date(train_data_cleaned)
test_data_cleaned = convert_expiry_date(test_data_cleaned)
# Show the converted columns
print(train_data_cleaned['Expiry_date'])
print(test_data_cleaned['Expiry_date'])

In [None]:
# Check rows where Expiry_date is invalid
# NOTE(review): isna() only matches rows that hold NaN/NaT. If the conversion left an
# integer column, unparseable dates cannot show up here; confirm the column dtype first.
invalid_dates = train_data_cleaned[train_data_cleaned['Expiry_date'].isna()]
print(invalid_dates[['Expiry_date']])

In [None]:
# Re-check column dtypes now that 'Expiry_date' has been converted
print(train_data_cleaned.dtypes)
print(test_data_cleaned.dtypes)

In [None]:
print(train_data_cleaned['Expiry_date'].isnull().sum())  # Count missing 'Expiry_date' values in the training data after conversion

In [None]:
print(test_data_cleaned['Expiry_date'].isnull().sum())  # Count missing 'Expiry_date' values in the test data after conversion

In [None]:
# Show the converted training 'Expiry_date' column
print(train_data_cleaned['Expiry_date'])

### Clipping Amount

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Quartiles of the training 'Amount' column (25th and 75th percentiles)
Q1, Q3 = (train_data_cleaned['Amount'].quantile(q) for q in (0.25, 0.75))

# Interquartile range
IQR = Q3 - Q1

# Outlier fences at 1.5 * IQR beyond the quartiles; the lower fence is floored at 0
lower_bound = max(0,Q1 - 1.5 * IQR)
upper_bound = Q3 + 1.5 * IQR

print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")

# Pull values outside the fences back onto the fences
train_data_cleaned['Amount'] = train_data_cleaned['Amount'].clip(lower=lower_bound, upper=upper_bound)

# Box plot of the clipped column
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=train_data_cleaned['Amount'], ax=ax)
ax.set_title("Boxplot for Amount (After Clipping)")
plt.show()


### Feature Scaling

In [None]:
# Initialize scalers: one for the input features, one for the 'Amount' target
feature_scaler = StandardScaler()
amount_scaler = StandardScaler()

# Define feature columns (excluding 'Amount' initially)
feature_columns = ['Cost_of_vehicle', 'Min_coverage', 'Max_coverage', 'Expiry_date']

# Fit on the training data only, then transform it
train_data_cleaned[feature_columns] = feature_scaler.fit_transform(train_data_cleaned[feature_columns])

# Apply log transformation BEFORE scaling
train_data_cleaned['Amount'] = np.log1p(train_data_cleaned['Amount'])  # log(Amount + 1)

# Scale 'Amount' separately. fit_transform returns an (n, 1) array, so flatten it
# before assigning it to a single column.
train_data_cleaned['Amount'] = amount_scaler.fit_transform(train_data_cleaned[['Amount']].values).ravel()

# Print shape and check values
print(train_data_cleaned.shape)
print(train_data_cleaned.tail())

# Save scalers (create the target folder first so joblib.dump cannot fail on a missing directory)
os.makedirs('scalers', exist_ok=True)
joblib.dump(feature_scaler, 'scalers/feature_scaler.pkl')
joblib.dump(amount_scaler, 'scalers/amount_scaler.pkl')


In [None]:
# Apply the same transformation to test data (except 'Amount', which is not available)
# transform (not fit_transform) reuses the statistics learned from the training data.
test_data_cleaned[feature_columns] = feature_scaler.transform(test_data_cleaned[feature_columns])
print(test_data_cleaned.tail())

In [None]:
# Summary statistics of 'Amount' after clipping, log1p and standard scaling
train_data_cleaned['Amount'].describe()

### One-Hot Encoding Insurance company

In [None]:
# Location of the pickled encoder for 'Insurance_company' (expected to be already fitted; it is only loaded and applied below)
ENCODER_PATH = "scalers/encoder.pkl"

In [None]:
# Load the persisted one-hot encoder, failing early with a clear error if the file is absent.
# NOTE: joblib.load unpickles arbitrary objects; only load files from a trusted source.
encoder_file_missing = not os.path.exists(ENCODER_PATH)
if encoder_file_missing:
    raise FileNotFoundError(f"Encoder file not found: {ENCODER_PATH}")

with open(ENCODER_PATH, "rb") as encoder_file:
    encoder = joblib.load(encoder_file)


In [None]:
# Transform the 'Insurance_company' column of the train dataset with the loaded encoder
# (the encoder is only applied here; no fitting happens in this cell)
train_data_encoded = encoder.transform(train_data_cleaned[['Insurance_company']])

In [None]:
# Convert the encoded data back to a DataFrame.
# Densify first in case the encoder returned a SciPy sparse matrix.
if hasattr(train_data_encoded, 'toarray'):
    train_data_encoded = train_data_encoded.toarray()

# Reuse the training frame's index: the encoded columns are joined back on index, so a
# default 0..n-1 index would misalign (and leave NaN) whenever the cleaned data has any other index.
train_data_encoded = pd.DataFrame(
    train_data_encoded,
    columns=encoder.get_feature_names_out(['Insurance_company']),
    index=train_data_cleaned.index,
)

In [None]:
# Sanity-check the encoded training columns: dimensions and the last few rows
print(train_data_encoded.shape)
print(train_data_encoded.tail())

In [None]:
# Drop the original 'Insurance_company' column from train_data_cleaned
# (its one-hot encoded columns are joined back in afterwards)
train_data_cleaned = train_data_cleaned.drop(columns=['Insurance_company'])

In [None]:
# Join the one-hot encoded columns with the original DataFrame
# NOTE(review): DataFrame.join aligns rows on the index. train_data_encoded must carry the
# same index as train_data_cleaned, otherwise unmatched rows get NaN in the encoded columns.
train_data_cleaned = train_data_cleaned.join(train_data_encoded)

In [None]:
# Inspect the training frame after the join: dimensions and the last few rows
print(train_data_cleaned.shape)
print(train_data_cleaned.tail())

In [None]:
# Transform the 'Insurance_company' column in the test dataset using the same encoder
# (transform only; the encoder is not re-fitted on test data)
test_data_encoded = encoder.transform(test_data_cleaned[['Insurance_company']])

In [None]:
# Convert the transformed data back to a DataFrame.
# Densify first in case the encoder returned a SciPy sparse matrix.
if hasattr(test_data_encoded, 'toarray'):
    test_data_encoded = test_data_encoded.toarray()

# Reuse the test frame's index: the encoded columns are joined back on index, so a
# default 0..n-1 index would misalign (and leave NaN) whenever the cleaned data has any other index.
test_data_encoded = pd.DataFrame(
    test_data_encoded,
    columns=encoder.get_feature_names_out(['Insurance_company']),
    index=test_data_cleaned.index,
)

In [None]:
# Drop the original 'Insurance_company' column from test_data_cleaned
# (its one-hot encoded columns are joined back in afterwards)
test_data_cleaned = test_data_cleaned.drop(columns=['Insurance_company'])

In [None]:
# Join the one-hot encoded columns with the test dataset
# NOTE(review): DataFrame.join aligns rows on the index. test_data_encoded must carry the
# same index as test_data_cleaned, otherwise unmatched rows get NaN in the encoded columns.
test_data_cleaned = test_data_cleaned.join(test_data_encoded)

In [None]:
# Compare dimensions of both splits after encoding
print(train_data_cleaned.shape)
print(test_data_cleaned.shape)

In [None]:
# Last few rows of both splits after encoding
print(train_data_cleaned.tail())
print(test_data_cleaned.tail())

In [None]:
# Count NaN in one encoded column; a non-zero count would point to rows that did not
# line up during the index-based join (assumes the encoder emits an 'Insurance_company_A' column)
print(train_data_cleaned['Insurance_company_A'].isnull().sum())

### Image Preprocessing

In [None]:
# import os

# # Folder containing all images
# image_folder = "Fast_Furious_Insured/images/train_images"

# # Get the list of images that are still relevant
# valid_images = set(train_data_cleaned["Image_path"].apply(os.path.basename))  # Extract filenames only

# # List all files in the folder
# all_images = set(os.listdir(image_folder))

# # Find extra images that need to be deleted
# extra_images = all_images - valid_images

# # Delete extra images
# for img in extra_images:
#     img_path = os.path.join(image_folder, img)
#     os.remove(img_path)  # Deletes the file
#     print(f"Deleted: {img_path}")

# print(f"✅ Removed {len(extra_images)} unnecessary images.")


In [None]:
# Count the entries in the train and test image folders
train_images_path = "images/train_images"
test_images_path = "images/test_images"

num_train_images, num_test_images = (
    len(os.listdir(folder)) for folder in (train_images_path, test_images_path)
)

print(f"Number of training images: {num_train_images}")
print(f"Number of test images: {num_test_images}")

In [None]:
IMG_SIZE = (224, 224)  # Target (width, height) for cv2.resize; 224x224 is the input size used for ResNet50

def preprocess_image(image_path):
    """
    Loads an image from disk and prepares it as input for ResNet50.

    Parameters:
    image_path (str): Path to the image file.

    Returns:
    numpy.ndarray: float32 array of shape (224, 224, 3), preprocessed with the ResNet50
    `preprocess_input` function.

    Raises:
    ValueError: If OpenCV cannot read the file (missing, unreadable, or not an image).
    """
    # cv2.imread signals failure by returning None rather than raising
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not read image (missing or unreadable file): {image_path}")

    # OpenCV decodes to BGR, while resnet50.preprocess_input expects RGB input
    # (it performs the RGB -> BGR flip itself).
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Resize the image to the target size (224x224 for ResNet50)
    image_resized = cv2.resize(image_rgb, IMG_SIZE)

    # preprocess_input expects raw pixel values in [0, 255] and subtracts the ImageNet
    # channel means without scaling. Dividing by 255 first would leave every pixel close
    # to the negative mean, so nearly all image information would be lost.
    image_processed = preprocess_input(image_resized.astype('float32'))

    return image_processed


In [None]:
# Load the pre-trained ResNet50 model (exclude top layers for feature extraction);
# pooling='avg' makes the model output one pooled feature vector per image.
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

def extract_image_features(image_path):
    """
    Extracts a ResNet50 feature vector for a single image.

    Parameters:
    image_path (str): Path to the image file.

    Returns:
    numpy.ndarray: 1-D array holding the flattened model output for this image.
    """
    # Preprocess the image and add a leading batch axis (the model expects a batch)
    image_batch = np.expand_dims(preprocess_image(image_path), axis=0)

    # verbose=0: predict() runs once per image, so the default progress bar would
    # flood the notebook output with one line per image.
    features = base_model.predict(image_batch, verbose=0)

    # Flatten the (1, n_features) output into a 1-D vector
    return features.flatten()


In [None]:
def combine_features(image_paths, structured_data):
    """
    Extracts image features for every path and joins them onto the structured data.

    Parameters:
    image_paths (iterable of str): One image path per row of `structured_data`, in row order.
    structured_data (DataFrame): The tabular features; it is not modified.

    Returns:
    DataFrame: `structured_data` with one extra column per image feature. The feature
    columns are named by their integer position (0, 1, 2, ...).

    Raises:
    ValueError: If the number of image paths differs from the number of rows.
    """
    # Materialize once so any iterable (not just a list) can be measured and iterated
    image_paths = list(image_paths)

    # Fail fast: feature extraction is expensive, so a length mismatch should be reported
    # before any image is processed rather than after all of them.
    if len(image_paths) != len(structured_data):
        raise ValueError(
            f"Got {len(image_paths)} image paths for {len(structured_data)} rows of structured data."
        )

    # Extract one feature vector per image path
    image_features = [extract_image_features(path) for path in image_paths]

    # Reuse the structured data's index so the join below lines up row for row
    image_features_df = pd.DataFrame(image_features, index=structured_data.index)

    # Print to check before merging
    print("Image Features Shape:", image_features_df.shape)
    print("Structured Data Shape:", structured_data.shape)

    # Concatenate image features with structured data (merge on index)
    # NOTE(review): join matches on index labels; duplicate labels in structured_data
    # would multiply rows. Confirm the index is unique.
    combined_data = structured_data.join(image_features_df)

    return combined_data


In [None]:
# Get the image paths from the 'Image_path' column, in row order
train_image_paths = train_data_cleaned['Image_path'].tolist()
test_image_paths = test_data_cleaned['Image_path'].tolist()

In [None]:
# Combine image features with the structured data
# (slow: combine_features runs the ResNet50 model once for every image path)
train_combined = combine_features(train_image_paths, train_data_cleaned)



In [None]:
# Build the combined test set as well: later cells read and save `test_combined`,
# so leaving this disabled raises NameError on a fresh kernel.
test_combined = combine_features(test_image_paths, test_data_cleaned)

In [None]:
# Dimensions of the combined training frame (structured columns + image features)
train_combined.shape

In [None]:
# Dimensions of the combined test frame (structured columns + image features)
test_combined.shape

In [None]:
# # Extract image filenames from DataFrame
# train_images_used = set(train_data_cleaned["Image_path"].apply(lambda x: x.split("/")[-1]))

# # List actual image files in the folder
# import os
# image_folder = "Fast_Furious_Insured/images/train_images"
# all_image_files = set(os.listdir(image_folder))

# # Find extra images that are not in train_data_cleaned
# extra_images = all_image_files - train_images_used

# print(f"Extra images processed: {len(extra_images)}")
# print(f"Sample extra images: {list(extra_images)[:5]}")

In [None]:
# import os

# # Get the list of image filenames in the folder
# image_folder = "Fast_Furious_Insured/images/train_images"
# all_image_files = set(os.listdir(image_folder))

# # Extract filenames from the train dataset
# train_image_filenames = set(train_data_cleaned["Image_path"].apply(lambda x: os.path.basename(x)))

# # Identify extra images (those in the folder but not in the dataset)
# extra_images = all_image_files - train_image_filenames
# missing_images = train_image_filenames - all_image_files

# print(f"Extra images in the folder: {len(extra_images)}")
# print(f"Missing images: {len(missing_images)}")


In [None]:
# # Filter out rows where the image file doesn't exist
# valid_image_paths = set(train_data_cleaned["Image_path"].apply(lambda x: os.path.basename(x)))
# all_images_in_folder = set(os.listdir(image_folder))

# # Keep only records with matching image files
# train_data_cleaned_filtered = train_data_cleaned[train_data_cleaned["Image_path"].apply(
#     lambda x: os.path.basename(x) in all_images_in_folder
# )]

# print(f"Filtered train data shape: {train_data_cleaned_filtered.shape}")


In [None]:
# Check the number of unique image paths in the cleaned dataset and processed data
# (train_combined is built by left-joining onto train_data_cleaned, so the two counts are expected to match)
print(f"Unique image paths in train_data_cleaned: {train_data_cleaned['Image_path'].nunique()}")
print(f"Unique image paths in processed train_combined: {train_combined['Image_path'].nunique()}")


In [None]:
print(test_combined['Image_path'].isnull().sum())  # Count missing image paths in the combined test data


In [None]:
# Check for rows that share the same 'Image_path' (only that column is compared, not all columns)
duplicates = train_combined[train_combined.duplicated(subset='Image_path', keep=False)]  # keep=False marks every row of a duplicate group
print(duplicates)


In [None]:
# Get the index labels of all rows whose 'Image_path' occurs more than once
duplicate_indexes = train_combined[train_combined.duplicated(subset='Image_path',keep=False)].index
print("Duplicate indexes:", duplicate_indexes)

In [None]:
# Compare the number of duplicated rows, input image paths and combined rows
print(f"Length of duplicate_indexes: {len(duplicate_indexes)}")
print(f"Length of train_image_paths: {len(train_image_paths)}")
print(f"Shape of train_combined: {train_combined.shape}")

In [None]:
# Save both combined frames as CSV; index=False leaves the DataFrame index out of the file
train_combined.to_csv('processed_data/train_combined.csv', index=False)
test_combined.to_csv('processed_data/test_combined.csv', index=False)

In [None]:
# Dimensions of the combined test frame that was just written to CSV
test_combined.shape

In [None]:
# Save train_combined as pickle file (written in addition to the CSV export)
train_combined.to_pickle('processed_data/final_train_data.pkl')

# Save test_combined as pickle file (written in addition to the CSV export)
test_combined.to_pickle('processed_data/final_test_data.pkl')

# Reached only if both writes above completed without raising
print("Pickle files saved successfully!")