# First Module: Data Cleaning and Feature Engineering

# In [2]:
import pandas as pd
import numpy as np
import re
from category_encoders import TargetEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
from joblib import Parallel, delayed
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression

# Step 1: Load Data with Parallel Processing Using Joblib
def load_data(file):
    """Read the CSV at *file* and return its contents as a pandas DataFrame."""
    frame = pd.read_csv(file)
    return frame

# Read both CSV files through joblib with two workers (one task per file).
# The result list is unpacked positionally, so `files` must stay in
# (train, test) order.
files = ['train.csv', 'test.csv']
data = Parallel(n_jobs=2)(delayed(load_data)(file) for file in files)
train_data, test_data = data

# Step 2: Fill Missing 'make' Values
# A missing 'make' is replaced by the most common 'make' among training rows
# that share the same 'model'. Rows whose 'model' has no known 'make' in the
# training data (or whose 'model' is itself missing) are labelled 'unknown'.
# The lookup table is built once per frame instead of rescanning the whole
# training set for every missing row.

# For training data
known_make = train_data.dropna(subset=['make'])
model_to_make = known_make.groupby('model')['make'].agg(lambda makes: makes.mode().iloc[0])
missing_make = train_data['make'].isna()
train_data.loc[missing_make, 'make'] = (
    train_data.loc[missing_make, 'model'].map(model_to_make).fillna('unknown')
)

# For test data - using training data (after its own fill, so 'unknown'
# labels carry over) to fill missing 'make' values
model_to_make = train_data.groupby('model')['make'].agg(lambda makes: makes.mode().iloc[0])
missing_make = test_data['make'].isna()
test_data.loc[missing_make, 'make'] = (
    test_data.loc[missing_make, 'model'].map(model_to_make).fillna('unknown')
)

# Step 3: Fill 'manufactured' before calculating 'car_age'
# First, fill based on 'make' and 'model' group median.
# Each frame uses its own group medians (test rows are filled from test groups).
train_data['manufactured'] = train_data.groupby(['make', 'model'])['manufactured'].transform(lambda x: x.fillna(x.median()))
test_data['manufactured'] = test_data.groupby(['make', 'model'])['manufactured'].transform(lambda x: x.fillna(x.median()))

# If there are still missing values, use KNNImputer for finer imputation.
# The imputer is fitted on the training frame and only applied to the test frame.
# NOTE(review): only one column is passed in, so there are no other features to
# measure neighbour distance on - confirm this adds anything over a plain mean.
if train_data['manufactured'].isna().sum() > 0 or test_data['manufactured'].isna().sum() > 0:
    features_to_impute = ['manufactured']
    imputer = KNNImputer(n_neighbors=5)
    train_data[features_to_impute] = imputer.fit_transform(train_data[features_to_impute])
    test_data[features_to_impute] = imputer.transform(test_data[features_to_impute])

# Recalculate 'car_age' after filling 'manufactured'.
# The reference year 2024 is hard-coded.
for df in [train_data, test_data]:
    df['car_age'] = 2024 - df['manufactured']

# Step 4.0: Fill 'engine_cap', 'curb_weight', 'no_of_owners' using group median
# ('manufactured' was already handled in Step 3 and is not part of this list.)
features_to_fill = ['engine_cap', 'curb_weight', 'no_of_owners']
for feature in features_to_fill:
    # Per-frame median within each (make, model) group; groups with no
    # observed value keep their NaNs for the KNN pass below.
    train_data[feature] = train_data.groupby(['make', 'model'])[feature].transform(lambda x: x.fillna(x.median()))
    test_data[feature] = test_data.groupby(['make', 'model'])[feature].transform(lambda x: x.fillna(x.median()))

# Remaining gaps: KNN imputation across the three columns jointly,
# fitted on the training frame and applied to the test frame.
features_to_impute = ['engine_cap', 'curb_weight', 'no_of_owners']
imputer = KNNImputer(n_neighbors=5)
train_data[features_to_impute] = imputer.fit_transform(train_data[features_to_impute])
test_data[features_to_impute] = imputer.transform(test_data[features_to_impute])

# Step 4.1 Calculate arf and omv based on formula
# ---------------------------------------------------------------
def calculate_arf(omv):
    """Compute ARF from OMV using a tiered schedule.

    The first 20,000 of OMV is charged at 100%; each following 20,000 band is
    charged at 140%, 190% and 250% respectively; anything above 80,000 is
    charged at 320%. Values up to 20,000 are returned unchanged.
    """
    if omv <= 20000:
        return omv

    accumulated = 20000  # ARF owed for all bands fully below the current one
    band_start = 20000
    for band_end, rate in ((40000, 1.4), (60000, 1.9), (80000, 2.5)):
        if omv <= band_end:
            return accumulated + rate * (omv - band_start)
        accumulated = accumulated + rate * 20000
        band_start = band_end

    # Top band (also reached when no comparison succeeds, e.g. NaN input).
    return accumulated + 3.2 * (omv - 80000)

# Function to calculate OMV based on ARF
def calculate_omv(arf):
    """Recover OMV from ARF by inverting the tiered ARF schedule.

    ARF values up to 20,000 map to themselves. Above that, the ARF breakpoints
    48,000 / 86,000 / 136,000 correspond to OMV 40,000 / 60,000 / 80,000, and
    the excess over the band's lower ARF bound is divided by the band's rate.
    """
    if arf <= 20000:
        return arf

    # (ARF upper bound, ARF lower bound, OMV at the lower bound, band rate)
    bands = (
        (48000, 20000, 20000, 1.4),
        (86000, 48000, 40000, 1.9),
        (136000, 86000, 60000, 2.5),
    )
    for arf_cap, arf_floor, omv_floor, rate in bands:
        if arf <= arf_cap:
            return omv_floor + (arf - arf_floor) / rate

    # Top band (also reached when no comparison succeeds, e.g. NaN input).
    return 80000 + (arf - 136000) / 3.2

# Cross-fill the two columns from each other where exactly one is present.
# Order matters: 'arf' is filled first, and the 'omv' pass then reads the
# updated 'arf' column. Rows missing both values are left untouched here.
for df in [train_data, test_data]:
    df['arf'] = df.apply(lambda row: calculate_arf(row['omv']) if pd.isnull(row['arf']) and pd.notnull(row['omv']) else row['arf'], axis=1)
    df['omv'] = df.apply(lambda row: calculate_omv(row['arf']) if pd.isnull(row['omv']) and pd.notnull(row['arf']) else row['omv'], axis=1)

# ---------------------------------------------------------------

# Step 4.2: Fill missing values using group median
# For 'omv' and 'arf', use 'make' and 'model' group median to fill
# (each frame uses the medians of its own groups).
features_to_group_impute = ['omv', 'arf']
for feature in features_to_group_impute:
    train_data[feature] = train_data.groupby(['make', 'model'])[feature].transform(lambda x: x.fillna(x.median()))
    test_data[feature] = test_data.groupby(['make', 'model'])[feature].transform(lambda x: x.fillna(x.median()))

# Whatever is still missing is imputed with KNN over the two columns jointly;
# the imputer is fitted on the training frame and applied to the test frame.
features_to_impute = ['omv', 'arf']
imputer = KNNImputer(n_neighbors=5)
train_data[features_to_impute] = imputer.fit_transform(train_data[features_to_impute])
test_data[features_to_impute] = imputer.transform(test_data[features_to_impute])

# Step 4.3: Calculate Remaining Years of COE

# Remove duplicate columns: for any repeated column name only the first
# occurrence is kept.
train_data = train_data.loc[:, ~train_data.columns.duplicated()]
test_data = test_data.loc[:, ~test_data.columns.duplicated()]

# Function to check if COE is extended
def is_coe_extended(title):
    """Check if the COE is extended based on the title.

    The title counts as "extended" when it contains a parenthesised segment
    that mentions ``COE``, e.g. ``"... (COE till 06/2030)"``.

    Args:
        title: Listing title. Non-string values (``None``, NaN from a missing
            cell) are treated as "not extended" instead of raising.

    Returns:
        bool: True if a ``(...COE...)`` segment is present, otherwise False.
    """
    # Titles are read before any NaN filling happens, so guard against
    # float('nan') / None, which `re` cannot search.
    if not isinstance(title, str):
        return False
    return re.search(r'\(.*COE.*\)', title) is not None

# Row-level helpers used by the loop below (defined once, applied to both frames).

def calculate_maturity_date(row):
    """Return the COE maturity date for one row.

    Non-extended COE matures 10 years after 'reg_date'. Extended COE uses the
    first MM/YYYY found in the title, or 'reg_date' + 20 years if none is found.
    """
    if not row['COE_extended']:
        # Default maturity is 10 years
        return row['reg_date'] + pd.DateOffset(years=10)

    dates_in_title = re.findall(r'\d{2}/\d{4}', row['title'])
    if dates_in_title:
        return pd.to_datetime(dates_in_title[0], format='%m/%Y')

    # Extended, but no explicit date in the title: extend by 20 years
    return row['reg_date'] + pd.DateOffset(years=20)


def calculate_remaining_years(row):
    """Return the years of COE left relative to the fixed date 2024-06-20."""
    years_left = (row['maturity_date'] - pd.Timestamp('2024-06-20')).days / 365.25
    if years_left < -1:
        # More than a year past maturity: wrap into a 10-year cycle
        return years_left % 10
    if years_left < 1:
        return 1 + years_left
    return round(years_left, 1)


for df in [train_data, test_data]:
    # Parse 'reg_date' (day-abbreviated month-year); unparseable values become NaT
    df['reg_date'] = pd.to_datetime(df['reg_date'], format='%d-%b-%Y', errors='coerce')

    # Temporary columns feeding the remaining-years computation
    df['COE_extended'] = df['title'].apply(is_coe_extended)
    df['maturity_date'] = df.apply(calculate_maturity_date, axis=1)

    # Remaining years of COE, floored at zero
    df['remaining_years_of_coe'] = df.apply(calculate_remaining_years, axis=1)
    df['remaining_years_of_coe'] = df['remaining_years_of_coe'].clip(lower=0)

    # Drop intermediate columns
    df.drop(columns=['COE_extended', 'maturity_date'], inplace=True)

# Step 4.4: Fill 'depreciation' using Price-Based Calculation and Linear Regression
def fill_depreciation(train_data, test_data):
    """Fill the 'depreciation' column of both frames.

    Steps:
      1. Training rows with a price and a positive 'remaining_years_of_coe'
         get depreciation = price / remaining years (overwriting any existing
         value); other training rows keep what they had.
      2. Test rows with missing depreciation take the median training
         depreciation of the same (make, model), when such rows exist.
      3. Anything still missing in either frame is predicted by a linear
         regression fitted on training rows with known depreciation.

    Both frames are modified in place and also returned.

    Args:
        train_data: Training DataFrame; must contain 'price'.
        test_data: Test DataFrame.

    Returns:
        tuple: (train_data, test_data).
    """
    # Step 1: price-based depreciation for the training set (vectorized).
    usable_price = train_data['price'].notna() & (train_data['remaining_years_of_coe'] > 0)
    price_based = train_data['price'] / train_data['remaining_years_of_coe']
    train_data['depreciation'] = price_based.where(usable_price, train_data['depreciation'])

    # Step 2: look up the training median per (make, model) once, instead of
    # rescanning the training frame for every missing test row.
    test_missing_mask = test_data['depreciation'].isna()
    if test_missing_mask.any():
        group_median = (
            train_data.groupby(['make', 'model'])['depreciation']
            .median()
            .rename('group_median')
            .reset_index()
        )
        # A left merge keeps the order of the missing test rows; rows without
        # a matching training group get NaN and are handled in step 3.
        matched = test_data.loc[test_missing_mask, ['make', 'model']].merge(
            group_median, on=['make', 'model'], how='left'
        )
        test_data.loc[test_missing_mask, 'depreciation'] = matched['group_median'].to_numpy()

    # Step 3: Use Linear Regression to fill any remaining missing values
    features = ['engine_cap', 'car_age', 'omv', 'remaining_years_of_coe']
    train_known = train_data[train_data['depreciation'].notna()]

    model = LinearRegression()
    model.fit(train_known[features], train_known['depreciation'])

    # predict() rejects empty input, so only call it when rows are missing.
    for frame in (train_data, test_data):
        still_missing = frame['depreciation'].isna()
        if still_missing.any():
            frame.loc[still_missing, 'depreciation'] = model.predict(frame.loc[still_missing, features])

    return train_data, test_data

train_data, test_data = fill_depreciation(train_data, test_data)


# Step 4.5: Fill 'mileage' using Linear Regression with related features
def fill_mileage(train_data, test_data):
    """Fill missing 'mileage' in both frames with a linear-regression estimate.

    The model is fitted on training rows with known mileage, using 'car_age'
    and 'no_of_owners' as predictors. Both frames are modified in place and
    also returned.

    Args:
        train_data: Training DataFrame.
        test_data: Test DataFrame.

    Returns:
        tuple: (train_data, test_data).
    """
    features = ['car_age', 'no_of_owners']
    known = train_data[train_data['mileage'].notna()]

    model = LinearRegression()
    model.fit(known[features], known['mileage'])

    # predict() raises on an empty frame, so skip frames with nothing to fill.
    for frame in (train_data, test_data):
        missing = frame['mileage'].isna()
        if missing.any():
            frame.loc[missing, 'mileage'] = model.predict(frame.loc[missing, features])

    return train_data, test_data

train_data, test_data = fill_mileage(train_data, test_data)

# Step 4.6: Fill 'power' using Linear Regression with related features
def fill_power(train_data, test_data):
    """Fill missing 'power' in both frames with a linear-regression estimate.

    The model is fitted on training rows with known power, using 'engine_cap'
    and 'curb_weight' as predictors. Both frames are modified in place and
    also returned.

    Args:
        train_data: Training DataFrame.
        test_data: Test DataFrame.

    Returns:
        tuple: (train_data, test_data).
    """
    features = ['engine_cap', 'curb_weight']
    known = train_data[train_data['power'].notna()]

    model = LinearRegression()
    model.fit(known[features], known['power'])

    # predict() raises on an empty frame, so skip frames with nothing to fill.
    for frame in (train_data, test_data):
        missing = frame['power'].isna()
        if missing.any():
            frame.loc[missing, 'power'] = model.predict(frame.loc[missing, features])

    return train_data, test_data

train_data, test_data = fill_power(train_data, test_data)

# Step 4.7: Recalculate 'power_to_weight' after filling 'power' and 'curb_weight'
# A zero curb weight is replaced by 1 to avoid dividing by zero; the ratio is
# then passed through log1p.
for df in [train_data, test_data]:
    df['power_to_weight'] = np.log1p(df['power'] / df['curb_weight'].replace(0, 1))

# Step 4.8: Fill 'road_tax' using Linear Regression with related features
def fill_road_tax(train_data, test_data):
    """Fill missing 'road_tax' in both frames with a linear-regression estimate.

    The model is fitted on training rows with known road tax, using
    'engine_cap' as the only predictor. Both frames are modified in place and
    also returned.

    Args:
        train_data: Training DataFrame.
        test_data: Test DataFrame.

    Returns:
        tuple: (train_data, test_data).
    """
    features = ['engine_cap']
    known = train_data[train_data['road_tax'].notna()]

    model = LinearRegression()
    model.fit(known[features], known['road_tax'])

    # predict() raises on an empty frame, so skip frames with nothing to fill.
    for frame in (train_data, test_data):
        missing = frame['road_tax'].isna()
        if missing.any():
            frame.loc[missing, 'road_tax'] = model.predict(frame.loc[missing, features])

    return train_data, test_data

train_data, test_data = fill_road_tax(train_data, test_data)

# Step 4.9: Use KNNImputer for finer imputation of remaining missing values
# Safety net after the regression-based fills above: the imputer is fitted on
# the training frame and applied to the test frame.
features_to_impute = ['depreciation','mileage', 'power', 'road_tax']
imputer = KNNImputer(n_neighbors=5)
train_data[features_to_impute] = imputer.fit_transform(train_data[features_to_impute])
test_data[features_to_impute] = imputer.transform(test_data[features_to_impute])

# Step 5: Text Feature Processing - TF-IDF and Embeddings
text_features = ['title', 'description', 'category']

# TF-IDF and SVD for Text Features
for feature in text_features:
    # Replace NaN with an empty string and trim surrounding whitespace.
    # No rows are removed; empty texts stay in the frame.
    train_data[feature] = train_data[feature].fillna('').str.strip()
    test_data[feature] = test_data[feature].fillna('').str.strip()
    
    # TF-IDF Vectorizer: vocabulary is learned from the training text only
    # and capped at 5000 terms; single-character tokens are kept.
    vectorizer = TfidfVectorizer(min_df=1, max_features=5000, token_pattern=r'\b\w+\b', stop_words=None)
    train_tfidf = vectorizer.fit_transform(train_data[feature])
    test_tfidf = vectorizer.transform(test_data[feature])
    
    # Truncated SVD for Dimensionality Reduction
    n_components = min(50, train_tfidf.shape[1])  # Set n_components based on actual feature count
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    train_svd = svd.fit_transform(train_tfidf)
    test_svd = svd.transform(test_tfidf)
    
    # Create DataFrames for the new SVD features, aligned to the source index
    svd_feature_names = [f'{feature}_svd_{i}' for i in range(n_components)]
    train_svd_df = pd.DataFrame(train_svd, columns=svd_feature_names, index=train_data.index)
    test_svd_df = pd.DataFrame(test_svd, columns=svd_feature_names, index=test_data.index)
    
    # Concatenate the new features to the original DataFrames
    train_data = pd.concat([train_data, train_svd_df], axis=1)
    test_data = pd.concat([test_data, test_svd_df], axis=1)

# Sentence Embedding for Deeper Text Understanding
# Each text column is encoded with the 'all-mpnet-base-v2' sentence model and
# then reduced to 20 principal components (PCA fitted on the training rows).
model = SentenceTransformer('all-mpnet-base-v2')
for feature in text_features:
    train_embeddings = model.encode(train_data[feature].tolist(), show_progress_bar=True)
    test_embeddings = model.encode(test_data[feature].tolist(), show_progress_bar=True)
    
    # PCA for Dimensionality Reduction
    pca = PCA(n_components=20)
    train_pca = pca.fit_transform(train_embeddings)
    test_pca = pca.transform(test_embeddings)
    
    # Create DataFrames for the new embedding features, aligned to the source index
    embed_feature_names = [f'{feature}_embed_{i}' for i in range(20)]
    train_embed_df = pd.DataFrame(train_pca, columns=embed_feature_names, index=train_data.index)
    test_embed_df = pd.DataFrame(test_pca, columns=embed_feature_names, index=test_data.index)
    
    # Concatenate the new features to the original DataFrames
    train_data = pd.concat([train_data, train_embed_df], axis=1)
    test_data = pd.concat([test_data, test_embed_df], axis=1)


# Step 6: Categorical Feature Encoding (Target Encoding + Frequency Encoding)
# Only 'make' and 'model' are target-encoded; all five categorical features
# get a frequency encoding further below.
categorical_features = ['make', 'model', 'type_of_vehicle', 'fuel_type', 'transmission']
encoder_make = TargetEncoder(smoothing=0.5)
encoder_model = TargetEncoder(smoothing=0.5)

# Encoders are fitted against the training price and then applied to the same
# training rows as well as to the test rows.
encoder_make.fit(train_data['make'], train_data['price'])
encoder_model.fit(train_data['model'], train_data['price'])

train_data['make_encoded'] = encoder_make.transform(train_data['make'])
test_data['make_encoded'] = encoder_make.transform(test_data['make'])
train_data['model_encoded'] = encoder_model.transform(train_data['model'])
test_data['model_encoded'] = encoder_model.transform(test_data['model'])

# Frequency Encoding: relative frequency of each category in the training data.
# Test categories that never occur in training are not in the mapping and
# therefore become NaN in the '<feature>_freq' column.
for feature in categorical_features:
    freq_encode = train_data[feature].value_counts(normalize=True).to_dict()
    train_data[f'{feature}_freq'] = train_data[feature].map(freq_encode)
    test_data[f'{feature}_freq'] = test_data[feature].map(freq_encode)

# Step 7: Feature Interactions and Polynomial Features
from sklearn.preprocessing import PolynomialFeatures

# Handle missing values in selected features for interaction
# (PolynomialFeatures does not accept NaN, so missing entries become 0).
interaction_columns = ['make_encoded', 'model_encoded', 'depreciation', 'car_age']
train_data[interaction_columns] = train_data[interaction_columns].fillna(0)
test_data[interaction_columns] = test_data[interaction_columns].fillna(0)

# Apply PolynomialFeatures: degree-2 interaction terms only, no bias column.
# The output also contains the four input columns themselves, so concatenating
# it adds columns whose names already exist in the frame.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
interaction_features = poly.fit_transform(train_data[interaction_columns])
interaction_df = pd.DataFrame(interaction_features, columns=poly.get_feature_names_out(interaction_columns), index=train_data.index)
train_data = pd.concat([train_data, interaction_df], axis=1)

# Same transformation for the test frame, reusing the fitted transformer.
interaction_features_test = poly.transform(test_data[interaction_columns])
interaction_df_test = pd.DataFrame(interaction_features_test, columns=poly.get_feature_names_out(interaction_columns), index=test_data.index)
test_data = pd.concat([test_data, interaction_df_test], axis=1)

# Step 8: Data Augmentation for Imbalanced Classes
# Rows whose 'type_of_vehicle' is 'luxury sedan' or 'sports car' are duplicated
# once, with small multiplicative noise on selected numeric columns
# (including the target 'price').
luxury_sports_cars = train_data[train_data['type_of_vehicle'].isin(['luxury sedan', 'sports car'])]
augmented_data = luxury_sports_cars.copy()

# Adding small noise to numerical features.
# A seeded generator keeps the augmented rows reproducible between runs, in
# line with the fixed random_state=42 used elsewhere in this file.
noise_scale = 0.01
rng = np.random.default_rng(42)
for col in ['price', 'mileage', 'power', 'depreciation', 'omv', 'arf', 'car_age']:
    noise = rng.normal(0, noise_scale, size=augmented_data[col].shape)
    augmented_data[col] = augmented_data[col] * (1 + noise)

# Append augmented data to train_data (index is rebuilt from 0)
train_data = pd.concat([train_data, augmented_data], ignore_index=True)

# Remove duplicate columns: for any repeated column name only the first
# occurrence is kept.
train_data = train_data.loc[:, ~train_data.columns.duplicated()]
test_data = test_data.loc[:, ~test_data.columns.duplicated()]

# Step 9: Calculate Predicted Price Using a Given Formula
# Converts columns to numeric types and fills in missing values.
# Values that cannot be parsed as numbers become NaN and are then set to 0.
columns_to_numeric = ['arf', 'omv', 'depreciation', 'remaining_years_of_coe']
for col in columns_to_numeric:
    if col in train_data.columns:
        train_data[col] = pd.to_numeric(train_data[col], errors='coerce').fillna(0)
    if col in test_data.columns:
        test_data[col] = pd.to_numeric(test_data[col], errors='coerce').fillna(0)
'''
# Debug
for col in columns_to_numeric:
    if col in train_data.columns:
        print(f"Column: {col}")
        print(train_data[col].head())
        print(train_data[col].dtype)
'''
def calculate_list_price(row):
    """Estimate a listing price for one row from depreciation and PARF rules.

    Returns 0 when 'remaining_years_of_coe' is missing or not positive, when
    'reg_date' cannot be parsed, or when any of 'arf', 'omv', 'depreciation'
    is missing. Otherwise the estimate is depreciation * remaining years, plus
    a minimum PARF value if the title contains 'parf car', minus 380 (floored
    at 0), scaled by 0.946.
    """
    years_remaining = row['remaining_years_of_coe']
    if pd.isna(years_remaining) or remaining_not_positive(years_remaining):
        return 0

    # Registration date decides which PARF rule applies
    registered = pd.to_datetime(row['reg_date'], errors='coerce')
    if pd.isna(registered):
        return 0

    arf, omv, depreciation = row['arf'], row['omv'], row['depreciation']
    if any(pd.isna(value) for value in (arf, omv, depreciation)):
        return 0

    # Minimum PARF value, by registration period
    if registered >= pd.Timestamp('2013-03-01'):
        parf_floor = 0.5 * arf
    elif registered >= pd.Timestamp('2008-03-01'):
        parf_floor = 0.5 * min(omv, arf)
    else:
        parf_floor = 0.55 * min(omv, arf)

    estimate = depreciation * years_remaining
    if 'parf car' in row['title']:
        estimate += parf_floor

    # Subtract the fixed 380 and never go below zero
    estimate = max(estimate - 380, 0)
    return estimate * 0.946


def remaining_not_positive(years_remaining):
    """Return True when the remaining COE years are zero or negative."""
    return years_remaining <= 0

# Calculate the forecast price using the apply method (row by row) and store
# it as 'calculated_price' in both frames.
train_data['calculated_price'] = train_data.apply(lambda row: calculate_list_price(row), axis=1)
test_data['calculated_price'] = test_data.apply(lambda row: calculate_list_price(row), axis=1)

# Step 10.1: Fill Missing Values for 'dereg_value'
def fill_dereg_value(df, group_features=('make', 'model'), target_feature='dereg_value'):
    """Fill missing values of *target_feature* with group medians.

    Missing entries are first replaced by the median of their group (rows that
    share the same values in *group_features*). Entries that are still missing
    afterwards, e.g. because the whole group had no value, are replaced by the
    overall median of the column.

    Args:
        df: DataFrame to fill; it is modified in place.
        group_features: Column name or sequence of column names to group by.
        target_feature: Name of the column to fill.

    Returns:
        The same DataFrame object, with *target_feature* filled.
    """
    # Accept a single column name as well as any sequence of names; pandas
    # would treat a tuple as one key, so always hand it a list.
    if isinstance(group_features, str):
        group_keys = [group_features]
    else:
        group_keys = list(group_features)

    # One grouped pass yields, for every row, the median of its group.
    group_median = df.groupby(group_keys)[target_feature].transform('median')
    df[target_feature] = df[target_feature].fillna(group_median)

    # If there are still missing values, fill them with overall median
    df[target_feature] = df[target_feature].fillna(df[target_feature].median())
    return df

# Each frame is filled from its own group / overall medians.
train_data = fill_dereg_value(train_data)
test_data = fill_dereg_value(test_data)

# Step 10.2: Normalize 'dereg_value'
# The scaler is fitted on the training column and reused for the test column.
scaler = StandardScaler()
train_data['dereg_value_normalized'] = scaler.fit_transform(train_data[['dereg_value']])
test_data['dereg_value_normalized'] = scaler.transform(test_data[['dereg_value']])

# Step 10: Save Cleaned Data
# Both frames are pickled into the current working directory.
train_data.to_pickle('cleaned_train_data.pkl')
test_data.to_pickle('cleaned_test_data.pkl')
print('Clean data files generated.')

'''
# Check 2
missing_values = train_data.isna().mean()
print("Missing values ratio in train_data:")
print(missing_values[missing_values > 0].sort_values(ascending=False))

missing_values_test = test_data.isna().mean()
print("Missing values ratio in test_data:")
print(missing_values_test[missing_values_test > 0].sort_values(ascending=False))
'''

# Sanity check: warn about every expected column that is absent from either
# frame (train is reported before test for each column).
required_columns = ['arf', 'depreciation', 'car_age', 'power', 'coe', 'road_tax', 'mileage', 'omv',
        'power_to_weight', 'engine_cap','make_encoded', 'model_encoded',
        'calculated_price', 'dereg_value_normalized',]
for col in required_columns:
    absent_from_train = col not in train_data.columns
    absent_from_test = col not in test_data.columns
    if absent_from_train:
        print(f"Warning: {col} is missing from train_data")
    if absent_from_test:
        print(f"Warning: {col} is missing from test_data")





# --- Captured notebook output of the cell above (not code) ---
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, ou
#
# Clean data files generated.
#
#
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)
#   return np.nanmean(a, axis, out=out, keepdims=keepdims)


# Second Module:  ResNet Model Training

# In [None]:
import tensorflow as tf
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Load cleaned data
# The data-cleaning module above writes 'cleaned_train_data.pkl' and
# 'cleaned_test_data.pkl'; read those same files so this module consumes what
# the cleaning step actually produced.
train_data = pd.read_pickle('cleaned_train_data.pkl')
test_data = pd.read_pickle('cleaned_test_data.pkl')
'''
# Step 1: Create additional features
train_data['depreciation_age'] = train_data['depreciation'] * train_data['car_age']
train_data['make_model_depreciation_age'] = train_data['make_encoded'] * train_data['model_encoded'] * train_data['depreciation'] * train_data['car_age']

test_data['depreciation_age'] = test_data['depreciation'] * test_data['car_age']
test_data['make_model_depreciation_age'] = test_data['make_encoded'] * test_data['model_encoded'] * test_data['depreciation'] * test_data['car_age']
'''
# Function to calculate RMSE
def calculate_rmse(y_true, y_pred):
    """Return the root of the mean squared error between *y_true* and *y_pred*."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Preprocessing function for training data
def preprocess_data(data):
    """Build the scaled training matrix and target from the cleaned frame.

    Picks a fixed list of feature columns plus any column whose name contains
    'interaction', imputes missing values with the most frequent value, and
    standardizes the result.

    Returns:
        tuple: (scaled feature array, 'price' Series, fitted imputer,
        fitted scaler). The fitted objects are returned so the same
        transformation can be applied to other data.
    """
    # Feature list chosen from SHAP analysis
    base_features = [
        'arf', 'depreciation', 'car_age', 'power', 'coe', 'road_tax', 'mileage',
        'calculated_price', 'dereg_value_normalized', 'power_to_weight', 'engine_cap'
    ]
    # Any column carrying 'interaction' in its name is included as well
    extra_features = [name for name in data.columns if 'interaction' in name]
    selected_features = base_features + extra_features

    # Mode imputation first ...
    imputer = SimpleImputer(strategy='most_frequent')
    imputed = imputer.fit_transform(data[selected_features])
    target = data['price']  # original, untransformed target

    # ... then standardization
    scaler = StandardScaler()
    scaled = scaler.fit_transform(imputed)

    return scaled, target, imputer, scaler

# Preprocess the data
X, y, imputer, scaler = preprocess_data(train_data)

# Define ResNet block
def resnet_block(input_layer, filters, kernel_size=3):
    """Residual block: two Conv1D + BatchNorm stages added back onto the input.

    The first convolution uses ReLU; the second is linear. The block input is
    added to the result and a final ReLU is applied. Because of the addition,
    *input_layer* must have the same shape as the convolution output.
    """
    layers = tf.keras.layers

    branch = layers.Conv1D(filters, kernel_size, padding='same', activation='relu')(input_layer)
    branch = layers.BatchNormalization()(branch)
    branch = layers.Conv1D(filters, kernel_size, padding='same')(branch)
    branch = layers.BatchNormalization()(branch)

    # Skip connection followed by the closing activation
    merged = layers.Add()([branch, input_layer])
    return layers.Activation('relu')(merged)

# Define ResNet model
def build_resnet_model(input_dim):
    """Build and compile the 1-D ResNet regressor.

    Input shape is (input_dim, 1). The network is a Conv1D stem with batch
    normalization, two residual blocks, global average pooling, two dense
    layers (128 and 64 units) and a single linear output. It is compiled with
    mean squared error loss, AdamW (learning rate 0.001) and an RMSE metric.
    """
    layers = tf.keras.layers

    inputs = tf.keras.Input(shape=(input_dim, 1))
    hidden = layers.Conv1D(64, 3, padding='same', activation='relu')(inputs)
    hidden = layers.BatchNormalization()(hidden)

    # Stack of residual blocks; the number of blocks can be decreased or increased
    for _ in range(2):
        hidden = resnet_block(hidden, 64)

    hidden = layers.GlobalAveragePooling1D()(hidden)
    hidden = layers.Dense(128, activation='relu')(hidden)
    hidden = layers.Dense(64, activation='relu')(hidden)
    outputs = layers.Dense(1, activation='linear')(hidden)

    model = tf.keras.Model(inputs, outputs)
    model.compile(
        loss='mean_squared_error',
        optimizer=tf.keras.optimizers.AdamW(learning_rate=0.001),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    return model

# Reshape input data for Conv1D: (samples, features) -> (samples, features, 1)
X = X.reshape((X.shape[0], X.shape[1], 1))

# Set up cross-validation using KFold for regression
kf = KFold(n_splits=5, shuffle=True, random_state=42)
batch_size = 512
epochs = 1000  # Fewer epochs, to avoid overfitting
rmse_list = []

# Define ReduceLROnPlateau for fold training (watches validation RMSE)
reduce_lr_fold = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_root_mean_squared_error',
    factor=0.5,
    patience=10,  # Lower patience so fewer epochs are wasted
    min_lr=1e-6,
    verbose=1
)

# Define EarlyStopping for fold training (watches validation RMSE)
early_stopping_fold = tf.keras.callbacks.EarlyStopping(
    monitor='val_root_mean_squared_error',
    patience=20,  # Lower patience so training stops in good time
    min_delta=0.01,
    verbose=1
)

# Define ReduceLROnPlateau for final training
reduce_lr_final = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='root_mean_squared_error',  # Monitor the training RMSE metric during final training
    factor=0.5,         # Reduce learning rate by 50% when the monitored metric does not improve
    patience=10,        # Number of epochs to wait before reducing learning rate
    min_lr=1e-6,        # Minimum learning rate
    verbose=1           # Output learning rate reduction information
)

# Define EarlyStopping for final training
early_stopping_final = tf.keras.callbacks.EarlyStopping(
    monitor='root_mean_squared_error',  # Monitor the training RMSE metric during final training
    patience=20,         # Number of epochs to wait before stopping
    min_delta=0.01,      # Minimum change to qualify as an improvement
    verbose=1            # Output stopping information
)

# Cross-validation training
# A fresh model is built for every fold; the fold's RMSE is computed on the
# held-out part and collected in rmse_list.
for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Build and train the ResNet model
    model = build_resnet_model(X_train.shape[1])
    history = model.fit(X_train, y_train, 
                        validation_data=(X_val, y_val),
                        epochs=epochs,
                        batch_size=batch_size,
                        callbacks=[reduce_lr_fold, early_stopping_fold],
                        verbose=1)

    # Predict on validation set and calculate RMSE.
    # Clip with the price range of the training fold only: using the
    # validation labels for the bounds would leak them into the metric and
    # make the reported RMSE too optimistic.
    y_val_pred = model.predict(X_val).flatten()
    y_val_pred = np.clip(y_val_pred, a_min=y_train.min(), a_max=y_train.max())
    rmse = calculate_rmse(y_val, y_val_pred)
    rmse_list.append(rmse)
    print(f'Fold RMSE: {rmse}')

# Calculate average RMSE across folds
average_rmse = np.mean(rmse_list)
print(f'Average Validation RMSE: {average_rmse}')

def preprocess_test_data(data, imputer, scaler):
    """Apply an already-fitted imputer and scaler to the test frame.

    Uses the same fixed feature list as the training preprocessing, plus any
    column whose name contains 'interaction'. Nothing is re-fitted here.

    Returns:
        The imputed and scaled feature array.
    """
    base_features = [
        'arf', 'depreciation', 'car_age', 'power', 'coe', 'road_tax', 'mileage',
        'calculated_price', 'dereg_value_normalized', 'power_to_weight', 'engine_cap'
    ]
    # Any column carrying 'interaction' in its name is included as well
    extra_features = [name for name in data.columns if 'interaction' in name]
    selected_features = base_features + extra_features

    # Impute, then scale, with the objects fitted on the training data
    imputed = imputer.transform(data[selected_features])
    return scaler.transform(imputed)

X_test = preprocess_test_data(test_data, imputer, scaler)
# Add the trailing channel axis expected by Conv1D
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Train final model on the full dataset
# No validation data is passed here, so the callbacks watch the training RMSE.
final_model = build_resnet_model(X.shape[1])
history = final_model.fit(X, y, 
                          epochs=epochs,
                          batch_size=batch_size,
                          callbacks=[reduce_lr_final, early_stopping_final],
                          verbose=1)

# Save the final trained model
final_model.save('optimized_final_resnet_model.h5')
print('Model saved as optimized_final_resnet_model.h5')

# Predict on test data
y_test_pred = final_model.predict(X_test).flatten()
y_test_pred = np.clip(y_test_pred, a_min=train_data['price'].min(), a_max=train_data['price'].max())  # Clip predictions to prevent extreme values

# Prepare the submission file
# 'Id' is taken from the test frame's index; rows are sorted by it before writing.
submission = pd.DataFrame({
    'Id': test_data.index,
    'Predicted': y_test_pred
})
submission = submission.sort_values(by='Id')
submission.to_csv('submission.csv', index=False)
print('Submission file generated: submission.csv')
