# 3. Convolutional Neural Network


In [8]:
import os
import cv2
import numpy as np
from keras.applications import VGG16
from keras.applications.vgg16 import preprocess_input
import pandas as pd

def compute_CNN(image, resize_dim=(224, 224)):
    # Resize the image
    resized_image = cv2.resize(image, resize_dim)
    # Preprocess input for VGG16 model
    preprocessed_image = preprocess_input(resized_image)
    # Expand dimensions to match VGG input shape
    preprocessed_image = np.expand_dims(preprocessed_image, axis=0)
    # Use pre-trained VGG16 model
    model = VGG16(weights='imagenet', include_top=False)
    # Extract features
    features = model.predict(preprocessed_image)
    # Flatten the features
    flattened_features = features.flatten()
    return flattened_features

# Function to save CNN features along with car name and image name to a CSV file
def save_to_csv(data, filename):
    np.savetxt(filename, data, delimiter=',', fmt='%s')

# Path to the folder containing images
folder_path = "images/"
# Load the CSV containing car name and image name mapping
csv_data = np.genfromtxt('Datasets/raw/CarsWithStocksPrices.csv', delimiter=',', dtype=str, skip_header=1)

# Initialize list to store extracted features
feature_data = []

# Iterate over each entry in the mapping CSV
for entry in csv_data:
    # Extract car name and image name
    car_name = entry[0]
    image_name = entry[4]
    price = entry[3]
    stock = entry[2]
    # Construct full path to the image
    image_path = os.path.join(folder_path, image_name+  '.jpg')
    print("Image path:", image_path)  # Print the image path for debugging
    # Load the image
    image = cv2.imread(image_path)
    if image is None:
        print("Error: Unable to load image", image_path)
        continue
    # Compute CNN features
    cnn_features = compute_CNN(image)
    # Append car name, image name, and features to the list
    feature_data.append(np.hstack((car_name, image_name, price, stock, cnn_features)))

# Save the feature data to CSV
save_to_csv(feature_data, 'Datasets/CNN_car_features.csv')

# Read the CSV file into a DataFrame
df = pd.read_csv('Datasets/CNN_car_features.csv')
df = df.iloc[:, 1:]

# Rename the columns
df.columns = ["CarName", "Price","Stock"] + [f"Feature_{i}" for i in range(1, len(df.columns) - 2)]

# Save the DataFrame back to the same CSV file
df.to_csv('Datasets/CNN_car_features.csv', index=False)



print("Features saved to CNN_car_features.csv")


Image path: images/BMW 1 Series 2019.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 463ms/step
Image path: images/BMW 2 Series Active Tourer 2022.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 467ms/step
Image path: images/BMW 2 Series Gran Coupe 2020.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 531ms/step
Image path: images/BMW 3 Series 2014.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 499ms/step
Image path: images/BMW 4 Series 2015.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 853ms/step
Image path: images/BMW 4 Series Convertible 2021.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 514ms/step
Image path: images/BMW 5 Series 2017.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 524ms/step
Image path: images/BMW 6 Series 2017.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 495ms/step
Image path: images/BMW 7 Series 2015.jpg
[1m1/1[0

  df = pd.read_csv('Datasets/CNN_car_features.csv')


Features saved to CNN_car_features.csv


# Predict Stock price using CNN Features


First apply PCA to reduce dimension

In [19]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Load the dataset
input_csv = 'Datasets/CNN_car_features.csv'
output_csv = 'Datasets/CNN_car_featuresPCA.csv'

print("Loading dataset...")
df = pd.read_csv(input_csv)
print(f"Dataset loaded with shape: {df.shape}")

# Separating out the features for PCA (starting from the 5th column to the end)
X = df.iloc[:, 3:].values

# Perform PCA
n_components = 199  # Adjust the number of components based on your needs
print(f"Performing PCA to reduce dimensions to {n_components}...")
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(X)

# Creating a DataFrame with the first 5 columns and the principal components
print("Creating DataFrame with the first 5 columns and principal components...")
principalDf = pd.DataFrame(data=principalComponents, columns=[f'PC{i+1}' for i in range(n_components)])

# Combine the first 5 columns with the principal components
result_df = pd.concat([df.iloc[:, :3], principalDf], axis=1)

# Saving the new dataset to a CSV file
print(f"Saving the new dataset to {output_csv}...")
result_df.to_csv(output_csv, index=False)
print("Process completed successfully.")


Loading dataset...


  df = pd.read_csv(input_csv)


Dataset loaded with shape: (398, 25091)
Performing PCA to reduce dimensions to 199...
Creating DataFrame with the first 5 columns and principal components...
Saving the new dataset to Datasets/CNN_car_featuresPCA.csv...
Process completed successfully.


Dimension reduced from 1600 to 200

Predict Stock price using Machine learning model

In [8]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the CSV file into a DataFrame
df = pd.read_csv('Datasets/CNN_car_featuresPCA.csv')

# Identify the numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Standardize all numerical columns
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Specify the directory to save the file
output_directory = 'Datasets/CNN'
os.makedirs(output_directory, exist_ok=True)  # Create the directory if it doesn't exist

# Save the standardized DataFrame to a new CSV file
output_file = os.path.join(output_directory, 'CNN_car_featuresPCAstand.csv')
df.to_csv(output_file, index=False)

print("Standardization complete. The standardized data has been saved to 'Standardized_Dataset.csv' in the '{}' directory.".format(output_directory))


Standardization complete. The standardized data has been saved to 'Standardized_Dataset.csv' in the 'Datasets/CNN' directory.


In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Function to perform Linear Regression with Cross-Validation
def perform_linear_regression_cv(df):
    X = df.drop(columns=['Price'])  # Features
    y = df['Price']  # Target variable

    # Linear Regression with Cross-Validation
    linear_reg = LinearRegression()
    mse_scores = -cross_val_score(linear_reg, X, y, cv=5, scoring='neg_mean_squared_error')
    return mse_scores.mean()

# Function to perform XGBoost Regression with Cross-Validation
def perform_xgboost_regression_cv(df):
    X = df.drop(columns=['Price'])  # Features
    y = df['Price']  # Target variable

    # Best parameters for XGBoost Regression
    best_params = {
        'learning_rate': 0.1,
        'max_depth': 3,
        'n_estimators': 100,
        'subsample': 0.8
    }

    # XGBoost Regression with Cross-Validation
    xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', **best_params)
    mse_scores = -cross_val_score(xgb_reg, X, y, cv=5, scoring='neg_mean_squared_error')
    return mse_scores.mean()

# Function to perform Random Forest Regression with Cross-Validation
def perform_random_forest_regression_cv(df):
    X = df.drop(columns=['Price'])  # Features
    y = df['Price']  # Target variable

    # Random Forest Regression with Cross-Validation
    rf_reg = RandomForestRegressor(random_state=42)
    mse_scores = -cross_val_score(rf_reg, X, y, cv=5, scoring='neg_mean_squared_error')
    return mse_scores.mean()

# Function to perform Neural Network Regression with Cross-Validation
def perform_neural_network_regression_cv(df):
    X = df.drop(columns=['Price'])  # Features
    y = df['Price']  # Target variable
    
    # Define the neural network architecture
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[X.shape[1]]),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    
    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Define K-fold cross-validation
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    mse_scores = []
    
    # Perform K-fold cross-validation
    for train_idx, val_idx in kfold.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # Train the model
        model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
        
        # Evaluate the model on the validation set
        val_loss = model.evaluate(X_val, y_val, verbose=0)
        mse_scores.append(val_loss)
    
    return np.mean(mse_scores)

# Folder containing individual CSV files
folder_path = "Datasets/CNN"

# Function to perform data preprocessing and feature engineering
from category_encoders import TargetEncoder

# Function to preprocess data with enhanced feature engineering and encoding
def preprocess_data(df):

    # Drop original 'Date' and 'Quarter_Date' columns
    df.drop(columns=['CarName'], inplace=True)

    # Use target encoding for the 'Stock' column
    encoder = TargetEncoder()
    df['Stock_Encoded'] = encoder.fit_transform(df['Stock'], df['Price'])

    # Drop the original 'Stock' column
    df.drop(columns=['Stock'], inplace=True)
    
    return df

# Iterate over CSV files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        print("Processing file:", filename)

        df = pd.read_csv(file_path)
        
        # Preprocess data
        df = preprocess_data(df)
        
        # Assuming 'Price' is the target column
        if 'Price' in df.columns:
            # Perform Linear Regression with Cross-Validation
            linear_reg_cv_mse = perform_linear_regression_cv(df)
            print("Cross-Validation MSE (Linear Regression):", linear_reg_cv_mse)
            
            # Perform XGBoost Regression with Cross-Validation
            xgb_reg_cv_mse = perform_xgboost_regression_cv(df)
            print("Cross-Validation MSE (XGBoost Regression):", xgb_reg_cv_mse)
            
            # Perform Random Forest Regression with Cross-Validation
            rf_reg_cv_mse = perform_random_forest_regression_cv(df)
            print("Cross-Validation MSE (Random Forest Regression):", rf_reg_cv_mse)
            
            # Perform Neural Network Regression with Cross-Validation
            nn_reg_cv_mse = perform_neural_network_regression_cv(df)
            print("Cross-Validation MSE (Neural Network Regression):", nn_reg_cv_mse)
        else:
            print("Error: 'Price' column not found in", filename)


Processing file: CNN_car_featuresPCAstand.csv
Cross-Validation MSE (Linear Regression): 2.204167654192342
Cross-Validation MSE (XGBoost Regression): 0.46913157372044045
Cross-Validation MSE (Random Forest Regression): 0.3734591126003731


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Cross-Validation MSE (Neural Network Regression): 0.24715637173503638
