# 3. Vision Transformer ViT


In [5]:
import torch
import os
import csv
from PIL import Image
from torchvision.transforms import functional as F
from transformers import ViTFeatureExtractor, ViTModel
import pandas as pd

def encode_image(image_path, feature_extractor, model):
    image = Image.open(image_path)
    image = F.resize(image, (224, 224))  
    inputs = feature_extractor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    image_encoding = outputs.last_hidden_state[:, 0, :]
    return image_encoding.flatten().tolist()

def encode_images_in_folder(input_folder, output_csv):
    # Load pre-trained ViT model and feature extractor
    feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
    model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
    model.eval()

    # Get list of all image files in input folder
    image_files = [f for f in os.listdir(input_folder) if f.endswith(".jpg") or f.endswith(".png")]

    # Encode images and collect encodings
    image_encodings = []
    for image_file in image_files:
        print(image_file)
        image_path = os.path.join(input_folder, image_file)
        encoding = encode_image(image_path, feature_extractor, model)
        image_encodings.append([image_file] + encoding)

    # Transpose the list so that each feature occupies a separate column
    transposed_encodings = list(zip(*image_encodings))

    # Write encodings to CSV
    with open(output_csv, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        # Write header
        csv_writer.writerow(['CarName'] + [f'Feature_{i}' for i in range(len(transposed_encodings) - 1)])
        # Write rows
        for row in zip(*transposed_encodings):
            csv_writer.writerow(row)

# Example usage:
input_folder = "images"
output_csv = "Datasets/ViT_car_features.csv"
encode_images_in_folder(input_folder, output_csv)
print("done")

# Read the first CSV file
df1 = pd.read_csv("Datasets/ViT_car_features.csv")

# Read the second CSV file
df2 = pd.read_csv('Datasets/raw/CarsWithStocksPrices.csv')

# Merge the dataframes based on the "Full Name" column
merged_df = pd.merge(df1, df2, on="CarName", how="left")

# Save the merged dataframe to a new CSV file
merged_df.to_csv("Datasets/ViT_car_features.csv", index=False)


BMW 1 Series 2019.jpg
BMW 2 Series Active Tourer 2022.jpg
BMW 2 Series Gran Coupe 2020.jpg
BMW 3 Series 2014.jpg
BMW 4 Series 2015.jpg
BMW 4 Series Convertible 2021.jpg
BMW 5 Series 2017.jpg
BMW 6 Series 2017.jpg
BMW 7 Series 2015.jpg
BMW 7 Series 2023.jpg
BMW 8 Series 2018.jpg
BMW i3 2016.jpg
BMW i4 2022.jpg
BMW i7 2023.jpg
BMW i8 2014.jpg
BMW iX 2021.jpg
BMW iX3 2020.jpg
BMW M2 2016.jpg
BMW M8 2019.jpg
BMW X1 2016.jpg
BMW X2 2018.jpg
BMW X3 2017.jpg
BMW X3 M 2022.jpg
BMW X4 2021.jpg
BMW X5 2014.jpg
BMW X5 M 2020.jpg
BMW X6 2015.jpg
BMW X7 2019.jpg
BMW XM 2023.jpg
BMW Z4 2018.jpg
Buick Enclave 2017.jpg
Buick Encore 2017.jpg
Buick Envision 2023.jpg
BYD Dolphin 2023.jpg
BYD e3 2019.jpg
BYD e3 DM-i 2021.jpg
BYD e5 2018.jpg
BYD e6 2018.jpg
BYD Han 2020.jpg
BYD Qin 2014.jpg
BYD Qin Plus 2022.jpg
BYD Qin Pro 2022.jpg
BYD Song 2016.jpg
BYD Song Max 2019.jpg
BYD Song Plus 2023.jpg
BYD Song Pro 2020.jpg
BYD Tang 2015.jpg
BYD Tang DM-i 2021.jpg
BYD Yuan 2017.jpg
Cadillac CT5 2021.jpg
Cadillac E

NameError: name 'pd' is not defined

In [8]:
# Read the first CSV file
df1 = pd.read_csv("Datasets/ViT_car_features.csv")

# Read the second CSV file
df2 = pd.read_csv('Datasets/raw/CarsWithStocksPrices.csv')

# Merge the dataframes based on the "Full Name" column
merged_df = pd.merge(df1, df2, on="CarName", how="left")

# Save the merged dataframe to a new CSV file
merged_df.to_csv("Datasets/ViT_car_features.csv", index=False)


# Predict Stock price using HOG Features


First apply PCA to reduce dimension

In [15]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Load the dataset
input_csv = 'Datasets/ViT_car_features.csv'
output_csv = 'Datasets/ViT_car_featuresPCA.csv'

print("Loading dataset...")
df = pd.read_csv(input_csv)
print(f"Dataset loaded with shape: {df.shape}")

# Separating out the features for PCA (starting from the 5th column to the end)
X = df.iloc[:, 3:].values

# Perform PCA
n_components = 199  # Adjust the number of components based on your needs
print(f"Performing PCA to reduce dimensions to {n_components}...")
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(X)

# Creating a DataFrame with the first 5 columns and the principal components
print("Creating DataFrame with the first 5 columns and principal components...")
principalDf = pd.DataFrame(data=principalComponents, columns=[f'PC{i+1}' for i in range(n_components)])

# Combine the first 5 columns with the principal components
result_df = pd.concat([df.iloc[:, :3], principalDf], axis=1)

# Saving the new dataset to a CSV file
print(f"Saving the new dataset to {output_csv}...")
result_df.to_csv(output_csv, index=False)
print("Process completed successfully.")


Loading dataset...
Dataset loaded with shape: (397, 771)
Performing PCA to reduce dimensions to 199...
Creating DataFrame with the first 5 columns and principal components...
Saving the new dataset to Datasets/ViT_car_featuresPCA.csv...
Process completed successfully.


Predict Stock price using Machine learning model

In [16]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the CSV file into a DataFrame
df = pd.read_csv('Datasets/ViT_car_featuresPCA.csv')

# Identify the numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Standardize all numerical columns
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Specify the directory to save the file
output_directory = 'Datasets/ViT'
os.makedirs(output_directory, exist_ok=True)  # Create the directory if it doesn't exist

# Save the standardized DataFrame to a new CSV file
output_file = os.path.join(output_directory, 'ViT_car_featuresPCAstand.csv')
df.to_csv(output_file, index=False)

print("Standardization complete. The standardized data has been saved to 'Standardized_Dataset.csv' in the '{}' directory.".format(output_directory))


Standardization complete. The standardized data has been saved to 'Standardized_Dataset.csv' in the 'Datasets/ViT' directory.


In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Function to perform Linear Regression with Cross-Validation
def perform_linear_regression_cv(df):
    X = df.drop(columns=['Price'])  # Features
    y = df['Price']  # Target variable

    # Linear Regression with Cross-Validation
    linear_reg = LinearRegression()
    mse_scores = -cross_val_score(linear_reg, X, y, cv=5, scoring='neg_mean_squared_error')
    return mse_scores.mean()

# Function to perform XGBoost Regression with Cross-Validation
def perform_xgboost_regression_cv(df):
    X = df.drop(columns=['Price'])  # Features
    y = df['Price']  # Target variable

    # Best parameters for XGBoost Regression
    best_params = {
        'learning_rate': 0.1,
        'max_depth': 3,
        'n_estimators': 100,
        'subsample': 0.8
    }

    # XGBoost Regression with Cross-Validation
    xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', **best_params)
    mse_scores = -cross_val_score(xgb_reg, X, y, cv=5, scoring='neg_mean_squared_error')
    return mse_scores.mean()

# Function to perform Random Forest Regression with Cross-Validation
def perform_random_forest_regression_cv(df):
    X = df.drop(columns=['Price'])  # Features
    y = df['Price']  # Target variable

    # Random Forest Regression with Cross-Validation
    rf_reg = RandomForestRegressor(random_state=42)
    mse_scores = -cross_val_score(rf_reg, X, y, cv=5, scoring='neg_mean_squared_error')
    return mse_scores.mean()

# Function to perform Neural Network Regression with Cross-Validation
def perform_neural_network_regression_cv(df):
    X = df.drop(columns=['Price'])  # Features
    y = df['Price']  # Target variable
    
    # Define the neural network architecture
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[X.shape[1]]),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    
    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Define K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    mse_scores = []
    
    # Perform K-fold cross-validation
    for train_idx, val_idx in kfold.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # Train the model
        model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
        
        # Evaluate the model on the validation set
        val_loss = model.evaluate(X_val, y_val, verbose=0)
        mse_scores.append(val_loss)
    
    return np.mean(mse_scores)

# Folder containing individual CSV files
folder_path = "Datasets/ViT/test"

# Function to perform data preprocessing and feature engineering
from category_encoders import TargetEncoder

# Function to preprocess data with enhanced feature engineering and encoding
def preprocess_data(df):
       
    # Drop original 'Date' and 'Quarter_Date' columns
    df.drop(columns=['CarName'], inplace=True)

    # Use target encoding for the 'Stock' column
    encoder = TargetEncoder()
    df['Stock_Encoded'] = encoder.fit_transform(df['Stock'], df['Price'])

    # Drop the original 'Stock' column
    df.drop(columns=['Stock'], inplace=True)
    
    return df

# Iterate over CSV files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        print("Processing file:", filename)

        df = pd.read_csv(file_path)
        
        # Preprocess data
        df = preprocess_data(df)
        
        # Assuming 'Price' is the target column
        if 'Price' in df.columns:
            # Perform Linear Regression with Cross-Validation
            linear_reg_cv_mse = perform_linear_regression_cv(df)
            print("Cross-Validation MSE (Linear Regression):", linear_reg_cv_mse)
            
            # Perform XGBoost Regression with Cross-Validation
            xgb_reg_cv_mse = perform_xgboost_regression_cv(df)
            print("Cross-Validation MSE (XGBoost Regression):", xgb_reg_cv_mse)
            
            # Perform Random Forest Regression with Cross-Validation
            rf_reg_cv_mse = perform_random_forest_regression_cv(df)
            print("Cross-Validation MSE (Random Forest Regression):", rf_reg_cv_mse)
            
            # Perform Neural Network Regression with Cross-Validation
            nn_reg_cv_mse = perform_neural_network_regression_cv(df)
            print("Cross-Validation MSE (Neural Network Regression):", nn_reg_cv_mse)
        else:
            print("Error: 'Price' column not found in", filename)


Processing file: ViT_car_featuresPCAstand.csv
Cross-Validation MSE (Linear Regression): 1.4709974842479585
Cross-Validation MSE (XGBoost Regression): 0.8466982810229509
Cross-Validation MSE (Random Forest Regression): 0.8800910970506587


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Cross-Validation MSE (Neural Network Regression): 0.17451017184648662


In [None]:
Processing file: ViT_car_featuresPCAstand.csv
Cross-Validation MSE (Linear Regression): 1.4709974842479585
Cross-Validation MSE (XGBoost Regression): 0.8466982810229509
Cross-Validation MSE (Random Forest Regression): 0.8800910970506587
D:\Python\Python311\Lib\site-packages\keras\src\layers\core\dense.py:87: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
Cross-Validation MSE (Neural Network Regression): 0.2712056694552302

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers



# Function to perform Random Forest Regression with Cross-Validation
def perform_random_forest_regression_cv(df):
    X = df.drop(columns=['Price'])  # Features
    y = df['Price']  # Target variable

    # Random Forest Regression with Cross-Validation
    rf_reg = RandomForestRegressor(random_state=42)
    mse_scores = -cross_val_score(rf_reg, X, y, cv=5, scoring='neg_mean_squared_error')
    return mse_scores.mean()



# Folder containing individual CSV files
folder_path = "Datasets/ViT"

# Function to perform data preprocessing and feature engineering
from category_encoders import TargetEncoder

# Function to preprocess data with enhanced feature engineering and encoding
def preprocess_data(df):
       
    # Drop original 'Date' and 'Quarter_Date' columns
    df.drop(columns=['CarName'], inplace=True)

    # Use target encoding for the 'Stock' column
    encoder = TargetEncoder()
    df['Stock_Encoded'] = encoder.fit_transform(df['Stock'], df['Price'])

    # Drop the original 'Stock' column
    df.drop(columns=['Stock'], inplace=True)
    
    return df

# Iterate over CSV files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        print("Processing file:", filename)

        df = pd.read_csv(file_path)
        
        # Preprocess data
        df = preprocess_data(df)
        
        # Assuming 'Price' is the target column
        if 'Price' in df.columns:

            
            # Perform Random Forest Regression with Cross-Validation
            rf_reg_cv_mse = perform_random_forest_regression_cv(df)
            print("Cross-Validation MSE (Random Forest Regression):", rf_reg_cv_mse)

            print("Cross-Validation MSE (Neural Network Regression):", nn_reg_cv_mse)
        else:
            print("Error: 'Price' column not found in", filename)


In [18]:
# Function to perform Random Forest Regression with Cross-Validation and Grid Search
from sklearn.model_selection import GridSearchCV

def perform_random_forest_regression_cv_with_grid_search(df):
    X = df.drop(columns=['Price'])  # Features
    y = df['Price']  # Target variable

    # Define the parameter grid
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    # Random Forest Regression with Cross-Validation and Grid Search
    rf_reg = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(estimator=rf_reg, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X, y)

    # Print the best parameters found
    print("Best parameters found:", grid_search.best_params_)

    # Print the best mean squared error found
    print("Best mean squared error found:", -grid_search.best_score_)

    return -grid_search.best_score_

# Iterate over CSV files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        print("Processing file:", filename)

        df = pd.read_csv(file_path)
        
        # Preprocess data
        df = preprocess_data(df)
        
        # Assuming 'Price' is the target column
        if 'Price' in df.columns:
            # Perform Random Forest Regression with Cross-Validation and Grid Search
            rf_reg_cv_mse = perform_random_forest_regression_cv_with_grid_search(df)
            print("Cross-Validation MSE (Random Forest Regression with Grid Search):", rf_reg_cv_mse)
        else:
            print("Error: 'Price' column not found in", filename)


Processing file: ViT_car_featuresPCAstand.csv
Best parameters found: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best mean squared error found: 0.8765726919555634
Cross-Validation MSE (Random Forest Regression with Grid Search): 0.8765726919555634


In [3]:
import pandas as pd

# Load the datasets
dataset2 = pd.read_csv('Datasets/ViT/ViT_car_featuresPCAstand.csv')
dataset1 = pd.read_csv('Datasets/raw/CarsWithStocksPrices.csv')

# Extract 'CarName' and 'Date' columns from dataset1
date_info = dataset1[['CarName', 'Date']]

# Merge the date_info with dataset2 based on 'CarName'
dataset2_with_date = pd.merge(dataset2, date_info, on='CarName', how='left')

dataset2_with_date = dataset2_with_date.drop_duplicates(subset='CarName')

# Save the new dataset to a CSV file
dataset2_with_date.to_csv('dataset2_with_date.csv', index=False)

print("New dataset saved as 'dataset2_with_date.csv'")

New dataset saved as 'dataset2_with_date.csv'
