In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from category_encoders import LeaveOneOutEncoder
from sklearn.impute import SimpleImputer

import joblib
import pickle



# Load data
data = pd.read_csv("E:/Licenta/AIModel/AIModel/car_price_prediction.csv")

# Check for duplicate rows and remove them
data = data.drop_duplicates()

# Transform column names to snake case
data.columns = data.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')

# Process 'engine_volume' and create 'is_turbo' feature
data['engine_volume'] = data['engine_volume'].astype(str)
data['is_turbo'] = data['engine_volume'].apply(lambda x: 1 if 'Turbo' in x else 0)
data['engine_volume'] = data['engine_volume'].str.replace(' Turbo', '').astype(float)

# Drop unnecessary columns
data = data.drop(['id'], axis=1)
data = data.drop(['levy'], axis=1)

# Handle NaN and infinity values
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Remove rows with NaN values
data = data.dropna()

# Process 'mileage'
data['mileage'] = data['mileage'].str.replace(' km', '').astype(int)

# Map 'leather_interior' to binary values
data['leather_interior'] = data['leather_interior'].map({'Yes': 1, 'No': 0})

# Ensure all values in 'prod._year' and 'mileage' are positive
data = data[(data['prod._year'] > 0) & (data['mileage'] > 0)]

data['prod._year'] = np.log(data['prod._year'] + 1)
data['mileage'] = np.log(data['mileage'] + 1)

# Encode categorical features
categorical_columns = ['manufacturer', 'model', 'category', 'fuel_type', 'gear_box_type', 'drive_wheels', 'doors', 'wheel', 'color']
encoder = LeaveOneOutEncoder(cols=categorical_columns)
data = encoder.fit_transform(data, data['price'])

# Split data into features and target variable
x = data.drop(['price'], axis=1)
y = data['price']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Check for NaN values
print("NaN values in x_train:\n", x_train.isnull().sum())
print("NaN values in x_test:\n", x_test.isnull().sum())

# Impute missing values (if any) in x_train and x_test
imputer = SimpleImputer(strategy='mean')
x_train_s = imputer.fit_transform(x_train)
x_test_s = imputer.transform(x_test)

# Define models
reg = LinearRegression()
forest = RandomForestRegressor(n_estimators=150)
bagging = BaggingRegressor()

models = {'RandomForest': forest, 'Bagging': bagging, 'LinearRegression': reg}

# Results dictionary
results = {'Model': [], 'MAE (Train)': [], 'MAE (Test)': [], 'R-squared (Train)': [], 'R-squared (Test)': []}

# Loop through models
for model_name, model in models.items():
    # Train the model on the scaled training data
    model.fit(x_train_s, y_train)

    # # Save the trained model
    # with open(f'{model_name}_model.pkl', 'wb') as file:
    #     pickle.dump(model, file)
    joblib.dump(model, f'{model_name}_model.joblib')

    # Predictions on the scaled training set
    y_train_pred = model.predict(x_train_s)

    # Predictions on the scaled evaluation set
    y_test_pred = model.predict(x_test_s)

    # Calculate MAE and R-squared for training and evaluation sets
    mae_train = mean_absolute_error(y_train, y_train_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)

    # Append results to the dictionary
    results['Model'].append(model_name)
    results['MAE (Train)'].append(mae_train)
    results['MAE (Test)'].append(mae_test)
    results['R-squared (Train)'].append(round(r2_train, 3))
    results['R-squared (Test)'].append(round(r2_test, 3))


    # Print the first 10 predicted price vs actual price for each row in the test set
    print(f"\n{model_name} Model Predictions:")
    print("Predicted Price vs Actual Price (First 10)")
    for i, (actual, predicted) in enumerate(zip(y_test, y_test_pred)):
        print(f"Actual: {actual}, Predicted: {predicted}")
        if i == 9:  # Stop after printing 10 pairs
            break

# Save the encoder and training columns
joblib.dump(encoder, 'encoder.joblib')
joblib.dump(x_train.columns, 'train_columns.joblib')

# Create DataFrame from results dictionary
results_df = pd.DataFrame(results)

# Display the results
print(results_df)


  result = getattr(ufunc, method)(*inputs, **kwargs)


NaN values in x_train:
 manufacturer        0
model               0
prod._year          0
category            0
leather_interior    0
fuel_type           0
engine_volume       0
mileage             6
cylinders           0
gear_box_type       0
drive_wheels        0
doors               0
wheel               0
color               0
airbags             0
is_turbo            0
dtype: int64
NaN values in x_test:
 manufacturer        0
model               0
prod._year          0
category            0
leather_interior    0
fuel_type           0
engine_volume       0
mileage             1
cylinders           0
gear_box_type       0
drive_wheels        0
doors               0
wheel               0
color               0
airbags             0
is_turbo            0
dtype: int64

RandomForest Model Predictions:
Predicted Price vs Actual Price (First 10)
Actual: 1400, Predicted: 1396.46
Actual: 1019, Predicted: 1019.0
Actual: 34497, Predicted: 34497.0
Actual: 1019, Predicted: 1019.0
Actual: 13485, P

In [None]:
#------------------------------------------------------------------------------
#BEST APROXIMATION
forest = RandomForestRegressor()
param_grid = {
    "n_estimators": [150, 200],
    
}

grid_search = GridSearchCV(forest, param_grid, cv=5,
                           scoring="neg_mean_squared_error",
                           return_train_score=True)
grid_search.fit(x_train, y_train)

best_forest = grid_search.best_estimator_
print("Best Estimator:", best_forest)
best_forest.score(x_test, y_test)

In [18]:
# Define the car instance
# car_instance = pd.DataFrame({
#     'manufacturer': ['LEXUS'],
#     'model': ['GX 460'],
#     'prod._year': [2015],
#     'category': ['Jeep'],
#     'leather_interior': ['Yes'],
#     'fuel_type': ['Petrol'],
#     'engine_volume': ['4.6'],
#     'mileage': ['102907 km'],
#     'cylinders': [8.0],
#     'gear_box_type': ['Automatic'],
#     'drive_wheels': ['4x4'],
#     'doors': ['4-May'],
#     'wheel': ['Left wheel'],
#     'color': ['Black'],
#     'airbags':[0.0]
# })
import pandas as pd
import numpy as np
import joblib

car_instance = pd.DataFrame({
    'manufacturer': ['FORD'],
    'model': ['Explorer'],
    'prod._year': [2020],
    'category': ['Sedan'],
    'leather_interior': ['Yes'],
    'fuel_type': ['Hybrid'],
    'engine_volume': ['3.0'],
    'mileage': ['1515860 km'],
    'cylinders': [6.0],
    'gear_box_type': ['Automatic'],
    'drive_wheels': ['Rear'],
    'doors': ['4-May'],
    'wheel': ['Left wheel'],
    'color': ['Silver'],
    'airbags':[0.0]
})

# Preprocess the specific car instance
car_instance.columns = car_instance.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')
car_instance['leather_interior'] = car_instance['leather_interior'].map({'Yes': 1, 'No': 0})
car_instance['mileage'] = car_instance['mileage'].str.replace(' km', '').astype(int)

car_instance['engine_volume'] = car_instance['engine_volume'].astype(str)
car_instance['is_turbo'] = car_instance['engine_volume'].apply(lambda x: 1 if 'Turbo' in x else 0)
car_instance['engine_volume'] = car_instance['engine_volume'].str.replace(' Turbo', '').astype(float)

# Apply logarithmic transformation to the car instance
car_instance['prod._year'] = np.log(car_instance['prod._year'] + 1)
car_instance['mileage'] = np.log(car_instance['mileage'] + 1)

# Temporarily add the 'price' column to align with the encoder's expected input dimensions
car_instance['price'] =0

# Load the encoder that was fitted on the training data
encoder = joblib.load('encoder.joblib')

# Apply LeaveOneOut encoding to categorical columns using the same encoder as the training data
categorical_columns = ['manufacturer', 'model', 'category', 'fuel_type', 'gear_box_type', 'drive_wheels', 'doors', 'wheel', 'color']
car_instance_encoded = encoder.transform(car_instance)

# Drop the temporary 'price' column
car_instance_encoded = car_instance_encoded.drop(columns=['price'])

# # Ensure that we are not duplicating any columns
# encoded_columns = [col for col in car_instance_encoded.columns if col not in car_instance.columns]
# car_instance = car_instance.drop(columns=categorical_columns)
# car_instance = pd.concat([car_instance, car_instance_encoded[encoded_columns]], axis=1)

# # Ensure all columns match the training data
# x_train_columns = joblib.load('train_columns.joblib')  # Get the columns from the training data
# car_instance_encoded = car_instance_encoded.reindex(columns=x_train_columns, fill_value=0)

# Load the scaler and scale the features using the same scaler as the training data
# scaler = joblib.load('scaler.joblib')
car_instance_scaled = car_instance_encoded #scaler.transform(car_instance_encoded)

print(car_instance_scaled);

# Load the best estimator model
#forest = pickle.load(open('RandomForest_model.pkl', 'rb'))
forest = joblib.load('RandomForest_model.joblib')
bagging = joblib.load('Bagging_model.joblib')
# linear = joblib.load('LinearRegression_model.joblib')

# Predict the price using the best estimator
predicted_price_forest = forest.predict(car_instance_scaled)
# predicted_price_bagging = bagging.predict(car_instance_scaled)
# predicted_price_linear = linear.predict(car_instance_scaled)
print(f"Predicted Price Forest: {predicted_price_forest[0]}")
print(f"Predicted Price Bagging: {predicted_price_bagging[0]}")
# print(f"Predicted Price Linear: {predicted_price_linear[0]}")

   manufacturer         model  prod._year      category  leather_interior  \
0  15756.086789  64797.219512    7.611348  14462.736422                 1   

      fuel_type  engine_volume    mileage  cylinders  gear_box_type  \
0  10944.138226            3.0  14.231494        6.0   16293.358366   

   drive_wheels        doors         wheel         color  airbags  is_turbo  
0  17742.433632  18927.68391  19717.067758  15513.357523      0.0         0  
Predicted Price Forest: 19234.7
Predicted Price Bagging: 19636.0




In [8]:

import pandas as pd
import numpy as np
import joblib


car_instance = pd.DataFrame({
    'manufacturer': ['LEXUS'],
    'model': ['GX 460'],
    'prod._year': [2010],
    'category': ['Jeep'],
    'leather_interior': ['Yes'],
    'fuel_type': ['Petrol'],
    'engine_volume': ['4.6'],
    'mileage': ['275240 km'],
    'cylinders': [8.0],
    'gear_box_type': ['Automatic'],
    'drive_wheels': ['4x4'],
    'doors': ['4-May'],
    'wheel': ['Left wheel'],
    'color': ['Silver'],
    'airbags':[0.0]
})

# Preprocess the specific car instance
car_instance.columns = car_instance.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')
car_instance['leather_interior'] = car_instance['leather_interior'].map({'Yes': 1, 'No': 0})
car_instance['mileage'] = car_instance['mileage'].str.replace(' km', '').astype(int)

car_instance['engine_volume'] = car_instance['engine_volume'].astype(str)
car_instance['is_turbo'] = car_instance['engine_volume'].apply(lambda x: 1 if 'Turbo' in x else 0)
car_instance['engine_volume'] = car_instance['engine_volume'].str.replace(' Turbo', '').astype(float)

# Apply logarithmic transformation to the car instance
car_instance['prod._year'] = np.log(car_instance['prod._year'] + 1)
car_instance['mileage'] = np.log(car_instance['mileage'] + 1)

# Temporarily add the 'price' column to align with the encoder's expected input dimensions
car_instance['price'] =0

# Load the encoder that was fitted on the training data
encoder = joblib.load('encoder.joblib')

# Apply LeaveOneOut encoding to categorical columns using the same encoder as the training data
categorical_columns = ['manufacturer', 'model', 'category', 'fuel_type', 'gear_box_type', 'drive_wheels', 'doors', 'wheel', 'color']
car_instance_encoded = encoder.transform(car_instance)

# Drop the temporary 'price' column
car_instance_encoded = car_instance_encoded.drop(columns=['price'])

# # Ensure that we are not duplicating any columns
# encoded_columns = [col for col in car_instance_encoded.columns if col not in car_instance.columns]
# car_instance = car_instance.drop(columns=categorical_columns)
# car_instance = pd.concat([car_instance, car_instance_encoded[encoded_columns]], axis=1)

# # Ensure all columns match the training data
# x_train_columns = joblib.load('train_columns.joblib')  # Get the columns from the training data
# car_instance_encoded = car_instance_encoded.reindex(columns=x_train_columns, fill_value=0)

# Load the scaler and scale the features using the same scaler as the training data
# scaler = joblib.load('scaler.joblib')
car_instance_scaled = car_instance_encoded #scaler.transform(car_instance_encoded)

print(car_instance_scaled);

# Load the best estimator model
#forest = pickle.load(open('RandomForest_model.pkl', 'rb'))
forest = joblib.load('RandomForest_model.joblib')
bagging = joblib.load('Bagging_model.joblib')
# linear = joblib.load('LinearRegression_model.joblib')

# Predict the price using the best estimator
predicted_price_forest = forest.predict(car_instance_scaled)
predicted_price_bagging = bagging.predict(car_instance_scaled)
# predicted_price_linear = linear.predict(car_instance_scaled)
print(f"Predicted Price Forest: {predicted_price_forest[0]}")
print(f"Predicted Price Bagging: {predicted_price_bagging[0]}")
# print(f"Predicted Price Linear: {predicted_price_linear[0]}")

   manufacturer         model  prod._year      category  leather_interior  \
0  20353.821549  27830.333333    7.606387  23957.405742                 1   

      fuel_type  engine_volume    mileage  cylinders  gear_box_type  \
0  17682.247119            4.6  12.525402        8.0   16293.358366   

   drive_wheels        doors         wheel         color  airbags  is_turbo  
0  19699.168985  18927.68391  19717.067758  15513.357523      0.0         0  
Predicted Price Forest: 19174.453333333335
Predicted Price Bagging: 19636.0




In [None]:
y_test_score = forest.score(x_test_s,y_test)

In [None]:
y_test_score


In [None]:
# Display the 140th row from the DataFrame 'data'
row_140 = data.iloc[104]  # Indexing starts from 0, so the 140th row is at index 139
print(row_140)


In [None]:
car_instance

In [None]:
data


In [4]:
data

Unnamed: 0,price,manufacturer,model,prod._year,category,leather_interior,fuel_type,engine_volume,mileage,cylinders,gear_box_type,drive_wheels,doors,wheel,color,airbags,is_turbo
0,13328,20361.715730,9234.750000,7.606387,23959.440467,1,10943.435436,3.5,12.133534,6.0,16293.589151,19700.841207,17472.767137,19717.446864,15513.970011,12,0
1,16621,15005.221359,7349.722222,7.606885,23958.810107,0,17682.358303,3.0,12.165256,6.0,26301.014711,19699.976903,17472.577612,19717.251469,19056.173137,8,0
2,8467,14789.403333,12011.632099,7.604396,11631.998500,0,17683.212572,1.3,12.206078,4.0,14721.324401,18893.444508,17473.046906,9116.750554,19057.884365,2,0
3,3607,15767.813707,8022.142857,7.606885,23961.301302,1,10946.301297,2.5,12.037459,4.0,16294.345708,19703.392651,17473.326619,19718.023675,18436.073588,0,0
4,11726,14785.782222,12003.585185,7.608374,11630.776528,1,17682.871137,1.3,11.428478,4.0,16293.713830,18893.179139,17472.859338,19717.541921,15514.419002,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,8467,18611.914905,7682.166667,7.600902,21596.086275,1,8393.520681,2.0,12.611541,4.0,27861.291491,17746.817108,53811.185754,19717.735299,15515.332399,5,1
19233,15681,22496.198807,14593.968300,7.606885,14462.589039,1,17682.456784,2.4,11.992886,4.0,26301.328987,18892.857096,17472.631712,19717.307245,14128.558419,8,0
19234,26108,22493.370762,30405.453461,7.606387,23956.994066,1,31175.277507,2.0,11.664496,4.0,16292.594521,18892.008061,17472.031597,19716.688542,19452.873239,4,0
19235,5331,15016.182524,17842.057554,7.604894,23960.971286,1,31180.606309,2.0,10.844646,4.0,16294.211534,18893.699862,17473.227396,19717.921379,19058.542497,4,0


In [None]:
# Check for missing values
print(data.isnull().sum())


In [None]:
data.dropna()

In [45]:
import pandas as pd
import numpy as np
import joblib

car_instance = pd.DataFrame({
    'manufacturer': ['LEXUS'],
    'model': ['RX 450'],
    'prod._year': [2010],
    'category': ['Jeep'],
    'leather_interior': ['Yes'],
    'fuel_type': ['Hybrid'],
    'engine_volume': ['3.5'],
    'mileage': ['186005 km'],
    'cylinders': [6.0],
    'gear_box_type': ['Automatic'],
    'drive_wheels': ['4x4'],
    'doors': ['4-May'],
    'wheel': ['Left wheel'],
    'color': ['Silver'],
    'airbags':[12]
})



# Preprocess the specific car instance
car_instance.columns = car_instance.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')
car_instance['leather_interior'] = car_instance['leather_interior'].map({'Yes': 1, 'No': 0})
car_instance['mileage'] = car_instance['mileage'].str.replace(' km', '').astype(int)

car_instance['engine_volume'] = car_instance['engine_volume'].astype(str)
car_instance['is_turbo'] = car_instance['engine_volume'].apply(lambda x: 1 if 'Turbo' in x else 0)
car_instance['engine_volume'] = car_instance['engine_volume'].str.replace(' Turbo', '').astype(float)

# Apply logarithmic transformation to the car instance
car_instance['prod._year'] = np.log(car_instance['prod._year'] + 1)
car_instance['mileage'] = np.log(car_instance['mileage'] + 1)

# Temporarily add the 'price' column to align with the encoder's expected input dimensions
car_instance['price'] =0

# Load the encoder that was fitted on the training data
encoder = joblib.load('encoder.joblib')

# Apply LeaveOneOut encoding to categorical columns using the same encoder as the training data
categorical_columns = ['manufacturer', 'model', 'category', 'fuel_type', 'gear_box_type', 'drive_wheels', 'doors', 'wheel', 'color']
car_instance_encoded = encoder.transform(car_instance)

# Drop the temporary 'price' column
car_instance_encoded = car_instance_encoded.drop(columns=['price'])

# # Ensure that we are not duplicating any columns
# encoded_columns = [col for col in car_instance_encoded.columns if col not in car_instance.columns]
# car_instance = car_instance.drop(columns=categorical_columns)
# car_instance = pd.concat([car_instance, car_instance_encoded[encoded_columns]], axis=1)

# # Ensure all columns match the training data
# x_train_columns = joblib.load('train_columns.joblib')  # Get the columns from the training data
# car_instance_encoded = car_instance_encoded.reindex(columns=x_train_columns, fill_value=0)

# Load the scaler and scale the features using the same scaler as the training data
# scaler = joblib.load('scaler.joblib')
car_instance_scaled = car_instance_encoded #scaler.transform(car_instance_encoded)

print(car_instance_scaled);

# Load the best estimator model
#forest = pickle.load(open('RandomForest_model.pkl', 'rb'))
forest = joblib.load('RandomForest_model.joblib')
bagging = joblib.load('Bagging_model.joblib')
# linear = joblib.load('LinearRegression_model.joblib')

# Predict the price using the best estimator
predicted_price_forest = forest.predict(car_instance_scaled)
# predicted_price_bagging = bagging.predict(car_instance_scaled)
# predicted_price_linear = linear.predict(car_instance_scaled)
print(f"Predicted Price Forest: {predicted_price_forest[0]}")
print(f"Predicted Price Bagging: {predicted_price_bagging[0]}")
# print(f"Predicted Price Linear: {predicted_price_linear[0]}")

   manufacturer        model  prod._year      category  leather_interior  \
0  20353.821549  9260.173913    7.606387  23957.405742                 1   

      fuel_type  engine_volume    mileage  cylinders  gear_box_type  \
0  10944.138226            3.5  12.133534        6.0   16293.358366   

   drive_wheels        doors         wheel         color  airbags  is_turbo  
0  19699.168985  18927.68391  19717.067758  15513.357523       12         0  
Predicted Price Forest: 19215.466666666667
Predicted Price Bagging: 19636.0




In [37]:
data

Unnamed: 0,price,manufacturer,model,prod._year,category,leather_interior,fuel_type,engine_volume,mileage,cylinders,gear_box_type,drive_wheels,doors,wheel,color,airbags,is_turbo
0,13328,20361.715730,9234.750000,7.606387,23959.440467,1,10943.435436,3.5,12.133534,6.0,16293.589151,19700.841207,17472.767137,19717.446864,15513.970011,12,0
1,16621,15005.221359,7349.722222,7.606885,23958.810107,0,17682.358303,3.0,12.165256,6.0,26301.014711,19699.976903,17472.577612,19717.251469,19056.173137,8,0
2,8467,14789.403333,12011.632099,7.604396,11631.998500,0,17683.212572,1.3,12.206078,4.0,14721.324401,18893.444508,17473.046906,9116.750554,19057.884365,2,0
3,3607,15767.813707,8022.142857,7.606885,23961.301302,1,10946.301297,2.5,12.037459,4.0,16294.345708,19703.392651,17473.326619,19718.023675,18436.073588,0,0
4,11726,14785.782222,12003.585185,7.608374,11630.776528,1,17682.871137,1.3,11.428478,4.0,16293.713830,18893.179139,17472.859338,19717.541921,15514.419002,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,8467,18611.914905,7682.166667,7.600902,21596.086275,1,8393.520681,2.0,12.611541,4.0,27861.291491,17746.817108,53811.185754,19717.735299,15515.332399,5,1
19233,15681,22496.198807,14593.968300,7.606885,14462.589039,1,17682.456784,2.4,11.992886,4.0,26301.328987,18892.857096,17472.631712,19717.307245,14128.558419,8,0
19234,26108,22493.370762,30405.453461,7.606387,23956.994066,1,31175.277507,2.0,11.664496,4.0,16292.594521,18892.008061,17472.031597,19716.688542,19452.873239,4,0
19235,5331,15016.182524,17842.057554,7.604894,23960.971286,1,31180.606309,2.0,10.844646,4.0,16294.211534,18893.699862,17473.227396,19717.921379,19058.542497,4,0


In [41]:
car_instance_encoded

Unnamed: 0,manufacturer,model,prod._year,category,leather_interior,fuel_type,engine_volume,mileage,cylinders,gear_box_type,drive_wheels,doors,wheel,color,airbags,is_turbo
0,20353.821549,9260.173913,7.606387,23957.405742,1,10944.138226,3.5,12.133534,6.0,16293.358366,19699.168985,18927.68391,19717.067758,15513.357523,12,0
