In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from category_encoders import LeaveOneOutEncoder
import joblib

data = pd.read_csv("C:/Users/arbac/Downloads/LicentaAIModel/car_price_prediction.csv")
#----------------------------------------------------------------------------
# DUPLICATE ROWS

# Check for duplicate rows in the DataFrame
duplicate_rows = data[data.duplicated()]

# Remove duplicate rows
data = data.drop_duplicates()

#----------------------------------------------------------------------------
#DATA FRAME PROCESSING

# Assuming `data` is your DataFrame
# Transform the name of the columns to snake case
data.columns = data.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')
#----------------------------------------------------------------------------
# ENGINE VOLUME

# Ensure all values in 'engine_volume' are strings
data['engine_volume'] = data['engine_volume'].astype(str)

# Create a new binary column 'is_turbo'
data['is_turbo'] = data['engine_volume'].apply(lambda x: 1 if 'Turbo' in x else 0)

# Extract the numeric part of 'engine_volume'
data['engine_volume'] = data['engine_volume'].str.replace(' Turbo', '').astype(float)

#----------------------------------------------------------------------------
#DROP INVALID ROWS

# Replace '-' with NaN
data.replace('-', np.nan, inplace=True)

# Handle NaN and infinity values
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)

#----------------------------------------------------------------------------
# DROP UNNECESSARY COLUMS

data=data.drop(['id'], axis=1)
data=data.drop(['levy'], axis=1)

#----------------------------------------------------------------------------
# MILEAGE

# Remove ' km' from 'mileage' and convert to int
data['mileage'] = data['mileage'].str.replace(' km', '').astype(int)

#----------------------------------------------------------------------------
# LEATHER INTERIOR

# Map 'leather_interior' to binary values
data['leather_interior'] = data['leather_interior'].map({'Yes': 1, 'No': 0})

# Ensure all values in 'prod_year' and 'mileage' are positive
data = data[(data['prod._year'] > 0) & (data['mileage'] > 0)]

data['prod._year'] = np.log(data['prod._year'] + 1)
data['mileage'] = np.log(data['mileage'] + 1)


#----------------------------------------------------------------------------
# ENCODING

categorical_columns = ['manufacturer', 'model', 'category', 'fuel_type', 'gear_box_type', 'drive_wheels', 'doors',
                       'wheel', 'color']

# # Apply one-hot encoding to categorical columns
# data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Apply LeaveOneOut encoding to categorical columns
encoder = LeaveOneOutEncoder(cols=categorical_columns)
data = encoder.fit_transform(data, data['price'])

#----------------------------------------------------------------------------
#SPLITTING

# Assuming 'price' is the target variable and all other columns are features
x = data.drop(['price'], axis=1)
y = data['price']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

#----------------------------------------------------------------------------
#SCALING

# Optionally, scale the data
scaler = StandardScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)

#----------------------------------------------------------------------------
#LINEAR REGRESSION

# Fit the linear regression mode
reg = LinearRegression()

# reg.fit(x_train_s, y_train)
# # Evaluate the model
# y_pred = reg.predict(x_test)
# reg.score(x_test, y_test)


#----------------------------------------------------------------------------
# RANDOM FOREST

forest = RandomForestRegressor(n_estimators=150)

# forest.fit(x_train, y_train)
# y_pred = reg.predict(x_test)
# forest.score(x_test,y_test)

#------------------------------------------------------------------------------
#BAGGING


bagging = BaggingRegressor()
# bagging.fit(x_train, y_train)

# y_pred = reg.predict(x_test)
# bagging.score(x_test,y_test)


models = {'RandomForest': forest, 'Bagging': bagging, 'LinearRegression': reg}

# Results dictionary
results = {'Model': [], 'MAE (Train)': [], 'MAE (Test)': [], 'R-squared (Train)': [], 'R-squared (Test)': []}

# Loop through models
for model_name, model in models.items():
    # Train the model on the scaled training data
    model.fit(x_train_s, y_train)

    # Save the trained model
    joblib.dump(model, f'{model_name}_model.joblib')

    # Predictions on the scaled training set
    y_train_pred = model.predict(x_train_s)

    # Predictions on the scaled evaluation set
    y_test_pred = model.predict(x_test_s)

    # Calculate MAE and R-squared for training and evaluation sets
    mae_train = mean_absolute_error(y_train, y_train_pred)
    mae_eval = mean_absolute_error(y_test, y_test_pred)
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)

    # Append results to the dictionary
    results['Model'].append(model_name)
    results['MAE (Train)'].append(int(mae_train))
    results['MAE (Test)'].append(int(mae_eval))
    results['R-squared (Train)'].append(round(r2_train, 3))
    results['R-squared (Test)'].append(round(r2_test, 3))

# Save the scaler, encoder, and training columns
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(encoder, 'encoder.joblib')
joblib.dump(x_train.columns, 'train_columns.joblib')

# Create DataFrame from results dictionary
results_df = pd.DataFrame(results)

# Display the results
results_df

Unnamed: 0,Model,MAE (Train),MAE (Test),R-squared (Train),R-squared (Test)
0,RandomForest,168,198,0.957,0.942
1,Bagging,126,327,0.98,0.86
2,LinearRegression,9317,10005,0.313,0.379


In [2]:

data

Unnamed: 0,price,manufacturer,model,prod._year,category,leather_interior,fuel_type,engine_volume,mileage,cylinders,gear_box_type,drive_wheels,doors,wheel,color,airbags,is_turbo
0,13328,17779.813158,10197.742857,7.606387,23509.840839,1,9014.238818,3.5,12.133534,6.0,16443.015455,17522.972673,17651.320185,17842.408584,15939.505599,12.0,0
1,16621,14029.105960,2076.428571,7.606885,23508.155578,0,18109.185291,3.0,12.165256,6.0,28940.006279,17520.160547,17650.776696,17841.860755,18472.167372,8.0,0
3,3607,14351.136778,4546.702128,7.606885,23514.815763,1,9022.003195,2.5,12.037459,4.0,16444.870254,17531.274125,17652.924575,17844.025786,18547.312016,0.0,0
4,11726,18674.447876,15722.576923,7.608374,12008.375163,1,18110.743712,1.3,11.428478,4.0,16443.321122,17755.678765,17651.584585,17842.675096,15940.885444,4.0,0
5,39493,23075.948060,39795.887160,7.609367,23496.450358,1,25058.264543,2.0,11.988737,4.0,16438.023087,17749.643773,17647.001815,17838.055731,18524.129845,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9154,20005,23087.582687,16437.084615,7.606885,13923.440432,1,18108.107927,1.6,11.576998,4.0,16441.741462,17753.879374,17650.218188,17841.297787,18470.122659,4.0,0
9155,16778,14028.759382,22011.500000,7.608871,23508.075230,1,13364.424051,2.0,10.443746,4.0,16442.357184,17754.580743,17650.750784,17841.834636,18773.251202,4.0,0
9156,26657,10486.448980,30248.666667,7.608374,23503.019447,0,18105.990131,2.5,11.527183,4.0,16440.472238,17752.433601,17649.120317,17840.191150,18466.103323,10.0,0
9157,21953,9807.984252,20508.000000,7.606885,23505.426817,1,18107.487743,4.0,11.285196,6.0,16441.369777,17515.607173,17649.896683,17840.973715,15932.076658,6.0,0


In [3]:
#------------------------------------------------------------------------------
#BEST APROXIMATION
forest = RandomForestRegressor()
param_grid = {
    "n_estimators": [150, 200],
}

grid_search = GridSearchCV(forest, param_grid, cv=5,
                           scoring="neg_mean_squared_error",
                           return_train_score=True)
grid_search.fit(x_train, y_train)

best_forest = grid_search.best_estimator_
print("Best Estimator:", best_forest)
best_forest.score(x_test, y_test)

Best Estimator: RandomForestRegressor(n_estimators=200)


0.9395957709100385

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6174 entries, 0 to 9158
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   price             6174 non-null   int64  
 1   manufacturer      6174 non-null   float64
 2   model             6174 non-null   float64
 3   prod._year        6174 non-null   float64
 4   category          6174 non-null   float64
 5   leather_interior  6174 non-null   int64  
 6   fuel_type         6174 non-null   float64
 7   engine_volume     6174 non-null   float64
 8   mileage           6174 non-null   float64
 9   cylinders         6174 non-null   float64
 10  gear_box_type     6174 non-null   float64
 11  drive_wheels      6174 non-null   float64
 12  doors             6174 non-null   float64
 13  wheel             6174 non-null   float64
 14  color             6174 non-null   float64
 15  airbags           6174 non-null   float64
 16  is_turbo          6174 non-null   int64  


In [7]:
# Define the car instance
car_instance = pd.DataFrame({
    'manufacturer': ['LEXUS'],
    'model': ['RX 450'],
    'prod._year': [2010],
    'category': ['Jeep'],
    'leather_interior': ['Yes'],
    'fuel_type': ['Hybrid'],
    'engine_volume': [3.5],
    'mileage': ['186005 km'],
    'cylinders': [6.0],
    'gear_box_type': ['Automatic'],
    'drive_wheels': ['4x4'],
    'doors': ['4-May'],
    'wheel': ['Left wheel'],
    'color': ['Silver'],
    'airbags':[12.0]
})

# Preprocess the specific car instance
car_instance.columns = car_instance.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')
car_instance['leather_interior'] = car_instance['leather_interior'].map({'Yes': 1, 'No': 0})
car_instance['mileage'] = car_instance['mileage'].str.replace(' km', '').astype(int)
car_instance['engine_volume'] = car_instance['engine_volume'].astype(str)
car_instance['is_turbo'] = car_instance['engine_volume'].apply(lambda x: 1 if 'Turbo' in x else 0)
car_instance['engine_volume'] = car_instance['engine_volume'].str.replace(' Turbo', '').astype(float)

# Apply logarithmic transformation to the car instance
car_instance['prod._year'] = np.log(car_instance['prod._year'] + 1)
car_instance['mileage'] = np.log(car_instance['mileage'] + 1)

# Temporarily add the 'price' column to align with the encoder's expected input dimensions
car_instance['price'] = 0

# Load the encoder that was fitted on the training data
encoder = joblib.load('encoder.joblib')

# Apply LeaveOneOut encoding to categorical columns using the same encoder as the training data
categorical_columns = ['manufacturer', 'model', 'category', 'fuel_type', 'gear_box_type', 'drive_wheels', 'doors', 'wheel', 'color']
car_instance_encoded = encoder.transform(car_instance)

# Drop the temporary 'price' column
car_instance_encoded = car_instance_encoded.drop(columns=['price'])

# Ensure that we are not duplicating any columns
encoded_columns = [col for col in car_instance_encoded.columns if col not in car_instance.columns]
car_instance = car_instance.drop(columns=categorical_columns)
car_instance = pd.concat([car_instance, car_instance_encoded[encoded_columns]], axis=1)

# Ensure all columns match the training data
x_train_columns = joblib.load('train_columns.joblib')  # Get the columns from the training data
car_instance_encoded = car_instance_encoded.reindex(columns=x_train_columns, fill_value=0)

# Load the scaler and scale the features using the same scaler as the training data
scaler = joblib.load('scaler.joblib')
car_instance_scaled = scaler.transform(car_instance_encoded)

# Load the best estimator model
best_forest = joblib.load('RandomForest_model.joblib')

# Predict the price using the best estimator
predicted_price = forest.predict(car_instance_scaled)
print(f"Predicted Price: {predicted_price[0]}")


Predicted Price: 17736.786666666667


In [6]:
car_instance_encoded

Unnamed: 0,manufacturer,model,prod._year,category,leather_interior,fuel_type,engine_volume,mileage,cylinders,gear_box_type,drive_wheels,doors,wheel,color,airbags,is_turbo
0,17768.128609,10241.830986,7.606387,23504.632737,1,9017.681564,3.5,12.133534,6.0,16442.421213,17519.393345,17650.606766,17841.657685,15937.258176,12.0,0


In [21]:
predicted_price


array([17729.38])