In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.metrics import accuracy_score, r2_score
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [27]:
data= pd.read_csv("C:\\Users\\MASSIVE\\Price-Predictor-ML-\\cleaned_data.csv")

In [28]:
data['titre'] = data['specs_marque-voiture'] + data['specs_modele'] + data['specs_finition']


In [29]:
data.drop(columns=['specs_car_engine'], axis=1, inplace=True)
data.drop(columns=['specs_energie'], axis=1, inplace=True)
data.drop(columns=['id'], axis=1, inplace=True)
data.drop(columns=['specs_boite'], axis=1, inplace=True)
data.drop(columns=['title'], axis=1, inplace=True)

In [30]:
data['engine'] = data['Engine Capacity'] +data['Engine_Name_Match'] +  data['horsepower']

In [31]:
print(data.columns)
data.isnull().sum()

Index(['price', 'specs_annee', 'specs_couleur_auto', 'specs_papiers',
       'specs_kilometrage', 'specs_marque-voiture', 'specs_modele',
       'specs_finition', 'options', 'Engine Capacity', 'Engine_Name_Match',
       'horsepower', 'titre', 'engine'],
      dtype='object')


price                   0
specs_annee             0
specs_couleur_auto      0
specs_papiers           0
specs_kilometrage       0
specs_marque-voiture    0
specs_modele            0
specs_finition          0
options                 0
Engine Capacity         0
Engine_Name_Match       0
horsepower              0
titre                   0
engine                  0
dtype: int64

In [32]:
data['car_age'] = 2024 - data['specs_annee']

# 2. Engine-Based Features
data['engine_capacity_bin'] = pd.cut(data['Engine Capacity'], bins=[0, 9, 18, 25], labels=[1, 2,3])

# 3. Kilometrage-Based Features
data['kilometrage_bin'] = pd.cut(data['specs_kilometrage'], bins=[0, 200000, 400000, 600000,9999999], labels=[1, 2,3,4])

# 5. Price-Based Features
data['log_price'] = np.log(data['price'])

In [33]:
print(data.columns)
data.isnull().sum()

Index(['price', 'specs_annee', 'specs_couleur_auto', 'specs_papiers',
       'specs_kilometrage', 'specs_marque-voiture', 'specs_modele',
       'specs_finition', 'options', 'Engine Capacity', 'Engine_Name_Match',
       'horsepower', 'titre', 'engine', 'car_age', 'engine_capacity_bin',
       'kilometrage_bin', 'log_price'],
      dtype='object')


price                      0
specs_annee                0
specs_couleur_auto         0
specs_papiers              0
specs_kilometrage          0
specs_marque-voiture       0
specs_modele               0
specs_finition             0
options                    0
Engine Capacity            0
Engine_Name_Match          0
horsepower                 0
titre                      0
engine                     0
car_age                    0
engine_capacity_bin        4
kilometrage_bin         2135
log_price                  0
dtype: int64

In [34]:
data['price_per_kilometer'] = data['price'] / data['specs_kilometrage']
data['price_squared_per_kilometer'] = data['price'] ** 2 / data['specs_kilometrage']
data['horsepower_per_engine_capacity'] = data['horsepower'] / data['Engine Capacity']
data['price_per_horsepower'] = data['price'] / data['horsepower']
data['log_price_per_kilometer'] = np.log(data['price']) / data['specs_kilometrage']
data['price_per_car_age'] = data['price'] / data['car_age']
data['engine_capacity_per_car_age'] = data['Engine Capacity'] / data['car_age']
data['kilometers_per_year'] = data['specs_kilometrage'] / data['car_age']

# Replace inf and NaN values that might be caused by division by zero
data.replace([np.inf, -np.inf], np.nan, inplace=True)


In [39]:
data.loc[1, 'price'] = np.nan
data.loc[2, 'specs_finition'] = np.nan

# Separate the categorical and numerical columns
categorical_cols = data.select_dtypes(include=['category', 'object']).columns
numerical_cols = data.select_dtypes(include=[np.number]).columns

# Fill NaN values in numerical columns
data[numerical_cols] = data[numerical_cols].fillna(0)

# Fill NaN values in categorical columns by temporarily converting them to object type
data[categorical_cols] = data[categorical_cols].astype('object').fillna('Unknown')

# Convert categorical columns back to category type if needed
data[categorical_cols] = data[categorical_cols].astype('category')

In [45]:
data['engine_capacity_bin'] = data['engine_capacity_bin'].cat.codes
data['kilometrage_bin'] = data['kilometrage_bin'].cat.codes

In [46]:
print(data.columns)
data.isnull().sum()

Index(['price', 'specs_annee', 'specs_couleur_auto', 'specs_papiers',
       'specs_kilometrage', 'specs_marque-voiture', 'specs_modele',
       'specs_finition', 'options', 'Engine Capacity', 'Engine_Name_Match',
       'horsepower', 'titre', 'engine', 'car_age', 'engine_capacity_bin',
       'kilometrage_bin', 'log_price', 'price_per_kilometer',
       'price_squared_per_kilometer', 'horsepower_per_engine_capacity',
       'price_per_horsepower', 'log_price_per_kilometer', 'price_per_car_age',
       'engine_capacity_per_car_age', 'kilometers_per_year'],
      dtype='object')


price                             0
specs_annee                       0
specs_couleur_auto                0
specs_papiers                     0
specs_kilometrage                 0
specs_marque-voiture              0
specs_modele                      0
specs_finition                    0
options                           0
Engine Capacity                   0
Engine_Name_Match                 0
horsepower                        0
titre                             0
engine                            0
car_age                           0
engine_capacity_bin               0
kilometrage_bin                   0
log_price                         0
price_per_kilometer               0
price_squared_per_kilometer       0
horsepower_per_engine_capacity    0
price_per_horsepower              0
log_price_per_kilometer           0
price_per_car_age                 0
engine_capacity_per_car_age       0
kilometers_per_year               0
dtype: int64

In [47]:
print(data.dtypes)

price                             float64
specs_annee                         int64
specs_couleur_auto                  int64
specs_papiers                     float64
specs_kilometrage                 float64
specs_marque-voiture                int64
specs_modele                        int64
specs_finition                    float64
options                             int64
Engine Capacity                   float64
Engine_Name_Match                   int64
horsepower                        float64
titre                               int64
engine                            float64
car_age                             int64
engine_capacity_bin                  int8
kilometrage_bin                      int8
log_price                         float64
price_per_kilometer               float64
price_squared_per_kilometer       float64
horsepower_per_engine_capacity    float64
price_per_horsepower              float64
log_price_per_kilometer           float64
price_per_car_age                 

In [48]:
# Splitting data into features and target
X = data.drop(columns=['price'])  # Features
y = data['price']  # Target

# Normalizing data
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)


In [49]:
models = {
    'Linear Regression': LinearRegression(),
    'Decision Trees': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Machines (SVM)': SVR(),
    'K-Nearest Neighbors (KNN)': KNeighborsRegressor(),
    'Gradient Boosting Machines (GBM)': GradientBoostingRegressor(),
    'Neural Networks': MLPRegressor(),
}

In [None]:
model = RandomForestRegressor()

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Print the evaluation metrics
print(f"R-squared: {r2}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")

In [50]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

# Assuming you have X_train, y_train, X_test, y_test ready

# Convert y_train and y_test to NumPy arrays
y_train_np = y_train.values.reshape(-1, 1)
y_test_np = y_test.values.reshape(-1, 1)

# Discretize the target variable into bins
n_bins = 10  # Adjust the number of bins as needed
discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
y_train_bins = discretizer.fit_transform(y_train_np)
y_test_bins = discretizer.transform(y_test_np)

# Train Gaussian Naive Bayes classifier
model = GaussianNB()
model.fit(X_train, y_train_bins.ravel())

# Predict the bin labels
y_pred_bins = model.predict(X_test)

# Convert bin labels back to continuous values
y_pred_continuous = discretizer.inverse_transform(y_pred_bins.reshape(-1, 1))

# Calculate evaluation metrics
r2 = r2_score(y_test_np, y_pred_continuous)
mae = mean_absolute_error(y_test_np, y_pred_continuous)
mse = mean_squared_error(y_test_np, y_pred_continuous)
rmse = np.sqrt(mse)

# Create a DataFrame to store the results
results_df = pd.DataFrame({
    'Model': ['Gaussian Naive Bayes'],
    'R-squared': [r2],
    'MAE': [mae],
    'MSE': [mse],
    'RMSE': [rmse]
})

# Display the results
print(results_df)


                  Model  R-squared         MAE           MSE        RMSE
0  Gaussian Naive Bayes   0.730933  256.294364  87281.377536  295.434219


In [51]:
results = []
index = []

for name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = model.predict(X_test)
    
    # Calculate accuracy (only for classification models)
    #accuracy = accuracy_score(y_test, y_pred) if name != 'Linear Regression' else None
    
    # Calculate R-squared score
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    # Append results to lists
    index.append(name)
    results.append({
        'R-squared': r2,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse
    })

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results, index=index)

# Display the results
print(results_df)

                                  R-squared        MAE            MSE  \
Linear Regression                  0.953223  42.593492   15173.593014   
Decision Trees                     0.999911   0.256792      28.754021   
Random Forest                      0.999981   0.180460       6.123901   
Support Vector Machines (SVM)      0.276235  95.802377  234778.379260   
K-Nearest Neighbors (KNN)          0.951029  43.401866   15885.576522   
Gradient Boosting Machines (GBM)   0.999942   2.339039      18.789411   
Neural Networks                    0.992743  15.377461    2354.173646   

                                        RMSE  
Linear Regression                 123.181139  
Decision Trees                      5.362278  
Random Forest                       2.474652  
Support Vector Machines (SVM)     484.539347  
K-Nearest Neighbors (KNN)         126.037996  
Gradient Boosting Machines (GBM)    4.334675  
Neural Networks                    48.519827  




In [53]:
import joblib

In [54]:
joblib.dump(RandomForestRegressor, 'random_forest_model.joblib')

['random_forest_model.joblib']