In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.metrics import accuracy_score, r2_score
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [3]:
data= pd.read_csv("C:\\Users\\MASSIVE\\Price-Predictor-ML-\\cleaned_data.csv")

In [4]:
data.drop(columns=['specs_modele'], axis=1, inplace=True)
data.drop(columns=['specs_finition'], axis=1, inplace=True) 
data.drop(columns=['specs_car_engine'], axis=1, inplace=True)
data.drop(columns=['specs_energie'], axis=1, inplace=True)
data.drop(columns=['title'], axis=1, inplace=True)
data.drop(columns=['id'], axis=1, inplace=True)
data.drop(columns=['specs_boite'], axis=1, inplace=True)

In [5]:
print(data.columns)
data.isnull().sum()

Index(['price', 'specs_annee', 'specs_couleur_auto', 'specs_papiers',
       'specs_kilometrage', 'specs_marque-voiture', 'options',
       'Engine Capacity', 'Engine_Name_Match', 'horsepower'],
      dtype='object')


price                   0
specs_annee             0
specs_couleur_auto      0
specs_papiers           0
specs_kilometrage       0
specs_marque-voiture    0
options                 0
Engine Capacity         0
Engine_Name_Match       0
horsepower              0
dtype: int64

In [6]:
data['car_age'] = 2024 - data['specs_annee']

# 2. Engine-Based Features
data['engine_capacity_bin'] = pd.cut(data['Engine Capacity'], bins=[-990, 1.500, 2.000, 30000], labels=[1, 2,3])

# 3. Kilometrage-Based Features
data['kilometrage_bin'] = pd.cut(data['specs_kilometrage'], bins=[-9999, 200000, 400000, 600000,9999999], labels=[1, 2,3,4])

# 5. Price-Based Features
data['log_price'] = np.log(data['price'])

In [7]:
print(data.columns)
data.isnull().sum()

Index(['price', 'specs_annee', 'specs_couleur_auto', 'specs_papiers',
       'specs_kilometrage', 'specs_marque-voiture', 'options',
       'Engine Capacity', 'Engine_Name_Match', 'horsepower', 'car_age',
       'engine_capacity_bin', 'kilometrage_bin', 'log_price'],
      dtype='object')


price                   0
specs_annee             0
specs_couleur_auto      0
specs_papiers           0
specs_kilometrage       0
specs_marque-voiture    0
options                 0
Engine Capacity         0
Engine_Name_Match       0
horsepower              0
car_age                 0
engine_capacity_bin     0
kilometrage_bin         0
log_price               0
dtype: int64

In [8]:
data['price_per_kilometer'] = data['price'] / data['specs_kilometrage']
data['price_squared_per_kilometer'] = data['price'] ** 2 / data['specs_kilometrage']
data['horsepower_per_engine_capacity'] = data['horsepower'] / data['Engine Capacity']
data['price_per_horsepower'] = data['price'] / data['horsepower']
data['log_price_per_kilometer'] = np.log(data['price']) / data['specs_kilometrage']
data['price_per_car_age'] = data['price'] / data['car_age']
data['engine_capacity_per_car_age'] = data['Engine Capacity'] / data['car_age']
data['kilometers_per_year'] = data['specs_kilometrage'] / data['car_age']

# Replace inf and NaN values that might be caused by division by zero
data.replace([np.inf, -np.inf], np.nan, inplace=True)


In [9]:
data.loc[1, 'price'] = np.nan
data.loc[2, 'specs_finition'] = np.nan

# Separate the categorical and numerical columns
categorical_cols = data.select_dtypes(include=['category', 'object']).columns
numerical_cols = data.select_dtypes(include=[np.number]).columns

# Fill NaN values in numerical columns
data[numerical_cols] = data[numerical_cols].fillna(0)

# Fill NaN values in categorical columns by temporarily converting them to object type
data[categorical_cols] = data[categorical_cols].astype('object').fillna('Unknown')

# Convert categorical columns back to category type if needed
data[categorical_cols] = data[categorical_cols].astype('category')


In [10]:
print(data.columns)
data.isnull().sum()

Index(['price', 'specs_annee', 'specs_couleur_auto', 'specs_papiers',
       'specs_kilometrage', 'specs_marque-voiture', 'options',
       'Engine Capacity', 'Engine_Name_Match', 'horsepower', 'car_age',
       'engine_capacity_bin', 'kilometrage_bin', 'log_price',
       'price_per_kilometer', 'price_squared_per_kilometer',
       'horsepower_per_engine_capacity', 'price_per_horsepower',
       'log_price_per_kilometer', 'price_per_car_age',
       'engine_capacity_per_car_age', 'kilometers_per_year', 'specs_finition'],
      dtype='object')


price                             0
specs_annee                       0
specs_couleur_auto                0
specs_papiers                     0
specs_kilometrage                 0
specs_marque-voiture              0
options                           0
Engine Capacity                   0
Engine_Name_Match                 0
horsepower                        0
car_age                           0
engine_capacity_bin               0
kilometrage_bin                   0
log_price                         0
price_per_kilometer               0
price_squared_per_kilometer       0
horsepower_per_engine_capacity    0
price_per_horsepower              0
log_price_per_kilometer           0
price_per_car_age                 0
engine_capacity_per_car_age       0
kilometers_per_year               0
specs_finition                    0
dtype: int64

In [11]:
# Splitting data into features and target
X = data.drop(columns=['price'])  # Features
y = data['price']  # Target

# Normalizing data
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)


In [12]:
models = {
    'Linear Regression': LinearRegression(),
    'Decision Trees': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Machines (SVM)': SVR(),
    'K-Nearest Neighbors (KNN)': KNeighborsRegressor(),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting Machines (GBM)': GradientBoostingRegressor(),
    'Neural Networks': MLPRegressor(),
}

In [15]:
m= LinearRegression()
m.fit(X_train, y_train)
y_pred = m.predict(X_test)
results = []
index = []
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
    
    # Append results to lists
index.append('Linear Regression')
results.append({
        'R-squared': r2,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse
    })
results_df = pd.DataFrame(results, index=index)

# Display the results
print(results_df)

                   R-squared        MAE           MSE        RMSE
Linear Regression   0.953273  42.668342  15157.561608  123.116049


In [14]:
results = []
index = []

for name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = model.predict(X_test)
    
    # Calculate accuracy (only for classification models)
    #accuracy = accuracy_score(y_test, y_pred) if name != 'Linear Regression' else None
    
    # Calculate R-squared score
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    # Append results to lists
    index.append(name)
    results.append({
        'R-squared': r2,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse
    })

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results, index=index)

# Display the results
print(results_df)

KeyboardInterrupt: 

In [None]:
results_df = pd.DataFrame(results, index=index)

# Display the results
print(results_df)