In [1]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
from math import sqrt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
import seaborn as sns
import pickle
import warnings
warnings.filterwarnings('ignore')

In [15]:
data = pd.read_csv('D:/ML/data/cars.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38531 entries, 0 to 38530
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   manufacturer_name  38531 non-null  object 
 1   model_name         38531 non-null  object 
 2   transmission       38531 non-null  object 
 3   color              38531 non-null  object 
 4   odometer_value     38531 non-null  int64  
 5   year_produced      38531 non-null  int64  
 6   engine_fuel        38531 non-null  object 
 7   engine_has_gas     38531 non-null  bool   
 8   engine_type        38531 non-null  object 
 9   engine_capacity    38521 non-null  float64
 10  body_type          38531 non-null  object 
 11  has_warranty       38531 non-null  bool   
 12  state              38531 non-null  object 
 13  drivetrain         38531 non-null  object 
 14  price_usd          38531 non-null  float64
 15  is_exchangeable    38531 non-null  bool   
 16  location_region    385

In [16]:
data = data.loc[data.price_usd > 10]
data.describe()

Unnamed: 0,odometer_value,year_produced,engine_capacity,price_usd,number_of_photos,up_counter,duration_listed
count,38522.0,38522.0,38512.0,38522.0,38522.0,38522.0,38522.0
mean,248848.538134,2002.945953,2.055184,6641.521404,9.650122,16.309589,80.59566
std,136016.097803,8.065156,0.671209,6428.102489,6.093379,43.291384,112.833317
min,0.0,1942.0,0.2,60.0,1.0,1.0,0.0
25%,158000.0,1998.0,1.6,2100.0,5.0,2.0,23.0
50%,250000.0,2003.0,2.0,4800.0,8.0,5.0,59.0
75%,325000.0,2009.0,2.3,8990.0,12.0,16.0,91.0
max,1000000.0,2019.0,8.0,50000.0,86.0,1861.0,2232.0


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38522 entries, 0 to 38530
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   manufacturer_name  38522 non-null  object 
 1   model_name         38522 non-null  object 
 2   transmission       38522 non-null  object 
 3   color              38522 non-null  object 
 4   odometer_value     38522 non-null  int64  
 5   year_produced      38522 non-null  int64  
 6   engine_fuel        38522 non-null  object 
 7   engine_has_gas     38522 non-null  bool   
 8   engine_type        38522 non-null  object 
 9   engine_capacity    38512 non-null  float64
 10  body_type          38522 non-null  object 
 11  has_warranty       38522 non-null  bool   
 12  state              38522 non-null  object 
 13  drivetrain         38522 non-null  object 
 14  price_usd          38522 non-null  float64
 15  is_exchangeable    38522 non-null  bool   
 16  location_region    385

In [21]:
data["engine_capacity"] = data["engine_capacity"].fillna(data["engine_capacity"].mean())
maping = {
    True : 1,
    False : 0,
    'Минская обл.' : 5,
    'Гомельская обл.' : 3,
    'Брестская обл.' : 1,
    'Могилевская обл.' : 6,
    'Витебская обл.' : 2,
    'Гродненская обл.' : 4,
    'automatic' : 1,
    'mechanical' : -1
}
data = data.replace(maping)
category_col = [
    "color",
    "engine_fuel",
    "engine_type",
    "body_type",
    "state",
    "drivetrain",
    "model_name",
    "manufacturer_name"
]
data = pd.get_dummies(data=data, columns=category_col)
data.to_csv('D:/ML/data/cars_preprocessed.csv', index=False)

In [2]:
data = pd.read_csv('D:/ML/data/cars_preprocessed.csv')
y = data[["price_usd"]]
X = data.drop(["price_usd"], axis=1)
data

Unnamed: 0,transmission,odometer_value,year_produced,engine_has_gas,engine_capacity,has_warranty,price_usd,is_exchangeable,location_region,number_of_photos,...,manufacturer_name_Subaru,manufacturer_name_Suzuki,manufacturer_name_Toyota,manufacturer_name_Volkswagen,manufacturer_name_Volvo,manufacturer_name_ВАЗ,manufacturer_name_ГАЗ,manufacturer_name_ЗАЗ,manufacturer_name_Москвич,manufacturer_name_УАЗ
0,1,190000,2010,0,2.5,0,10900.00,0,5,9,...,1,0,0,0,0,0,0,0,0,0
1,1,290000,2002,0,3.0,0,5000.00,1,5,12,...,1,0,0,0,0,0,0,0,0,0
2,1,402000,2001,0,2.5,0,2800.00,1,5,4,...,1,0,0,0,0,0,0,0,0,0
3,-1,10000,1999,0,3.0,0,9999.00,1,5,9,...,1,0,0,0,0,0,0,0,0,0
4,1,280000,2001,0,2.5,0,2134.11,1,3,14,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,1,290000,2000,0,3.5,0,2750.00,1,5,5,...,0,0,0,0,0,0,0,0,0,0
38527,-1,321000,2004,0,2.2,0,4800.00,1,1,4,...,0,0,0,0,0,0,0,0,0,0
38528,1,777957,2000,0,3.5,0,4300.00,0,5,3,...,0,0,0,0,0,0,0,0,0,0
38529,-1,20000,2001,0,2.0,0,4000.00,1,1,7,...,0,0,0,0,0,0,0,0,0,0


In [46]:
pca = PCA(n_components=8)
X_pca = pca.fit_transform(X, y)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=False)

In [58]:
model_regression = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(64, activation="linear", input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(16, activation="linear"),
        tf.keras.layers.Dense(8, activation="relu"),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(1, activation="linear"),
    ]
)
model_regression.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.005), loss="mae")
model_regression.fit(X_train, y_train, epochs=20, verbose=False)

<keras.callbacks.History at 0x267dabfb040>

In [59]:
y_pred = model_regression.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {r2_score(y_test, y_pred)}')
model_regression.save('D:/ML/model/cars_preprocessed_tensor.csv')

MAE: 2981.437361501621
MSE: 25053739.864218276
RMSE: 5005.371101548643
MAPE: 1.3079520828205096
R^2: 0.442942695082174




INFO:tensorflow:Assets written to: D:/ML/model/cars_preprocessed_tensor.csv\assets


INFO:tensorflow:Assets written to: D:/ML/model/cars_preprocessed_tensor.csv\assets
