# Car Sales Price Prediction ANN

In this notebook, I will be using the [Car Sales Price Dataset](https://www.kaggle.com/datasets/yashpaloswal/ann-car-sales-price-prediction/data) from Kaggle.

In [84]:
import janitor
import pandas as pd
from tensorflow import keras
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

In [85]:
df = pd.read_csv(r'..\dataset\car_purchasing.csv', encoding='latin1')
df.head()

Unnamed: 0,customer name,customer e-mail,country,gender,age,annual Salary,credit card debt,net worth,car purchase amount
0,Martina Avila,cubilia.Curae.Phasellus@quisaccumsanconvallis.edu,Bulgaria,0,41.85172,62812.09301,11609.38091,238961.2505,35321.45877
1,Harlan Barnes,eu.dolor@diam.co.uk,Belize,0,40.870623,66646.89292,9572.957136,530973.9078,45115.52566
2,Naomi Rodriquez,vulputate.mauris.sagittis@ametconsectetueradip...,Algeria,1,43.152897,53798.55112,11160.35506,638467.1773,42925.70921
3,Jade Cunningham,malesuada@dignissim.com,Cook Islands,1,58.271369,79370.03798,14426.16485,548599.0524,67422.36313
4,Cedric Leach,felis.ullamcorper.viverra@egetmollislectus.net,Brazil,1,57.313749,59729.1513,5358.712177,560304.0671,55915.46248


In [86]:
# cleaning the column names
df= janitor.clean_names(df=df)
df.head()

Unnamed: 0,customer_name,customer_e_mail,country,gender,age,annual_salary,credit_card_debt,net_worth,car_purchase_amount
0,Martina Avila,cubilia.Curae.Phasellus@quisaccumsanconvallis.edu,Bulgaria,0,41.85172,62812.09301,11609.38091,238961.2505,35321.45877
1,Harlan Barnes,eu.dolor@diam.co.uk,Belize,0,40.870623,66646.89292,9572.957136,530973.9078,45115.52566
2,Naomi Rodriquez,vulputate.mauris.sagittis@ametconsectetueradip...,Algeria,1,43.152897,53798.55112,11160.35506,638467.1773,42925.70921
3,Jade Cunningham,malesuada@dignissim.com,Cook Islands,1,58.271369,79370.03798,14426.16485,548599.0524,67422.36313
4,Cedric Leach,felis.ullamcorper.viverra@egetmollislectus.net,Brazil,1,57.313749,59729.1513,5358.712177,560304.0671,55915.46248


In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customer_name        500 non-null    object 
 1   customer_e_mail      500 non-null    object 
 2   country              500 non-null    object 
 3   gender               500 non-null    int64  
 4   age                  500 non-null    float64
 5   annual_salary        500 non-null    float64
 6   credit_card_debt     500 non-null    float64
 7   net_worth            500 non-null    float64
 8   car_purchase_amount  500 non-null    float64
dtypes: float64(5), int64(1), object(3)
memory usage: 35.3+ KB


In [88]:
df.describe()

Unnamed: 0,gender,age,annual_salary,credit_card_debt,net_worth,car_purchase_amount
count,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.506,46.241674,62127.239608,9607.645049,431475.713625,44209.799218
std,0.500465,7.978862,11703.378228,3489.187973,173536.75634,10773.178744
min,0.0,20.0,20000.0,100.0,20000.0,9000.0
25%,0.0,40.949969,54391.977195,7397.515792,299824.1959,37629.89604
50%,1.0,46.049901,62915.497035,9655.035568,426750.12065,43997.78339
75%,1.0,51.612263,70117.862005,11798.867487,557324.478725,51254.709517
max,1.0,70.0,100000.0,20000.0,1000000.0,80000.0


In [89]:
# setting the target variable
target_column = 'car_purchase_amount'

# setting up the independent (x) and dependent (y) variables

x = df.drop(columns=[target_column])
y = df[target_column]

In [90]:
# identifying categorical and numerical values
categorical_columns = x.select_dtypes('object').columns
numerical_columns = x.select_dtypes('number').columns

# preprocessing pipeline 
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocesser = ColumnTransformer([
    ("numerical", numerical_transformer, numerical_columns),
    ("categorical", categorical_transformer, categorical_columns),
])

x_transformed = preprocesser.fit_transform(x)

In [91]:
if y.dtype == 'object':
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    output_dimension = len(label_encoder.classes_)
    activation = 'softmax'
    loss='sparse_categorical_cross_entropy'
else:
    output_dimension = 1
    activation ='linear'
    loss='mse'

In [92]:
# performing train test split
x_train, x_test, y_train, y_test = train_test_split(x_transformed, y, test_size=0.2, random_state=42)

In [93]:
# defining an ANN
model = keras.Sequential([
    keras.layers.Input(shape=(x_train.shape[1],)),
    keras.layers.Dense(64, activation='tanh'),
    keras.layers.Dense(32, activation='tanh'),
    keras.layers.Dense(16, activation='tanh'),
    keras.layers.Dense(output_dimension, activation=activation)
])

In [94]:
# compiling the model
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'] if output_dimension > 1 else ['mae'])

In [95]:
# fitting the model
model.fit(x_train, y_train, epochs=50, batch_size=32, validation_data=(x_test, y_test))

Epoch 1/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - loss: 2010272128.0000 - mae: 43537.3242 - val_loss: 2156833536.0000 - val_mae: 45264.3594
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 2058818688.0000 - mae: 44065.7539 - val_loss: 2156766208.0000 - val_mae: 45263.6914
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 2034702464.0000 - mae: 43695.6875 - val_loss: 2156652032.0000 - val_mae: 45262.5000
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 2035306496.0000 - mae: 43849.4062 - val_loss: 2156524544.0000 - val_mae: 45261.0586
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 2104984832.0000 - mae: 44539.1094 - val_loss: 2156433152.0000 - val_mae: 45259.9766
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2058938368.0000 - mae: 4395

<keras.src.callbacks.history.History at 0x198edfad160>

In [96]:
loss, metric = model.evaluate(x_test, y_test)
print("Test loss: {:.4f}, Metrics {:.4f}".format(loss, metric))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2217590272.0000 - mae: 45903.9180
Test loss: 2155382272.0000, Metrics 45248.3086
