Importing the data

In [78]:
import pandas as pd
# Load the dataset from a CSV file; the path is resolved relative to the
# notebook's current working directory.
data = pd.read_csv(filepath_or_buffer='Final_df.csv')

Data Pre-Processing

In [79]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Work on an independent copy so that later cleaning steps (including any
# in-place operations, or cells run out of order) can never modify the raw
# frame that was loaded from disk. A plain `data_copy = data` only aliases it.
data_copy = data.copy()
# Column dtypes of the loaded frame, kept for quick inspection.
data_types = data_copy.dtypes

# Preview the first two rows to confirm the file was parsed as expected.
print(data.head(2))

   region  price manufacturer           model condition    cylinders fuel  \
0  others  33590          gmc          others      good  8 cylinders  gas   
1  others  22590    chevrolet  silverado 1500      good  8 cylinders  gas   

   odometer title_status transmission    drive    type paint_color  car_age  
0     57923        clean        other  unknown  pickup       white        8  
1     71229        clean        other  unknown  pickup        blue       12  


In [81]:
# Discard every row that is missing at least one of the key fields below.
data_copy = data_copy.dropna(axis='index', how='any', subset=['odometer', 'manufacturer', 'model'])

In [82]:
# Replace every remaining missing value with the placeholder 'unknown'.
# The result is rebound instead of using inplace=True: the frame was produced
# by an earlier filter, and in-place edits on such frames can raise
# SettingWithCopyWarning and make the cell harder to reason about on re-run.
data_copy = data_copy.fillna('unknown')

In [83]:
# Remove rows that are exact duplicates across all columns, keeping the
# first occurrence of each.
data_copy = data_copy.drop_duplicates(subset=None, keep='first')

In [84]:
# Frequency of each manufacturer, most common first.
manufacturer_values = data_copy['manufacturer'].value_counts()
# Keep the 20 most frequent manufacturers and collapse all others into
# 'others'. The top-20 labels are taken from the counts index once and matched
# with a vectorised isin(), instead of re-slicing the counts Series for every
# row inside apply(). The str cast mirrors the original str(x) membership test.
# assign() returns a new frame, avoiding a column write on a filtered frame.
data_copy = data_copy.assign(
    manufacturer=data_copy['manufacturer'].where(
        data_copy['manufacturer'].astype(str).isin(manufacturer_values.index[:20]),
        'others',
    )
)

In [85]:
# Frequency tables (most common first) for the two high-cardinality columns.
region_values = data_copy['region'].value_counts()
model_values = data_copy['model'].value_counts()
# Keep the 50 most frequent labels of each column and collapse the rest into
# 'others'. Membership is checked against the top-50 index labels with a
# vectorised isin() rather than a per-row apply() that re-slices the counts
# Series on every call. The str cast mirrors the original str(x) test.
# assign() returns a new frame, avoiding column writes on a filtered frame.
data_copy = data_copy.assign(
    region=data_copy['region'].where(
        data_copy['region'].astype(str).isin(region_values.index[:50]),
        'others',
    ),
    model=data_copy['model'].where(
        data_copy['model'].astype(str).isin(model_values.index[:50]),
        'others',
    ),
)

In [86]:
# --- Price bounds -----------------------------------------------------------
# Upper bound: Q3 + 1.5 * IQR. Lower bound: the 15th percentile of price.
price_percentile25, price_percentile75 = data_copy['price'].quantile([0.25, 0.75])
price_iqr = price_percentile75 - price_percentile25
price_upper_limit = price_percentile75 + 1.5 * price_iqr
price_lower_limit = data_copy['price'].quantile(0.15)
# Keep rows strictly inside (lower, upper).
new_df = data_copy.loc[
    data_copy['price'].gt(price_lower_limit) & data_copy['price'].lt(price_upper_limit)
]

# --- Odometer bounds --------------------------------------------------------
# Quantiles are taken from data_copy (before the price filter was applied).
# Upper bound: Q3 + 1.5 * IQR. Lower bound: the 5th percentile of odometer.
odometer_percentile25, odometer_percentile75 = data_copy['odometer'].quantile([0.25, 0.75])
odometer_iqr = odometer_percentile75 - odometer_percentile25
odometer_upper_limit = odometer_percentile75 + 1.5 * odometer_iqr
odometer_lower_limit = data_copy['odometer'].quantile(0.05)
# Apply the odometer window on top of the price-filtered rows.
new_df = new_df.loc[
    new_df['odometer'].gt(odometer_lower_limit) & new_df['odometer'].lt(odometer_upper_limit)
]

In [88]:
# Cast odometer to integer. new_df is the result of boolean filtering, so
# writing a column into it directly can trigger SettingWithCopyWarning;
# astype() with a column mapping returns a new frame that is rebound instead.
new_df = new_df.astype({'odometer': int})

In [None]:
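# NOTE(review): disabled step kept for reference. The preview of the loaded
# CSV already shows a 'car_age' column and no 'year' column, so this appears
# to have been applied before the file was saved — TODO confirm, then remove.
# new_df = new_df[new_df['year'] > 1996]
# new_df.shape
# new_df['car_age'] = 2022 - new_df['year']
# new_df.drop(['year'], axis = 1, inplace = True)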

In [90]:
# Snapshot the cleaned frame as an independent (deep) copy for modelling.
final_df = new_df.copy(deep=True)

Split data into Train and Test

In [91]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; 'price' is the target and every other
# column is a feature. The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    final_df.drop(columns=['price']),
    final_df['price'],
    test_size=.2,
    random_state=42,
)

TensorFlow Decision Forests training starts here

In [None]:
!pip install tensorflow tensorflow_decision_forests

In [93]:
import tensorflow_decision_forests as tfdf

In [94]:
# Attach the target as a 'price' column because the dataset conversion in the
# training cell expects features and label in a single frame. assign() builds
# a new frame (aligned on the index) instead of writing into the split output
# in place, which can trigger SettingWithCopyWarning.
X_train = X_train.assign(price=y_train)

In [95]:
from tensorflow_decision_forests.tensorflow import core_inference as tf_core
import time

# Convert the training frame (features plus the 'price' label column) into a
# TensorFlow dataset configured for regression. The task enum is taken from
# the public tfdf.keras namespace rather than an internal module path.
tf_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
    X_train, label='price', task=tfdf.keras.Task.REGRESSION
)
# Random-forest regressor; verbose=2 prints the detailed training log.
model_1 = tfdf.keras.RandomForestModel(task=tfdf.keras.Task.REGRESSION, verbose=2)

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration is not affected by system clock adjustments.
start = time.perf_counter()
model_1.fit(tf_dataset)
end = time.perf_counter()
print("Training time in seconds: ")
print(end - start)


Use 2 thread(s) for training
Use /tmp/tmpp5_rln2k as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'region': <tf.Tensor 'data:0' shape=(None,) dtype=string>, 'manufacturer': <tf.Tensor 'data_1:0' shape=(None,) dtype=string>, 'model': <tf.Tensor 'data_2:0' shape=(None,) dtype=string>, 'condition': <tf.Tensor 'data_3:0' shape=(None,) dtype=string>, 'cylinders': <tf.Tensor 'data_4:0' shape=(None,) dtype=string>, 'fuel': <tf.Tensor 'data_5:0' shape=(None,) dtype=string>, 'odometer': <tf.Tensor 'data_6:0' shape=(None,) dtype=int64>, 'title_status': <tf.Tensor 'data_7:0' shape=(None,) dtype=string>, 'transmission': <tf.Tensor 'data_8:0' shape=(None,) dtype=string>, 'drive': <tf.Tensor 'data_9:0' shape=(None,) dtype=string>, 'type': <tf.Tensor 'data_10:0' shape=(None,) dtype=string>, 'paint_color': <tf.Tensor 'data_11:0' shape=(None,) dtype=string>, 'car_age': <tf.Tensor 'data_12:0' shape=(None,) dtype=int64>}
Label: Tensor("data_13:0", shape=(N

[INFO 23-12-03 03:14:52.2349 UTC kernel.cc:771] Start Yggdrasil model training
[INFO 23-12-03 03:14:52.2350 UTC kernel.cc:772] Collect training examples
[INFO 23-12-03 03:14:52.2350 UTC kernel.cc:785] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-12-03 03:14:52.2352 UTC kernel.cc:391] Number of batches: 126
[INFO 23-12-03 03:14:52.2352 UTC kernel.cc:392] Number of examples: 125127
[INFO 23-12-03 03:14:52.4570 UTC kernel.cc:792] Training dataset:
Number of records: 125127
Number of columns: 14

Number of columns by type:
	CATEGORICAL: 11 (78.5714%)
	NUMERICAL: 3 (21.4286%)

Columns:

CATEGORICAL: 11 (78.5714%)
	2: "condition" CATEGORICAL has-dict vocab-size:8 zero-ood-items most-frequent:"unknown" 49496 (39.5566%)
	3: "cylinders

Model trained in 0:01:59.186597
Compiling model...
Model compiled.
Training time in seconds: 
124.00913119316101


Model Predictions using Test Data

In [96]:
# Attach the target as a 'price' column so the test frame has the same layout
# as the training frame. assign() builds a new frame instead of writing into
# the split output in place, which can trigger SettingWithCopyWarning.
X_test = X_test.assign(price=y_test)
# Build the regression dataset using the public tfdf.keras.Task enum.
tf_dataset_test = tfdf.keras.pd_dataframe_to_tf_dataset(
    X_test, label='price', task=tfdf.keras.Task.REGRESSION
)
# Predicted prices for the held-out rows.
y_pred = model_1.predict(tf_dataset_test)



In [97]:
# Score the held-out predictions: coefficient of determination and mean
# squared error (the latter is in squared price units).
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"R-squared Score: {r2}", f"Mean Squared Error: {mse}", sep="\n")


R-squared Score: 0.8105121848485068
Mean Squared Error: 18205289.963745516
