In [None]:
!pip install -U "ray[default]" google-api-python-client

# **Install Ray - Test Code To execute Tasks on 2 CPUs**

In [30]:
from collections import Counter
import socket
import time

import ray
ray.shutdown()
ray.init()

print('''This cluster consists of
    {} nodes in total
    {} CPU resources in total
'''.format(len(ray.nodes()), ray.cluster_resources()['CPU']))

@ray.remote
def f():
    time.sleep(0.001)
    # Return IP address.
    return socket.gethostbyname(socket.gethostname())

object_ids = [f.remote() for _ in range(10000)]
ip_addresses = ray.get(object_ids)

print('Tasks executed')
for ip_address, num_tasks in Counter(ip_addresses).items():
    print('    {} tasks on {}'.format(num_tasks, ip_address))

2023-12-04 03:33:37,953	INFO worker.py:1664 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


This cluster consists of
    1 nodes in total
    2.0 CPU resources in total

Tasks executed
    10000 tasks on 172.28.0.12


In [None]:
!gcloud auth application-default login

In [None]:
!gcloud auth application-default set-quota-project divine-beanbag-406919

# **Create a Ray Cluster on GCP using config.yaml**

In [None]:
!ray up -y config.yaml

In [None]:
!ray exec config.yaml 'python -c "import ray; ray.init()"'

In [7]:
!ray status

Node status
---------------------------------------------------------------
Active:
 1 node_27f80b26f2c16bde1040cacb5c76a564cce90b194af90190b6ce2ded
Pending:
 (no pending nodes)
Recent failures:
 (no failures)

Resources
---------------------------------------------------------------
Usage:
 0.0/2.0 CPU
 0B/7.34GiB memory
 0B/3.67GiB object_store_memory

Demands:
 (no resource demands)
[0m

#Preprocessing and Model Training

In [31]:
#!/usr/bin/env python
# coding: utf-8

# Importing Zip file from Google Drive

# Reading Data into Pandas Dataframe

# In[2]:

from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from tqdm import tqdm


DATADIR = "/content/drive/MyDrive/vehicles.csv.zip"

import pandas as pd
import zipfile

csv_file_name = 'vehicles.csv'

# Create a ZipFile object and extract the CSV file
with zipfile.ZipFile(DATADIR, 'r') as zip_file:
    with zip_file.open(csv_file_name) as file:
        # Read the CSV file with pandas
        data = pd.read_csv(file)


# Display the first few rows to understand the data
data.columns
print(data.head(5))


# In[3]:


data.isna().sum()


# Data Preprocessing

# In[4]:


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

data_copy = data


data_types = data_copy.dtypes

# Print the data types for all columns
print(data_types)


# In[9]:


data_copy = data_copy.dropna(subset=['year', 'odometer', 'manufacturer', 'model'])


# In[10]:


data_copy.fillna('unknown', inplace=True)


# In[11]:


data_copy = data_copy.drop_duplicates()


# In[12]:


data_copy.shape


# In[13]:


manufacturer_values = data_copy['manufacturer'].value_counts()
data_copy['manufacturer'] =  data_copy['manufacturer'].apply(lambda x: x if str(x) in manufacturer_values[:20] else 'others')


# In[14]:


region_values = data_copy['region'].value_counts()
data_copy['region'] = data_copy['region'].apply(lambda x: x if str(x) in region_values[:50] else 'others')
model_values = data_copy['model'].value_counts()
data_copy['model'] = data_copy['model'].apply(lambda x: x if str(x) in model_values[:50] else 'others')


# In[16]:


price_percentile25 = data_copy['price'].quantile(0.25)
price_percentile75 = data_copy['price'].quantile(0.75)
price_iqr = price_percentile75 - price_percentile25
price_upper_limit = price_percentile75 + 1.5 * price_iqr
price_lower_limit = data_copy['price'].quantile(0.15)
new_df = data_copy[(data_copy['price'] < price_upper_limit) & (data_copy['price'] > price_lower_limit)]
odometer_percentile75 = data_copy['odometer'].quantile(0.75)
odometer_percentile25 = data_copy['odometer'].quantile(0.25)
odometer_iqr = odometer_percentile75 - odometer_percentile25
odometer_upper_limit = odometer_percentile75 + 1.5 * odometer_iqr
odometer_lower_limit = data_copy['odometer'].quantile(0.05)
new_df = new_df[(new_df['odometer'] < odometer_upper_limit) & (new_df['odometer'] > odometer_lower_limit)]


# In[20]:


new_df['odometer'] = new_df['odometer'].astype(int)
new_df['year'] = new_df['year'].astype(int)


# In[22]:


new_df = new_df[new_df['year'] > 1996]
new_df.shape
new_df['car_age'] = 2022 - new_df['year']
new_df.drop(['year'], axis = 1, inplace = True)


# Categorical to Numerical

# In[23]:


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

pipe_categorical = Pipeline(
    steps = [('ordinal_encoder', OrdinalEncoder(categories = [[ 'salvage', 'fair', 'unknown', 'good', 'excellent', 'like new', 'new']])),
             ('one_hot_encoder', OneHotEncoder(sparse = False, drop = 'first'))]
)
pipe_numerical = Pipeline(
    steps = [('standard_scaler', StandardScaler())]
)
column_transformer = ColumnTransformer(transformers = [
    ('condition_pipe_trans', pipe_categorical['ordinal_encoder'], ['condition']),
    ('categorical_pipe_trans', pipe_categorical['one_hot_encoder'], ['model', 'region', 'manufacturer', 'fuel', 'cylinders','title_status', 'transmission', 'drive', 'type', 'paint_color']),
    ('numerical_pipe_trans', pipe_numerical, ['odometer'])
])


# Train & Test Data Split
#

# In[24]:


final_df = new_df.copy()


# In[ ]:


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(final_df.drop(['price'], axis = 1), final_df['price'], random_state = 42, test_size = .2)
X_train_tnf = column_transformer.fit_transform(X_train)
X_test_tnf = column_transformer.transform(X_test)


# Training the Random Forest Regressor

# In[29]:


# Create a Random Forest regressor
rf_regressor = RandomForestRegressor(n_estimators=150, random_state=0, min_samples_leaf=1, max_features=0.3, oob_score=True)
import time
start = time.time()

# Train the model
rf_regressor.fit(X_train_tnf, y_train)
end = time.time()
print("Training time")
print(end - start)


# Evaluation Model

# In[30]:


# Make predictions on the test set
y_pred = rf_regressor.predict(X_test_tnf)

# Calculate the Mean Squared Error and R-squared score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   region  price    year manufacturer                        model  condition  \
0  auburn  15000  2013.0         ford                    f-150 xlt  excellent   
1  auburn  30990  2019.0         ford   ranger supercrew xl pickup       good   
2  auburn  34590  2018.0         ford  f150 super cab xl pickup 4d       good   
3  auburn  38990  2020.0         ford       f150 supercrew cab xlt       good   
4  auburn  27990  2020.0         ford    ranger supercab xl pickup       good   

     cylinders   fuel  odometer title_status transmission drive    type  \
0  6 cylinders    gas  128000.0        clean    automatic   rwd   truck   
1          NaN  other    1834.0        clean        other   NaN  pickup   
2  6 cylinders    gas   20856.0        clean        other   NaN  pickup   
3  6 cylinders    gas   12231.0        clean        other   NaN  pickup   
4        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.fillna('unknown', inplace=True)


Training time
36.211474657058716
Mean Squared Error: 28362706.905802667
R-squared Score: 0.8416423913489381


In [33]:

# Create a Random Forest regressor
rf_regressor = RandomForestRegressor(n_estimators=150, random_state=0, min_samples_leaf=1, max_features=0.3, n_jobs=-1, oob_score=True)
import time
import joblib
ray.shutdown()
ray.init()
start = time.time()
from ray.util.joblib import register_ray
register_ray()

# Train the model
for i in [1, 2]:
  start = time.time()
  with joblib.parallel_backend('ray', n_jobs = i):
    rf_regressor.fit(X_train_tnf, y_train)
  end = time.time()
  print("Number of CPUs: " + str(i))
  print("Training time")
  print(end - start)

  # Make predictions on the test set
  y_pred = rf_regressor.predict(X_test_tnf)

  # Calculate the Mean Squared Error and R-squared score
  mse = mean_squared_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)

  print(f"Mean Squared Error: {mse}")
  print(f"R-squared Score: {r2}")


# Evaluation Model

# In[30]:





2023-12-04 03:36:42,538	INFO worker.py:1664 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


Number of CPUs: 1
Training time
45.13707733154297
Mean Squared Error: 28362706.905802667
R-squared Score: 0.8416423913489381
Number of CPUs: 2
Training time
43.77235007286072
Mean Squared Error: 28362706.905802667
R-squared Score: 0.8416423913489381
