In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/used_car_canada_clean.csv')
df.head()

Unnamed: 0,price,miles,year,make,model,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block,state
0,179999.0,9966.0,2017.0,acura,NSX,coupe,Car,4WD,Automatic,other,3.5,V,NB
1,179995.0,5988.0,2017.0,acura,NSX,coupe,Car,4WD,Automatic,other,3.5,V,QC
2,168528.0,24242.0,2017.0,acura,NSX,coupe,Car,4WD,Automatic,other,3.5,V,BC
3,220000.0,6637.0,2020.0,acura,NSX,coupe,Car,4WD,Automatic,other,3.5,V,ON
4,220000.0,6637.0,2020.0,acura,NSX,coupe,Car,4WD,Automatic,other,3.5,V,ON


In [3]:
cols_to_drop = ['body_type', 'vehicle_type', 'drivetrain', 'transmission', 'fuel_type', 'engine_block']
df = df.drop(cols_to_drop, axis=1)

In [4]:
df.head()

Unnamed: 0,price,miles,year,make,model,engine_size,state
0,179999.0,9966.0,2017.0,acura,NSX,3.5,NB
1,179995.0,5988.0,2017.0,acura,NSX,3.5,QC
2,168528.0,24242.0,2017.0,acura,NSX,3.5,BC
3,220000.0,6637.0,2020.0,acura,NSX,3.5,ON
4,220000.0,6637.0,2020.0,acura,NSX,3.5,ON


In [5]:
df_toyota_honda = df.loc[(df['make'] == 'honda') | (df['make'] == 'toyota')]

In [6]:
df_toyota_honda.to_csv('data/honda_toyota_ca.csv', index=False, header=True)

## Model 

In [7]:
df = pd.read_csv('data/honda_toyota_ca.csv')
df.head()

Unnamed: 0,price,miles,year,make,model,engine_size,state
0,4980.0,86132.0,2001.0,toyota,Prius,1.5,BC
1,18926.0,80516.0,2017.0,toyota,Prius,1.8,ON
2,23900.0,29295.0,2018.0,toyota,Prius,1.8,ON
3,27980.0,57894.0,2018.0,toyota,Prius,1.8,BC
4,22887.0,95106.0,2016.0,toyota,Prius,1.8,AB


In [None]:
from sklearn.model_selection import train_test_split

X = df.drop(['price'], axis=1) # apart from price everything else is a feature
y = df['price'] #price is the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=df[['make', 'model']], test_size=0.2, shuffle=True, random_state=42)

'''
This line of Python code is a crucial step in preparing a dataset for a machine learning model. It uses the train_test_split function from the popular scikit-learn library to divide your data into training and testing sets.
Here is a breakdown of what each part means:

X_train, X_test, y_train, y_test: These are the four variables that will store the split data.
X_train: The features (or independent variables) used to train the model.
X_test: The features reserved for testing the model's performance after it has been trained.
y_train: The target variable (or dependent variable) that corresponds to the X_train data.
y_test: The target variable that corresponds to the X_test data.

train_test_split(X, y, ...): This is the function call itself.
X: The input data
y: The output data

stratify=df[['make', 'model']]: This is a powerful and important parameter. 
It ensures that the proportions of car makes and models in the training set are the same as in the testing set.
For example, if your original dataset has 10% Toyota Corollas, both X_train and X_test will also contain approximately 10% Toyota Corollas. 
This is essential for ensuring your model doesn't learn from a biased sample and can generalize well to new data.

test_size=0.2: This specifies the size of the testing set. In this case, it means 20% of the data will be used for testing, and the remaining 80% will be used for training.

shuffle=True: This tells the function to randomly shuffle the data before splitting. This helps to prevent any order-related biases in your dataset from affecting the split.

random_state=42: This is a seed for the random number generator. 
By setting it to a fixed number (like 42), you ensure that the same random split is generated every time you run the code. 
This is crucial for reproducibility, allowing you to get the exact same results each time you run your script.
'''

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor

cat_index = [2,3,5]

cat_features_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder()),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_features_transformer, cat_index)
    ]
)


model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", GradientBoostingRegressor(random_state=42))
    ]
)


'''
This code block is building a machine learning pipeline to streamline the process of preparing data and training a model. 
It's a best practice in data science because it ensures that all preprocessing steps are applied consistently to both your training and test data.

Here's a breakdown of what each part means:

1. The Core Components
The code starts by importing several key components from scikit-learn:

ColumnTransformer: 
This is used to apply different preprocessing steps to different columns of your data. 
It's a powerful tool when your dataset has a mix of data types, like categorical and numerical features.

Pipeline: 
This chains together multiple steps (like preprocessing and modeling) into a single, cohesive workflow.

OneHotEncoder: 
This is a preprocessing technique that converts categorical features (like car make and model names) into a numerical format that a machine learning model can understand.

GradientBoostingRegressor: 
This is the machine learning model itself. It's an advanced ensemble method often used for regression tasks (like predicting a numerical value such as car price).

2. The Preprocessing Pipeline
The cat_features_transformer and preprocessor sections handle the data preparation.

cat_index = [2, 3, 5]: This line defines the indices (column numbers) of the categorical features in your dataset. In this case, it's likely that the columns at index 2, 3, and 5 are things like car make, model, and maybe color or trim.
cat_features_transformer = Pipeline(...): This creates a small, dedicated pipeline just for handling categorical features. The only step in this pipeline is the OneHotEncoder, which will convert those features into a numerical format.

preprocessor = ColumnTransformer(...): This is the main preprocessing step. It tells the pipeline to:
Apply the cat_features_transformer (the one-hot encoder) only to the columns specified by cat_index.

Any other columns in your data will be left untouched by this preprocessor. This is perfect for numerical columns that don't need this specific type of encoding.

3. The Full Model Pipeline
The model section brings everything together.
model = Pipeline(...): This creates the complete machine learning workflow.
steps=[("preprocessor", preprocessor), ("regressor", GradientBoostingRegressor(...))]: This defines the sequence of operations.
The data first goes through the preprocessor step. This means the columns at indices 2, 3, and 5 will be one-hot encoded.
The preprocessed data is then fed directly into the regressor step, which is the GradientBoostingRegressor model.
GradientBoostingRegressor(random_state=42): This initializes the machine learning model. Setting random_state=42 ensures that the model will be trained in the exact same way every time the code is run, which is crucial for reproducibility.

In short, this code creates a single, unified object (model) that automatically handles both the one-hot encoding of specific columns and the training of a GradientBoostingRegressor. This makes your code cleaner, more organized, and prevents common data leakage errors that can occur when you apply preprocessing steps separately to your training and test data.
'''

In [11]:
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [12]:
model.score(X_test, y_test)

0.4865534075230413

In [13]:
from joblib import dump

dump(model, 'model/model.joblib')

['model/model.joblib']