In [1]:
# Pickled dataset loaded by this notebook (the file name suggests it is the
# output of an earlier cleaning step).
DATA_PATH = "../data/processed/02_cleaned_data.pkl"

# Local MLflow tracking location and the experiment name used by this notebook.
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_EXPERIMENT_NAME = "car_price_prediction"

# Scratch directory (created at start-up) and pickle file names.
# NOTE(review): the LOG_*_PKL names are not used in the cells visible here —
# presumably temporary artifacts written under LOG_PATH; confirm.
LOG_PATH = "../models/temp/"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

In [2]:
# Load packages
import pandas as pd

from pathlib import Path

from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

import mlflow
from mlflow.tracking import MlflowClient

# Render up to 100 columns so the wide one-hot encoded frame is not truncated.
pd.set_option("display.max_columns", 100)

___
## Initialize
### Create directories:

In [3]:
# Ensure the MLflow store and the scratch directory exist; existing
# directories are left untouched.
for directory in (MLFLOW_TRACKING_URI, LOG_PATH):
    Path(directory).mkdir(parents=True, exist_ok=True)

___
## Read data:

In [4]:
# Load the pickled dataset, then preview five randomly chosen rows.
# NOTE: pickle files can execute code on load — only read trusted files.
df = pd.read_pickle(DATA_PATH)
df.sample(n=5)

Unnamed: 0,year,engine_hp,engine_cylinders,transmission_type,msrp,speed,acura,alfa_romeo,aston_martin,audi,bentley,bmw,bugatti,buick,cadillac,chevrolet,chrysler,dodge,ferrari,fiat,ford,genesis,gmc,honda,hummer,hyundai,infiniti,kia,lamborghini,land_rover,lexus,lincoln,lotus,maserati,maybach,mazda,mclaren,mercedes-benz,mitsubishi,nissan,oldsmobile,plymouth,pontiac,porsche,rolls-royce,saab,scion,spyker,subaru,suzuki,tesla,toyota,volkswagen,volvo
9011,26,285.0,8.0,2,29555,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4709,26,130.0,4.0,2,19925,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4521,27,282.0,6.0,2,35330,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1513,27,335.0,6.0,2,51945,23.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9960,25,308.0,6.0,2,48065,19.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


___
## Splitting data into train and test:

In [5]:
# Hold out 20% of the rows for testing. `msrp` is the target column; every
# other column is a feature. The fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="msrp"),
    df["msrp"],
    test_size=0.2,
    random_state=42,
)

In [6]:
# Quick check of the split data: report both feature-matrix shapes,
# then preview five random training rows.
for label, frame in (("X_train shape", X_train), ("X_test shape", X_test)):
    print(label, frame.shape)
X_train.sample(n=5)

X_train shape (7487, 53)
X_test shape (1872, 53)


Unnamed: 0,year,engine_hp,engine_cylinders,transmission_type,speed,acura,alfa_romeo,aston_martin,audi,bentley,bmw,bugatti,buick,cadillac,chevrolet,chrysler,dodge,ferrari,fiat,ford,genesis,gmc,honda,hummer,hyundai,infiniti,kia,lamborghini,land_rover,lexus,lincoln,lotus,maserati,maybach,mazda,mclaren,mercedes-benz,mitsubishi,nissan,oldsmobile,plymouth,pontiac,porsche,rolls-royce,saab,scion,spyker,subaru,suzuki,tesla,toyota,volkswagen,volvo
4018,20,285.0,6.0,2,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9204,26,266.0,6.0,2,19.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
566,28,101.0,4.0,2,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1027,26,220.0,4.0,3,27.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7410,28,287.0,6.0,2,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


___ 
## Trying the base model:

### Initialize MLflow

In [7]:
# Initialize MLflow: point the tracking API at the local store, then create
# a client. The URI is set first because MlflowClient() receives no explicit
# URI here (presumably it picks up the globally configured one — confirm
# against the MLflow documentation).
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

In [8]:
# Create the experiment only when it does not exist yet.
# The lookup is explicit rather than relying on create_experiment() raising,
# so genuine failures (unwritable store, bad path, interrupts, ...) surface
# as errors instead of being reported as "experiment exists".
if mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME) is None:
    mlflow.create_experiment(MLFLOW_EXPERIMENT_NAME)
    print(f'Experiment "{MLFLOW_EXPERIMENT_NAME}" created at "{mlflow.get_tracking_uri()}"')
else:
    print(f'Experiment "{MLFLOW_EXPERIMENT_NAME}" exists at "{mlflow.get_tracking_uri()}"')

Experiment "car_price_prediction" exists at "../models/mlruns"


In [9]:
# Fetch the experiment's metadata through the client and print its summary.
exp = client.get_experiment_by_name(name=MLFLOW_EXPERIMENT_NAME)
print(exp)

<Experiment: artifact_location='../models/mlruns/1', experiment_id='1', lifecycle_stage='active', name='car_price_prediction', tags={}>


**Now constructing the baseline model**
### Linear Regression:

In [10]:
# Baseline model: robust feature scaling followed by ordinary least squares.
lr = make_pipeline(RobustScaler(), LinearRegression())

# Fit on plain NumPy arrays; the fitted pipeline is the cell's displayed value.
lr.fit(X_train.to_numpy(), y_train.to_numpy())

Pipeline(steps=[('robustscaler', RobustScaler()),
                ('linearregression', LinearRegression())])

In [15]:
# Evaluate on the training set: predict for the same rows the model was
# fitted on and wrap the result in a single-column DataFrame for display.
predictions = pd.DataFrame(data=lr.predict(X_train.to_numpy()))
predictions

Unnamed: 0,0
0,42892.848897
1,54588.956639
2,28300.945691
3,46545.692127
4,67127.517068
...,...
7482,23231.735707
7483,46409.433311
7484,23764.558825
7485,38095.014251


In [16]:
# Display the true training targets next to the predictions. y_train keeps the
# original row labels while `predictions` is indexed 0..n-1, so compare the
# two by position, not by index label.
y_train

4910    28330
8248    44100
6809    31025
6770    55230
1128    70950
        ...  
7387    22995
6692    41225
6945    24250
1197    29990
9408    33115
Name: msrp, Length: 7487, dtype: int64

In [18]:
# Mean squared error of the baseline on the training data (units: price squared).
mean_squared_error(y_true=y_train.values, y_pred=predictions.values)

111766747.128854