In [45]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource,  HoverTool, Legend
from bokeh.io import output_notebook
output_notebook()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error,  mean_squared_error
import mlflow

## Создание baseline-модели

In [36]:
# Load the cleaned car dataset from the project's data directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
clean_data_path = '../data/CleanCarData.pkl'
df = pd.read_pickle(clean_data_path)
df

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.349609,5.589844,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.750000,9.539062,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.250000,9.851562,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.849609,4.148438,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.601562,6.871094,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...,...
296,city,2016,9.500000,11.601562,33988,Diesel,Dealer,Manual,0
297,brio,2015,4.000000,5.898438,60000,Petrol,Dealer,Manual,0
298,city,2009,3.349609,11.000000,87934,Petrol,Dealer,Manual,0
299,city,2017,11.500000,12.500000,9000,Diesel,Dealer,Manual,0


In [37]:
# Mark 'Present_Price' as the prediction target and remove the 'Year' column.
# Both steps are written to be safe on re-execution: `rename` ignores a missing
# source column by default, and `errors='ignore'` keeps `drop` from raising a
# KeyError once 'Year' has already been removed by a previous run of this cell.
df = (
    df
    .rename(columns={'Present_Price': 'target'})
    .drop(columns=['Year'], errors='ignore')
)
# Fail early if the target column is absent instead of failing later in the split.
assert 'target' in df.columns, "expected a 'target' column after renaming 'Present_Price'"
df

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,Car_Name,Selling_Price,target,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,3.349609,5.589844,27000,Petrol,Dealer,Manual,0
1,sx4,4.750000,9.539062,43000,Diesel,Dealer,Manual,0
2,ciaz,7.250000,9.851562,6900,Petrol,Dealer,Manual,0
3,wagon r,2.849609,4.148438,5200,Petrol,Dealer,Manual,0
4,swift,4.601562,6.871094,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...
296,city,9.500000,11.601562,33988,Diesel,Dealer,Manual,0
297,brio,4.000000,5.898438,60000,Petrol,Dealer,Manual,0
298,city,3.349609,11.000000,87934,Petrol,Dealer,Manual,0
299,city,11.500000,12.500000,9000,Diesel,Dealer,Manual,0


In [38]:
# Separate the predictors from the target, then hold out 25% of the rows for testing.
# The fixed random_state keeps the split identical between runs.
feature_frame = df.drop(columns='target')
target_series = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.25,
    random_state=2,
)

In [39]:
# Names of the categorical predictors (category/object dtype) in the training split.
categorical_frame = X_train.select_dtypes(include=['category', 'object'])
cat_features = categorical_frame.columns.to_list()
cat_features

['Car_Name', 'Fuel_Type', 'Selling_type', 'Transmission']

In [40]:
# Names of the numeric predictors in the training split.
numeric_frame = X_train.select_dtypes(include=['number'])
num_features = numeric_frame.columns.to_list()
num_features

['Selling_Price', 'Driven_kms', 'Owner']

In [41]:
# Seed for the random forest so that training (and therefore the logged metrics)
# is reproducible between runs; same value as the train/test split seed.
RANDOM_STATE = 2
# Integer code assigned by the encoder to categories that were not seen during fit.
UNKNOWN_CATEGORY_CODE = 99999999

# Building blocks of the baseline pipeline: scaler for numeric columns,
# ordinal encoder for categorical columns, and the regression model itself.
s_scaler = StandardScaler()
l_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=UNKNOWN_CATEGORY_CODE,
)
regressor = RandomForestRegressor(random_state=RANDOM_STATE)

In [42]:
# Route each column group through its own transformer so the columns can be
# handled conveniently in one object.
column_transformers = [
    ('num', s_scaler, num_features),   # transformation for numeric features
    ('cat', l_encoder, cat_features),  # transformation for categorical features
]
# Columns that are not covered by any transformer are dropped.
preprocessor = ColumnTransformer(
    transformers=column_transformers,
    remainder='drop',
)

In [43]:
# Chain preprocessing and the regressor into a single estimator so the same
# transformations are applied at fit time and at prediction time.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', regressor),
]
pipeline = Pipeline(pipeline_steps)

# Fit the whole pipeline on the training split; the fitted pipeline is the cell output.
pipeline.fit(X_train, y_train)

In [44]:
# Score the fitted pipeline on the held-out test split.
predictions = pipeline.predict(X_test)

# Regression error metrics of the baseline, collected in one dict.
metrics = {
    "mae": mean_absolute_error(y_test, predictions),
    "mape": mean_absolute_percentage_error(y_test, predictions),
    "mse": mean_squared_error(y_test, predictions),
}

metrics


{'mae': np.float64(1.3623642663788376),
 'mape': np.float64(0.9051489037186355),
 'mse': np.float64(3.607699767238424)}

## Логируем с MLflow

In [46]:
# MLflow is used locally: the tracking server and the model registry
# are reached at the same host and port.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
registry_uri = tracking_uri  # identical address for the registry

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_registry_uri(registry_uri)

In [48]:
# Names of the experiment, of the run inside it, and of the model as it is to be
# registered in the model registry.
# NOTE(review): REGISTRY_MODEL_NAME is not referenced in the cells visible here — confirm it is used later.
EXPERIMENT_NAME = "Predict_price_project"
RUN_NAME = "baseline_model"
REGISTRY_MODEL_NAME = "car_model_rf"

In [54]:
# The model signature and an input example are logged together with the model;
# prepare both here.
from mlflow.models import infer_signature

# A few training rows serve as the input example stored next to the model.
input_example = X_train.head(5)
# Infer the signature from the inputs AND the pipeline's predictions for them,
# so the logged schema describes the model output as well as its input.
signature = infer_signature(
    model_input=input_example,
    model_output=pipeline.predict(input_example),
)
req_file = '../requirements.txt'
# Parameters to log can be specified manually or taken in full from the model.
params_dict = pipeline.get_params()






In [61]:
# Get the experiment id, creating the experiment on first use.
# `get_experiment_by_name` returns None for an unknown name, so the lookup is
# guarded instead of reading `.experiment_id` unconditionally; this keeps the
# cell runnable both on a fresh tracking server and on later runs.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # Unique identifier of this run; used below to read the run back.
    run_id = run.info.run_id
    # Log the fitted pipeline together with its signature, input example
    # and the pip requirements file.
    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        pip_requirements=req_file,
    )
    mlflow.log_metrics(metrics)
    mlflow.log_params(params_dict)

# Read the run back from the tracking server and verify that it completed.
run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED', f"MLflow run {run_id} ended with status {run.info.status}"

2024/10/22 18:22:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline_model at: http://127.0.0.1:5000/#/experiments/1/runs/3a9238cf2c3242b488318e9b3b7dc5f9.
2024/10/22 18:22:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
