In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer # т.н. преобразователь колонок
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import root_mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import pickle
import mlflow
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from mlflow.models import infer_signature
from sklearn.model_selection import GridSearchCV

In [3]:
def preprocessing_data_frame(frame):
    df = frame.copy()
    cat_columns = ['Make', 'Model', 'Style', 'Fuel_type', 'Transmission']
    num_columns = ['Year', 'Distance', 'Engine_capacity(cm3)', 'Price(euro)']
    
    question_dist = df[(df.Year <2021) & (df.Distance < 1100)]
    df = df.drop(question_dist.index)
    # Анализ и очистка данных
    # анализ гистограмм
    question_dist = df[(df.Distance > 1e6)]
    df = df.drop(question_dist.index)
    
    # здравый смысл
    question_engine = df[df["Engine_capacity(cm3)"] < 200]
    df = df.drop(question_engine.index)
    
    # здравый смысл
    question_engine = df[df["Engine_capacity(cm3)"] > 5000]
    df = df.drop(question_engine.index)
    
    # здравый смысл
    question_price = df[(df["Price(euro)"] < 101)]
    df = df.drop(question_price.index)
    
    # анализ гистограмм
    question_price = df[df["Price(euro)"] > 1e5]
    df = df.drop(question_price.index)
    
    #анализ гистограмм
    question_year = df[df.Year < 1971]
    df = df.drop(question_year.index)
    
    df = df.reset_index(drop=True)  # обновим индексы в датафрейме DF. если бы мы прописали drop = False, то была бы еще одна колонка - старые индексы
    # Разделение данных на признаки и целевую переменную
    
    
    # Предварительная обработка категориальных данных
    # Порядковое кодирование. Обучение, трансформация и упаковка в df
    
    ordinal = OrdinalEncoder()
    ordinal.fit(df[cat_columns]);
    Ordinal_encoded = ordinal.transform(df[cat_columns])
    df_ordinal = pd.DataFrame(Ordinal_encoded, columns=cat_columns)
    df[cat_columns] = df_ordinal[cat_columns]
    return df

def scale_frame(frame):
    df = frame.copy()
    X,y = df.drop(columns = ['Price(euro)']), df['Price(euro)']
    scaler = StandardScaler()
    power_trans = PowerTransformer()
    X_scale = scaler.fit_transform(X.values)
    Y_scale = power_trans.fit_transform(y.values.reshape(-1,1))
    return X_scale, Y_scale, power_trans

In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/dayekb/Basic_ML_Alg/main/cars_moldova_no_dup.csv', delimiter = ',')
df.head()

Unnamed: 0,Make,Model,Year,Style,Distance,Engine_capacity(cm3),Fuel_type,Transmission,Price(euro)
0,Toyota,Prius,2011,Hatchback,195000.0,1800.0,Hybrid,Automatic,7750.0
1,Renault,Grand Scenic,2014,Universal,135000.0,1500.0,Diesel,Manual,8550.0
2,Volkswagen,Golf,1998,Hatchback,1.0,1400.0,Petrol,Manual,2200.0
3,Renault,Laguna,2012,Universal,110000.0,1500.0,Diesel,Manual,6550.0
4,Opel,Astra,2006,Universal,200000.0,1600.0,Metan/Propan,Manual,4100.0


In [5]:
df_proc = preprocessing_data_frame(df)
X,Y, power_trans = scale_frame(df_proc)
# разбиваем на тестовую и валидационную выборки
X_train, X_val, y_train, y_val = train_test_split(X, Y,
                                                  test_size=0.3,
                                                  random_state=42)

In [6]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

In [7]:
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1 ],
      'l1_ratio': [0.001, 0.05, 0.01, 0.2]
 }
with mlflow.start_run():

    lr = SGDRegressor(random_state=42)
    clf = GridSearchCV(lr, params, cv = 5)
    clf.fit(X_train, y_train.reshape(-1))
    best = clf.best_estimator_
    y_pred = best.predict(X_val)
    y_price_pred = power_trans.inverse_transform(y_pred.reshape(-1,1))
    (rmse, mae, r2)  = eval_metrics(power_trans.inverse_transform(y_val), y_price_pred)
    alpha = best.alpha
    l1_ratio = best.l1_ratio
    mlflow.log_param("alpha", alpha)
    mlflow.log_param("l1_ratio", l1_ratio)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    
    predictions = best.predict(X_train)
    signature = infer_signature(X_train, predictions)
    mlflow.sklearn.log_model(lr, "model", signature=signature)

### Задание. Лабораторная работа №2

Провести обучение модели на своей обучающей выборке с трекингом параметров обучения (5 баллов) и записью метрик в mlflow (5 баллов).
Провести сравнение запусков обучения и обосновать выбор модели, результаты обучения (5 баллов). 

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
import matplotlib.pyplot as plt
import pickle
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlflow.models import infer_signature
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [38]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Предобработка данных

In [39]:
cat_columns = ['Name', 'Ticket', 'Cabin', 'Embarked']
num_columns = ['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']

# Анализ и очистка данных
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})

# Заполнение пропусков в столбце 'Age' медианой
df['Age'] = df['Age'].fillna(df['Age'].median())

# Заполнение пропусков в столбце 'Embarked' наиболее частым значением
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Заполнение пропусков в столбце 'Fare' средним значением
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

# Заполнение пропусков в столбце 'Cabin' значением "Unknown"
df['Cabin'] = df['Cabin'].fillna('Unknown')

# Перезапуск индексов в датафрейме
df = df.reset_index(drop=True)

# Предварительная обработка категориальных данных
ordinal = OrdinalEncoder()
ordinal.fit(df[cat_columns])
Ordinal_encoded = ordinal.transform(df[cat_columns])
df_ordinal = pd.DataFrame(Ordinal_encoded, columns=cat_columns)
df[cat_columns] = df_ordinal[cat_columns]

In [40]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,108.0,1,22.0,1,0,523.0,7.25,147.0,2.0
1,2,1,1,190.0,0,38.0,1,0,596.0,71.2833,81.0,0.0
2,3,1,3,353.0,0,26.0,0,0,669.0,7.925,147.0,2.0
3,4,1,1,272.0,0,35.0,1,0,49.0,53.1,55.0,2.0
4,5,0,3,15.0,1,35.0,0,0,472.0,8.05,147.0,2.0


In [41]:
# Разделение данных на признаки и целевую переменную
X, y = df.drop(columns=['Survived']), df['Survived']

# Стандартизация числовых признаков
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Разделение данных на обучающую и валидационную выборки
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

### Обучение

In [42]:
# Пример входных данных (из обучающей выборки X_train)
input_example = X_train[:5]  # Пример первых 5 строк из обучающего набора

# Функция для вычисления метрик и логирования модели
def log_metrics_and_params(model_name, model, X_val, y_val, params, input_example):
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    
    mlflow.log_param("model", model_name)
    for k, v in params.items():
        mlflow.log_param(k, v)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)
    
    # Логирование модели с примером входных данных
    mlflow.sklearn.log_model(model, "model", input_example=input_example)

In [43]:
# Настройка эксперимента
mlflow.set_experiment("Titanic_Classification")

# 1. SGDClassifier (с GridSearchCV)
param_grid = {
    "alpha": [0.0001, 0.001, 0.01, 0.02, 0.1, 0.2],
    "penalty": ["l2", "l1"],
    "loss": ["log_loss"]
}

with mlflow.start_run(run_name="SGDClassifier"):
    grid_search = GridSearchCV(SGDClassifier(random_state=42, max_iter=1000), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    best_sgd = grid_search.best_estimator_
    log_metrics_and_params("SGDClassifier", best_sgd, X_val, y_val, grid_search.best_params_, input_example)

    print("SGDClassifier")
    print(f"Best params: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

# 2. LogisticRegression (без GridSearch, с дефолтными параметрами)
with mlflow.start_run(run_name="LogisticRegression"):
    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train, y_train)
    log_metrics_and_params("LogisticRegression", lr_model, X_val, y_val, {"C": lr_model.C}, input_example)

    print("LogisticRegression")
    print(f"Best params: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

# 3. RandomForestClassifier (с дефолтными параметрами)
with mlflow.start_run(run_name="RandomForestClassifier"):
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train, y_train)
    log_metrics_and_params("RandomForestClassifier", rf_model, X_val, y_val, {"n_estimators": rf_model.n_estimators}, input_example)

    print("RandomForestClassifier")
    print(f"Best params: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

Downloading artifacts: 100%|███████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1239.14it/s]


SGDClassifier
Best params: {'alpha': 0.01, 'loss': 'log_loss', 'penalty': 'l2'}
Accuracy: 0.810, Precision: 0.788, Recall: 0.739, F1: 0.763


Downloading artifacts: 100%|███████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1240.13it/s]


LogisticRegression
Best params: {'alpha': 0.01, 'loss': 'log_loss', 'penalty': 'l2'}
Accuracy: 0.810, Precision: 0.788, Recall: 0.739, F1: 0.763


Downloading artifacts: 100%|███████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1304.89it/s]


RandomForestClassifier
Best params: {'alpha': 0.01, 'loss': 'log_loss', 'penalty': 'l2'}
Accuracy: 0.810, Precision: 0.788, Recall: 0.739, F1: 0.763


В итоге самой эффективной моделью стала LogisticRegression. Все модели получили один и тот же результат, но она сделала это за меньшее время.

### Работу выполнил

Смирнов Роман Евгеньевич РИ-230917