# Обучение моделей

## Импорт библиотек

In [1]:
import os, sys

# Resolve the repository root as the parent of the current working directory
# (this relies on the kernel being started one level below the project root).
PROJECT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
print(PROJECT_DIR)

# Put the project root on sys.path so that the `src` package can be imported.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry each time.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

/workspaces/street_tree


In [2]:
import pandas as pd 
from src.config import PATH_RAW, PATH_PROCESSED, URL, NAME_DATASET, PATH_MODELS, output_path
from src.dowload_datafraime import download_file
from src.visualization import plot_corr_matrix
from src.preprocessing import (df_fillna, split_problems, convert_to_bool, encode_and_save_categorical, 
                               load_and_encode_categorical, split_and_save)


2025-03-27 17:12:12,951 - src.utils - INFO - Logger is ready
2025-03-27 17:12:12,956 - src.utils - INFO - This is a test log message.
2025-03-27 17:12:12,956 - src.utils - INFO - PROJECT_DIR: /workspaces/street_tree
2025-03-27 17:12:12,957 - src.utils - INFO - PATH_LOGS: /workspaces/street_tree/logs
2025-03-27 17:12:12,958 - src.utils - INFO - PATH_DATA: /workspaces/street_tree/data
2025-03-27 17:12:12,959 - src.utils - INFO - PATH_RAW: /workspaces/street_tree/data/raw
2025-03-27 17:12:12,959 - src.utils - INFO - PATH_PROCESSED: /workspaces/street_tree/data/processed
2025-03-27 17:12:12,960 - src.utils - INFO - PATH_MODELS: /workspaces/street_tree/models
2025-03-27 17:12:12,961 - src.utils - INFO - PATH_REPORTS: /workspaces/street_tree/reports
2025-03-27 17:12:12,962 - src.utils - INFO - PATH_DOCS: /workspaces/street_tree/docs
2025-03-27 17:12:12,962 - src.utils - INFO - PATH_SRC: /workspaces/street_tree/src
2025-03-27 17:12:12,963 - src.utils - INFO - output_path: /workspaces/street_t

## Предобработка

In [3]:
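# Uncomment to (re)download the raw dataset; the next cell expects the CSV to already be in PATH_RAW.
#download_file(URL,PATH_RAW, output_path)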

In [4]:
# Read the raw dataset and keep only the rows that carry a `health` label.
df = pd.read_csv(f'{PATH_RAW}/{NAME_DATASET}.csv').dropna(subset=['health'])

# Hold out 10% of the rows as the test set; `health` is the target column.
# The helper also writes both parts to PATH_PROCESSED (see the log output).
X_train, X_test, y_train, y_test = split_and_save(
    df.drop(columns='health'),
    df['health'],
    output_dir=PATH_PROCESSED,
    size=0.1,
    name_train='train.csv',
    name_test='test.csv',
)

df.shape

2025-03-27 17:12:33,344 - src.utils - INFO - Data successfully saved to: /workspaces/street_tree/data/processed
2025-03-27 17:12:33,347 - src.utils - INFO - Train data shape: (586954, 44)
2025-03-27 17:12:33,348 - src.utils - INFO - Test data shape: (65218, 44)


(652172, 45)

In [5]:
# Columns removed before modelling. The names are given in their normalised
# (lower-case, underscore-separated) form, so the renaming below must run first.
df_drop_columns = [
    'status', 'state', 'tree_id', 'created_at', 'stump_diam', 'address', 'spc_common',
    'borocode', 'x_sp', 'y_sp', 'council_district', 'census_tract', 'nta_name',
    'bin', 'zip_city', 'community_board', 'bbl',
]

# Normalise the column names: lower-case them and replace spaces with underscores.
X_train.columns = X_train.columns.map(lambda name: name.lower().replace(' ', '_'))

X_train = X_train.drop(columns=df_drop_columns)
X_train.shape


(586954, 27)

In [6]:
# Fill missing values with the project helper; the shape is displayed to check
# that no rows or columns were lost in the process.
# NOTE(review): the helper currently triggers pandas FutureWarnings about chained
# `df[col].fillna(..., inplace=True)` calls (see the cell output) — this should be
# fixed inside src.preprocessing, e.g. with `df[col] = df[col].fillna(...)`.
X_train = df_fillna(X_train)
X_train.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['spc_latin'].fillna('No observation',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sidewalk'].fillna('NoDamage',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting

(586954, 27)

In [7]:
# Process the tree-problem information with the project helper (it logs the set
# of unique problems it found).
# NOTE(review): created_columns=False presumably stops the helper from adding
# per-problem columns (the column count stays the same) — confirm in src.preprocessing.
X_train = split_problems(X_train, created_columns=False)
X_train.shape

2025-03-27 17:12:34,604 - src.utils - INFO - {'stones', 'metalgrates', 'branchlights', 'sneakers', 'rootother', 'branchother', 'trunklights', 'noproblem', 'trunkother', 'wiresrope'} - уникальные проблемы


(586954, 27)

In [8]:
# Convert values to booleans with the project helper.
# NOTE(review): which columns are converted is decided inside convert_to_bool —
# verify against src.preprocessing.
X_train = convert_to_bool(X_train)
X_train.shape

2025-03-27 17:12:37,023 - src.utils - INFO - Значения преобразованы в булевые


(586954, 27)

In [9]:
# Collect the names of the object-dtype columns; these are the ones passed to
# the categorical encoder.
categorical_columns = [column for column in X_train.select_dtypes(include='object')]
categorical_columns

['spc_latin', 'steward', 'guards', 'user_type', 'borough', 'nta']

In [10]:
# Encode the categorical columns with the project helper. It returns the encoded
# frame together with the fitted encoders, indexed by column name.
# NOTE(review): judging by the PATH_MODELS / 'label_encoders.pkl' arguments the
# helper presumably also persists the encoders to disk — confirm in src.preprocessing.
X_train, label_encoders = encode_and_save_categorical(X_train, categorical_columns, PATH_MODELS, 'label_encoders.pkl')
X_train.shape

(586954, 27)

In [11]:
# Show the categories learned by the encoder of the 'spc_latin' column next to
# their numeric codes.
# NOTE(review): the `health_*` variable names are misleading — this cell inspects
# the 'spc_latin' feature encoder, not the 'health' target.
spc_latin_encoder = label_encoders['spc_latin']
health_classes = spc_latin_encoder.classes_
health_codes = spc_latin_encoder.transform(health_classes)

# Put categories and codes side by side in a DataFrame for readable output.
health_mapping = pd.DataFrame({'spc_latin': health_classes, 'Code': health_codes})
print(health_mapping)

             spc_latin  Code
0                 Acer     0
1    Acer buergerianum     1
2       Acer campestre     2
3         Acer ginnala     3
4         Acer griseum     4
..                 ...   ...
128   Tsuga canadensis   128
129    Ulmus americana   129
130   Ulmus parvifolia   130
131       Ulmus pumila   131
132    Zelkova serrata   132

[133 rows x 2 columns]


## Обучим модели

In [12]:
# Split the preprocessed training data into training and validation parts
# (20% goes to validation), together with the matching target values.
# NOTE(review): name_train='train.csv' in PATH_PROCESSED is the same file name and
# directory used by the first split in this notebook, so this call appears to
# overwrite that earlier train.csv — confirm this is intended.
X_train, X_val, y_train, y_val = split_and_save(X_train, y_train, output_dir=PATH_PROCESSED, size=0.2, name_train='train.csv', name_test='valid.csv')

2025-03-27 17:12:43,120 - src.utils - INFO - Data successfully saved to: /workspaces/street_tree/data/processed
2025-03-27 17:12:43,121 - src.utils - INFO - Train data shape: (469563, 27)
2025-03-27 17:12:43,123 - src.utils - INFO - Test data shape: (117391, 27)


In [13]:
# Inspect the distinct target labels before they are mapped to integer codes.
y_train.unique()

array(['Poor', 'Good', 'Fair'], dtype=object)

In [14]:
import joblib

# Ordinal encoding of the target classes.
target_mapping = {'Poor': 0, 'Fair': 1, 'Good': 2}


def _encode_target(labels, mapping):
    """Map class labels to their integer codes and validate the result.

    Series.map silently turns every label that is missing from ``mapping`` into
    NaN. That also happens when the cell is re-run on targets that were already
    encoded, which would wipe out y_train / y_val without any error.

    Args:
        labels: pandas Series with the raw (or already encoded) target values.
        mapping: dict from raw label to integer code.

    Returns:
        The encoded Series; ``labels`` itself when it is already fully encoded.

    Raises:
        ValueError: if at least one label has no entry in ``mapping``.
    """
    # Already encoded (cell re-run): leave the values as they are.
    if not labels.empty and labels.isin(list(mapping.values())).all():
        return labels
    encoded = labels.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"Labels missing from target_mapping: {labels[unmapped].unique().tolist()}"
        )
    return encoded


y_train = _encode_target(y_train, target_mapping)
y_val = _encode_target(y_val, target_mapping)

# Save the mapping to a file so that predictions can be decoded later.
joblib.dump(target_mapping, f'{PATH_MODELS}/label_encoders_target.pkl')

print("Target label encoder saved to label_encoders_target.pkl")

Target label encoder saved to label_encoders_target.pkl


In [15]:
from sklearn.preprocessing import StandardScaler
import joblib

# With scikit-learn's default output settings fit_transform returns a NumPy array,
# so after the first run X_train is no longer a DataFrame. Re-running this cell
# would then fit a new scaler on already standardised data and overwrite
# scaler.pkl with a near-identity transform that is useless at inference time.
# Fail loudly instead of silently corrupting the saved artefact.
if not isinstance(X_train, pd.DataFrame):
    raise RuntimeError(
        "X_train has already been scaled; re-run the notebook from the "
        "preprocessing cells before fitting the scaler again."
    )

# Fit on the training part only and reuse the same statistics for validation,
# so no information from the validation set leaks into the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Save the scaler
joblib.dump(scaler, f'{PATH_MODELS}/scaler.pkl')
print("Scaler saved to scaler.pkl")

Scaler saved to scaler.pkl


In [16]:
# Sanity check: scaling must not change the number of rows or features.
X_train.shape

(469563, 27)

In [17]:
%load_ext autoreload
%autoreload 2

from src.modeling import TabularNN

# Instantiate the model
model = TabularNN(X_train, y_train, X_val, y_val, hidden_dims=[1024, 524, 128, 64], model_path=f'{PATH_MODELS}/tabular_model.pth')

# Train the model
model.train_model(epochs=1000, learning_rate=0.0001)



2025-03-27 17:12:50,247 - src.utils - INFO - Unique classes during training: [0 1 2]
2025-03-27 17:12:50,250 - src.utils - INFO - Output dimension during training: 3
2025-03-27 17:14:08,687 - src.utils - INFO - Epoch 1/1000 - Train Loss: 7792.5862, Val Loss: 1896.4180, LR: 0.000100
2025-03-27 17:14:08,730 - src.utils - INFO - Accuracy: 0.6011, F1-Score: 0.6549, AUC-ROC: 0.6590
2025-03-27 17:14:08,732 - src.utils - INFO - Confusion Matrix:
2120	945	1762
5092	4003	8276
17163	13585	64445
2025-03-27 17:14:08,887 - src.utils - INFO - Best model saved based on highest AUC-ROC.
2025-03-27 17:15:47,931 - src.utils - INFO - Epoch 2/1000 - Train Loss: 7629.7644, Val Loss: 1885.0213, LR: 0.000100
2025-03-27 17:15:47,932 - src.utils - INFO - Accuracy: 0.6139, F1-Score: 0.6637, AUC-ROC: 0.6604
2025-03-27 17:15:47,933 - src.utils - INFO - Confusion Matrix:
2277	801	1749
5518	3508	8345
16783	12123	66287
2025-03-27 17:15:47,943 - src.utils - INFO - Best model saved based on highest AUC-ROC.
2025-03-27

In [18]:
import pandas as pd
import numpy as np
from src.modeling import TabularNN
from src.config import PATH_MODELS
import torch

# Define the path to the saved model
MODEL_PATH = f"{PATH_MODELS}/tabular_model.pth"

# Load the checkpoint to get the parameters.
# map_location='cpu' lets a checkpoint written on another device (e.g. a GPU)
# be restored on a machine that does not have that device.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
checkpoint = torch.load(MODEL_PATH, map_location='cpu')
input_dim = checkpoint['input_dim']
hidden_dims = checkpoint['hidden_dims']
output_dim = checkpoint['output_dim']
target_mapping = checkpoint['target_mapping']

# Create dummy data with the correct shape: one row per class, so the number of
# distinct dummy labels equals the stored output_dim instead of a hard-coded 3.
# NOTE(review): this assumes TabularNN sizes its output layer from the unique
# labels it is given (as its "Unique classes" log suggests) — confirm in src.modeling.
dummy_X = pd.DataFrame(np.zeros((output_dim, input_dim)))
dummy_y = pd.Series(np.arange(output_dim))

# Initialize the model using dummy data, then restore the trained weights.
loaded_model = TabularNN(dummy_X, dummy_y, dummy_X, dummy_y, hidden_dims=hidden_dims)
loaded_model.load_state_dict(checkpoint['model_state_dict'])
loaded_model.eval()

print("Model loaded successfully!")

# Smoke test: get predictions for dummy_X and show them (previously the result
# of predict() was computed and then discarded).
dummy_predictions = loaded_model.predict(dummy_X)
print(dummy_predictions)

loaded_model

2025-03-27 17:42:32,010 - src.utils - INFO - Unique classes during training: [0 1 2]
2025-03-27 17:42:32,011 - src.utils - INFO - Output dimension during training: 3


Model loaded successfully!


TabularNN(
  (model): Sequential(
    (0): Linear(in_features=27, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=1024, out_features=524, bias=True)
    (5): BatchNorm1d(524, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=524, out_features=128, bias=True)
    (9): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.3, inplace=False)
    (12): Linear(in_features=128, out_features=64, bias=True)
    (13): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (14): ReLU()
    (15): Dropout(p=0.3, inplace=False)
    (16): Linear(in_features=64, out_features=3, bias=True)
  )
)

In [19]:
%load_ext autoreload
%autoreload 2
from src.modeling_catboost import CatBoostModelTrain

catboost_model = CatBoostModelTrain(model_dir=PATH_MODELS)
catboost_model.train(X_train, y_train, X_val, y_val)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


0:	learn: 1.0665870	test: 1.0666001	best: 1.0666001 (0)	total: 412ms	remaining: 6m 51s
100:	learn: 0.5502277	test: 0.5516981	best: 0.5516981 (100)	total: 34s	remaining: 5m 2s
200:	learn: 0.5351898	test: 0.5379727	best: 0.5379727 (200)	total: 1m 7s	remaining: 4m 27s
300:	learn: 0.5281540	test: 0.5319916	best: 0.5319916 (300)	total: 1m 41s	remaining: 3m 54s
400:	learn: 0.5230962	test: 0.5280030	best: 0.5280030 (400)	total: 2m 14s	remaining: 3m 20s
500:	learn: 0.5185345	test: 0.5245093	best: 0.5245093 (500)	total: 2m 47s	remaining: 2m 46s
600:	learn: 0.5147071	test: 0.5216839	best: 0.5216839 (600)	total: 3m 20s	remaining: 2m 12s
700:	learn: 0.5113475	test: 0.5193132	best: 0.5193132 (700)	total: 3m 52s	remaining: 1m 39s
800:	learn: 0.5084351	test: 0.5173442	best: 0.5173442 (800)	total: 4m 26s	remaining: 1m 6s
900:	learn: 0.5057840	test: 0.5156805	best: 0.5156805 (900)	total: 4m 59s	remaining: 32.9s


2025-03-27 17:48:04,985 - src.utils - INFO - Сохраняем модель в /workspaces/street_tree/models/catboost_model.cbm


999:	learn: 0.5035322	test: 0.5143287	best: 0.5143287 (999)	total: 5m 31s	remaining: 0us

bestTest = 0.5143287016
bestIteration = 999



2025-03-27 17:48:05,756 - src.utils - INFO - CatBoost Accuracy: 0.8163, F1-Score: 0.7479, AUC-ROC: 0.7500
2025-03-27 17:48:05,757 - src.utils - INFO - Confusion Matrix:
[[  197   392  4238]
 [  102   976 16293]
 [   37   500 94656]]
