In [26]:
# нью рок нью джаз
import numpy as np
import pandas as pd

In [70]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

imputer = SimpleImputer(strategy="most_frequent")

In [71]:
path_train = "train"

wag_prob = pd.read_parquet(path_train + "/wagons_probeg_ownersip.parquet").convert_dtypes()
wag_param = pd.read_parquet(path_train + "/wag_params.parquet").convert_dtypes()
target = pd.read_csv(path_train +'/target/y_train.csv').convert_dtypes()
to_predict = pd.read_csv(path_train +'/target/y_predict.csv').convert_dtypes()

In [72]:
train_target = target[target["month"] == "2022-12-01"].drop("month", axis=1)
train = wag_prob[wag_prob["repdate"] < "2022-12-01"]

features = train.groupby("wagnum").agg(
    {
        "ost_prob": ["mean", "last", "max"],
        "manage_type": "last",
        "rod_id": "last",
        "reestr_state": "last",
    }
)
features.columns = ["_".join(col).strip() for col in features.columns.values]
features = features.reset_index()

m_train = wag_param.drop(
    ["model", "tipvozd", "date_build", "srok_sl", "date_iskl"], axis=1
)

merged = features.merge(m_train, on="wagnum", how="inner")
merged = train_target.merge(merged, on="wagnum", how="left")


column_names = merged.columns

merged = imputer.fit_transform(merged)

merged = pd.DataFrame(merged, columns=column_names)

X_train = merged.drop(["target_day", "wagnum"], axis=1)
y_name = "target_month"

In [73]:
logreg = LogisticRegression()
logreg.fit(X_train.drop(y_name, axis=1), X_train[y_name])

rf = RandomForestClassifier()
rf.fit(X_train.drop(y_name, axis=1), X_train[y_name])

In [74]:
test_target = target[target["month"] == "2023-01-01"].drop("month", axis=1)
test = wag_prob[wag_prob["repdate"] < "2023-01-01"]

test_features = test.groupby("wagnum").agg(
    {
        "ost_prob": ["mean", "last", "max"],
        "manage_type": "last",
        "rod_id": "last",
        "reestr_state": "last",
    }
)
test_features.columns = ["_".join(col).strip() for col in test_features.columns.values]
test_features = test_features.reset_index()

m_test = wag_param.drop(
    ["model", "tipvozd", "date_build", "srok_sl", "date_iskl"], axis=1
)

merged = test_features.merge(m_test, on="wagnum", how="inner")
merged = test_target.merge(merged, on="wagnum", how="left")
column_names = merged.columns
merged = imputer.fit_transform(merged)
merged = pd.DataFrame(merged, columns=column_names)

X_test = merged.drop(["target_day", "wagnum"], axis=1)

In [75]:
y_pred_logreg = logreg.predict(X_test.drop(y_name, axis=1))
y_pred_rf = rf.predict(X_test.drop(y_name, axis=1))

f1_logreg = f1_score(X_test[y_name], y_pred_logreg)
print(f"Logistic Regression F1 Score: {f1_logreg}")

f1_rf = f1_score(X_test[y_name], y_pred_rf)
print(f"Random Forest F1 Score: {f1_rf}")

Logistic Regression F1 Score: 0.027972027972027975
Random Forest F1 Score: 0.48882374496152436


In [76]:
import h2o
import pandas as pd
from h2o.automl import H2OAutoML

h2o.init()

train = h2o.H2OFrame(X_train)
test = h2o.H2OFrame(X_test)

x = train.columns
x.remove(y_name)

aml = H2OAutoML(max_models=15, seed=42)
aml.train(x=x, y=y_name, training_frame=train)

lb = aml.leaderboard
print(lb.head(rows=lb.nrows))

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "17.0.7" 2023-04-18; OpenJDK Runtime Environment (build 17.0.7+7-Ubuntu-0ubuntu122.04.2); OpenJDK 64-Bit Server VM (build 17.0.7+7-Ubuntu-0ubuntu122.04.2, mixed mode, sharing)
  Starting server from /home/linreg/.local/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpsvfwmq1w
  JVM stdout: /tmp/tmpsvfwmq1w/h2o_linreg_started_from_python.out
  JVM stderr: /tmp/tmpsvfwmq1w/h2o_linreg_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Moscow
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.3
H2O_cluster_version_age:,2 months and 19 days
H2O_cluster_name:,H2O_from_python_linreg_csnson
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.865 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
11:02:24.942: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

██
11:02:36.718: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.


11:02:38.781: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

██
11:02:48.435: _response param, We have detected that your response column has only 2 

In [77]:
y_pred = aml.predict(test).as_data_frame().squeeze()
y_pred_labels = (y_pred > 0.4).astype(int)
result = f1_score(X_test[y_name], y_pred_labels)
print(f"F1: {result}")

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
F1: 0.5753688261706221


In [None]:
h2o.save_model(aml.leader, path=f"saved_models/h20_{result}")
h2o.shutdown()

In [33]:
feature_importance = logreg.coef_

avg_feature_importance = np.mean(np.abs(feature_importance), axis=0)

feature_importance_df = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": avg_feature_importance}
)

feature_importance_df = feature_importance_df.sort_values(
    by="Importance", ascending=False
)

print(feature_importance_df)

                Feature  Importance
9          cnsi_volumek    0.005600
13       cnsi_probeg_kr    0.002640
12       cnsi_probeg_dr    0.001805
10                 tara    0.001550
7                  gruz    0.001122
8    cnsi_gruz_capacity    0.001122
14                kuzov    0.000475
17              tippogl    0.000218
1         ost_prob_last    0.000077
3           rod_id_last    0.000074
6                rod_id    0.000074
15               telega    0.000035
19            ownertype    0.000020
16               tormoz    0.000011
0         ost_prob_mean    0.000009
18             norma_km    0.000007
2      manage_type_last    0.000005
4     reestr_state_last    0.000002
11          zavod_build    0.000002
5   ownership_type_last    0.000000


In [None]:
feature_importance = rf.feature_importances_

feature_importance_df = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": feature_importance}
)

feature_importance_df = feature_importance_df.sort_values(
    by="Importance", ascending=False
)

print(feature_importance_df)

In [22]:
import shap

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")