In [236]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge, LinearRegression

In [7]:
train_bonds_data = pd.read_csv("../api/.cache2/bonds.csv")

In [8]:
new_bonds = pd.read_csv('../api/.cache/bonds.csv')

In [9]:
prices = {figi: price for figi, price in new_bonds[["figi", "candle_price"]].values}

In [10]:
def validate_prices(old_price, new_price):
    return new_price - old_price

In [40]:
def transform_bonds_data(dataset: pd.DataFrame):
    new_dataset = dataset.drop(["ticker", "isin"], axis=1)
    new_dataset.maturity_date = pd.to_datetime(new_dataset.maturity_date, format="%d/%m/%Y")
    new_dataset.maturity_date = new_dataset.maturity_date - pd.to_datetime("today")
    new_dataset.maturity_date = new_dataset.maturity_date.apply(lambda x: x.days)
    new_dataset = new_dataset.loc[np.isin(new_dataset.figi, new_bonds.figi)]

    new_dataset["target"] = new_dataset[["figi", "candle_price"]].apply(
        lambda x:
        validate_prices(x.candle_price, prices[x.figi]), axis=1
    )
    return new_dataset.drop(
        ["figi", "country_of_risk_name", "price", "rub_price", "name"],
        axis=1
    ).dropna()

In [41]:
processed_data = transform_bonds_data(train_bonds_data)

In [61]:
def transform_and_merge(df):
    columns_to_encode = df.select_dtypes(include=['object']).columns
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    scaler = StandardScaler()

    encoded_features = encoder.fit_transform(df[columns_to_encode])
    encoded_columns = encoder.get_feature_names_out(columns_to_encode)
    encoded_df = pd.DataFrame(encoded_features, columns=encoded_columns, index=df.index)

    df_remaining = df.drop(columns=columns_to_encode)

    numeric_columns = df_remaining.select_dtypes(include=['int64', 'float64']).columns

    if not numeric_columns.empty:
        scaled_features = scaler.fit_transform(df_remaining[numeric_columns])
        scaled_df = pd.DataFrame(scaled_features, columns=numeric_columns, index=df.index)
    else:
        scaled_df = pd.DataFrame(index=df.index)

    result_df = pd.concat([scaled_df, encoded_df], axis=1)

    return result_df

In [75]:
train = transform_and_merge(
    processed_data
)
train["target"] = processed_data["target"]

In [76]:
X_train, X_test, y_train, y_test = train_test_split(
    train.drop("target", axis=1),
    train["target"],
    test_size=0.1,
    random_state=42
)

In [159]:
model = Ridge()

In [160]:
model.fit(X_train, y_train)

In [161]:
y_pred = model.predict(X_test)

In [162]:
r2_score(y_test, y_pred)

0.715348356910481

In [163]:
mean_squared_error(y_test, y_pred, squared=False)

13.339866462119074

In [164]:
y_pred_classes = np.sign(y_pred) == np.sign(y_test)

In [165]:
y_pred_classes.mean()

0.5555555555555556

In [166]:
print(classification_report(np.sign(y_test), np.sign(y_pred), zero_division=False))

              precision    recall  f1-score   support

        -1.0       0.69      0.62      0.66        72
         0.0       0.00      0.00      0.00         2
         1.0       0.35      0.44      0.39        34

    accuracy                           0.56       108
   macro avg       0.35      0.36      0.35       108
weighted avg       0.57      0.56      0.56       108



In [168]:
corred_columns = train.corr()[((abs(train.corr()["target"])) > 0.05)].index

In [169]:
X_train_corred = X_train[list(set(corred_columns) - {"target"})]

In [170]:
corr_model = Ridge()

In [171]:
corr_model.fit(X_train_corred, y_train)

In [172]:
y_pred_corr = corr_model.predict(X_test[list(X_train_corred.columns)])

In [173]:
r2_score(y_test, y_pred_corr)

0.687880344016166

In [237]:
mean_squared_error(y_test, y_pred_corr, squared=False)

13.968674400879747

In [175]:
y_pred_classes_corr = np.sign(y_pred_corr) == np.sign(y_test)

In [176]:
y_pred_classes_corr.mean()

0.5555555555555556

In [177]:
print(classification_report(np.sign(y_test), np.sign(y_pred_corr), zero_division=False))

              precision    recall  f1-score   support

        -1.0       0.70      0.61      0.65        72
         0.0       0.00      0.00      0.00         2
         1.0       0.36      0.47      0.41        34

    accuracy                           0.56       108
   macro avg       0.35      0.36      0.35       108
weighted avg       0.58      0.56      0.56       108



In [178]:
model.coef_

array([  1.30797571,   0.25477732,  -0.03677804,  -0.02580512,
        -0.62177028,  -0.34134388,   0.15088905,  -1.94760812,
         0.61719262,   0.56561267,   1.81283113,  -0.44784671,
         0.02813038,  -0.97300801,  -5.18480115,   0.50883255,
         0.53170553,   1.05502174,  -1.44925415,  -1.51307077,
        -9.73284365,   6.22298345,   5.73179405,   2.19924259,
         5.31610491,   0.41627778, -19.88640278,   2.61552037,
        -2.61552037,   0.04125366,   0.07519407,  -0.11644774,
        -3.39012109,   3.39012109,  -0.95007015,   1.25301381,
         0.31493105,   3.16761027,   2.41200216,  -0.48403667,
        -3.4165542 ,   1.92191319,  -0.54444481,   0.48833458,
        -1.6835277 ,  -1.1725896 ,  -1.30658192,   0.58689397,
        -0.58689397])

In [179]:
pd.DataFrame([X_train.columns, abs(model.coef_)]).T.sort_values(1, ascending=False)

Unnamed: 0,0,1
26,class_code_TQRD,19.886403
20,candle_price,9.732844
21,class_code_TQCB,6.222983
22,class_code_TQIR,5.731794
24,class_code_TQOB,5.316105
14,inverse_drawdown,5.184801
40,sector_it,3.416554
33,country_of_risk_RU,3.390121
32,country_of_risk_KZ,3.390121
37,sector_government,3.16761


In [180]:
pd.DataFrame([X_train_corred.columns, abs(corr_model.coef_)]).T.sort_values(1, ascending=False)

Unnamed: 0,0,1
14,candle_price,8.830526
16,inverse_drawdown,5.744127
0,sharpe_ratio,2.297417
12,sector_materials,2.11986
11,price_rating,1.673819
10,ratings,1.419766
9,exchange_MOEX,1.37121
5,coupon_quantity_per_year,1.348521
15,stability,0.77207
4,sortino_ratio,0.729386


In [238]:
rf = RandomForestRegressor()

In [239]:
rf.fit(X_train, y_train)

In [240]:
y_pred_rf = rf.predict(X_test)

In [241]:
r2_score(y_test, y_pred_rf)

0.5500191465241009

In [242]:
mean_squared_error(y_test, y_pred_rf, squared=False), mean_absolute_error(y_test, y_pred_rf)

(16.772262550256464, 5.222013953703703)

In [222]:
y_pred_classes_rf = np.sign(y_pred_rf) == np.sign(y_test)

In [223]:
y_pred_classes_rf.mean()

0.7592592592592593

In [224]:
print(classification_report(np.sign(y_test), np.sign(y_pred_rf), zero_division=False))

              precision    recall  f1-score   support

        -1.0       0.75      0.97      0.85        72
         0.0       0.00      0.00      0.00         2
         1.0       0.80      0.35      0.49        34

    accuracy                           0.76       108
   macro avg       0.52      0.44      0.45       108
weighted avg       0.75      0.76      0.72       108



In [225]:
pd.DataFrame([X_train.columns, rf.feature_importances_]).T.sort_values(1, ascending=False)

Unnamed: 0,0,1
20,candle_price,0.187086
14,inverse_drawdown,0.178977
18,price_rating,0.159346
19,ratings,0.10929
1,maturity_date,0.093996
5,aci_value,0.064612
11,company_rating,0.060935
6,issue_size,0.047164
7,issue_size_plan,0.035694
41,sector_materials,0.01182


In [226]:
rf_corr = RandomForestRegressor()

In [227]:
rf.fit(X_train_corred, y_train)

In [228]:
y_pred_rf_corr = rf.predict(X_test[list(X_train_corred.columns)])

In [229]:
r2_score(y_test, y_pred_rf_corr)

0.5259978608231077

In [230]:
mean_squared_error(y_test, y_pred_rf_corr, squared=False)

17.214118387704477

In [231]:
y_pred_classes_rf_corr = np.sign(y_pred_rf_corr) == np.sign(y_test)

In [232]:
y_pred_classes_rf_corr.mean()

0.7314814814814815

In [233]:
print(classification_report(np.sign(y_test), np.sign(y_pred_rf_corr), zero_division=False))

              precision    recall  f1-score   support

        -1.0       0.74      0.94      0.83        72
         0.0       0.00      0.00      0.00         2
         1.0       0.69      0.32      0.44        34

    accuracy                           0.73       108
   macro avg       0.48      0.42      0.42       108
weighted avg       0.71      0.73      0.69       108

