In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import joblib
import warnings
from scipy.stats import uniform

from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso

from custom_modules.parameters_search import RandomSearch

# suppress warnings
warnings.filterwarnings('ignore')

In [None]:
# embeddings dataset
df = pd.read_excel(r"data\df_embs_tiny.xlsx").iloc[:,1:]

# initial dataset with score labels
df_init = pd.read_excel(r"data\generated_and_bad_news.xlsx")

In [None]:
# set the weights for the original and augmented news items
def weights_for_generated(x, penalty):
    if x==1:
        y = penalty
    else:
        y = 1
    return y

In [None]:
# add the target variable and set the weights for the news items
weigth_penalty = df_init['is_generated'].apply(lambda x: weights_for_generated(x, 0.75)) # set the weight for the generated data to 0.75
impact = df_init['impact']
news_type = df_init['type']
df.insert(0,'type', news_type)
df.head(3)

Unnamed: 0,type,0,1,2,3,4,5,6,7,8,...,302,303,304,305,306,307,308,309,310,311
0,0,0.030118,-0.062606,-0.072773,-0.080754,-0.040619,0.003744,-0.033926,-0.056159,-0.019813,...,-0.024541,0.038457,0.0243,0.020128,0.023386,-0.039331,-0.026465,-0.054423,0.049977,-0.004566
1,0,0.044143,-0.010936,0.027072,-0.000567,0.025038,0.010279,-0.038225,-0.064649,-0.002913,...,0.069833,0.073745,-0.00404,0.023638,0.004948,0.000697,-0.046012,0.063804,0.019733,-0.015541
2,0,0.010953,-0.098086,-0.030484,-0.022409,-0.026681,0.033925,-0.022558,-0.018889,-0.019922,...,-0.000105,0.014918,0.028374,-0.009994,0.032575,-0.035325,0.013958,0.023161,0.05299,-0.039242


In [None]:
# split into test and validation sets w.r.t. stratification
X_train, X_test, y_train, y_test, weigth_penalty_train, weigth_penalty_test = train_test_split(df, impact, weigth_penalty, 
                                                    test_size=0.25,
                                                    random_state=0,
                                                    stratify = news_type)


In [None]:
# check the correctness of stratification in the splits (class distribution in percentages)
df['type'].value_counts(normalize=True)*100
X_train['type'].value_counts(normalize=True)*100
X_test['type'].value_counts(normalize=True)*100

X_train = X_train.drop('type', axis =1)
X_test = X_test.drop('type', axis =1)

# SVR

In [None]:
# model
model = SVR(cache_size=500)

hyperparameters = dict(kernel = ['linear','rbf', 'poly'],
                       degree = [3, 4, 5],
                       C = uniform(loc=0.1, scale=1), 
                       epsilon = uniform(loc=0.01, scale=0.1)
                       )
print('hyperparameters:', hyperparameters)

SVR_RandomSearch = RandomSearch(X_train, y_train, model, hyperparameters, n_iter=100, sample_weight=weigth_penalty_train)
Prediction_SVR = SVR_RandomSearch.model.predict(X_test)

hyperparameters: {'kernel': ['linear', 'rbf', 'poly'], 'degree': [3, 4, 5], 'C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x00000256CB473C90>, 'epsilon': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x00000256C6A9D390>}
Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END C=0.517022004702574, degree=3, epsilon=0.10325573593386587, kernel=rbf; total time=   0.5s
[CV] END C=0.517022004702574, degree=3, epsilon=0.10325573593386587, kernel=rbf; total time=   0.4s
[CV] END C=0.517022004702574, degree=3, epsilon=0.10325573593386587, kernel=rbf; total time=   0.4s
[CV] END C=0.517022004702574, degree=3, epsilon=0.10325573593386587, kernel=rbf; total time=   0.4s
[CV] END C=0.517022004702574, degree=3, epsilon=0.10325573593386587, kernel=rbf; total time=   0.4s
[CV] END C=0.40233257263183975, degree=3, epsilon=0.01923385947687978, kernel=linear; total time=   0.9s
[CV] END C=0.40233257263183975, degree=3, epsilon=0.01923385947687

['regress_svr.pkl']

In [33]:
best_param_svr = SVR_RandomSearch.model.best_params_
best_param_svr

{'C': 0.8554630526024664,
 'degree': 3,
 'epsilon': 0.014930424475315776,
 'kernel': 'poly'}

# KNN

In [None]:
# model
model = KNeighborsRegressor()
hyperparameters = dict(weights = ['distance'], 
                       metric = ['cosine'],
                       algorithm = ['brute'],
                       n_neighbors = list(range(10,15)),
                       leaf_size = range(1,20)
                       )

print('hyperparameters:', hyperparameters)
KNN_RandomSearch = RandomSearch(X_train, y_train, model, hyperparameters, n_iter = 100)
Prediction_KNN = KNN_RandomSearch.model.predict(X_test)

hyperparameters: {'weights': ['distance'], 'metric': ['cosine'], 'algorithm': ['brute'], 'n_neighbors': [10, 11, 12, 13, 14], 'leaf_size': range(1, 20)}
Fitting 5 folds for each of 95 candidates, totalling 475 fits
[CV] END algorithm=brute, leaf_size=1, metric=cosine, n_neighbors=10, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=1, metric=cosine, n_neighbors=10, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=1, metric=cosine, n_neighbors=10, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=1, metric=cosine, n_neighbors=10, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=1, metric=cosine, n_neighbors=10, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=1, metric=cosine, n_neighbors=11, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=1, metric=cosine, n_neighbors=11, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=

['regress_knn_10n.pkl']

In [34]:
best_param_knn = KNN_RandomSearch.model.best_params_
best_param_knn

{'weights': 'distance',
 'n_neighbors': 10,
 'metric': 'cosine',
 'leaf_size': 1,
 'algorithm': 'brute'}

# Lasso

In [None]:
# model
model = Lasso()
hyperparameters = dict(alpha = uniform(loc=0.1, scale=2))

print('hyperparameters:', hyperparameters)
Lasso_RandomSearch = RandomSearch(X_train, y_train, model, hyperparameters)
Prediction_Lasso = Lasso_RandomSearch.model.predict(X_test)

mean_absolute_error(Prediction_Lasso,y_test)

hyperparameters: {'alpha': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001E521128E80>}
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ............................alpha=0.934044009405148; total time=   0.0s
[CV] END ............................alpha=0.934044009405148; total time=   0.0s
[CV] END ............................alpha=0.934044009405148; total time=   0.0s
[CV] END ............................alpha=0.934044009405148; total time=   0.0s
[CV] END ............................alpha=0.934044009405148; total time=   0.0s
[CV] END ...........................alpha=1.5406489868843163; total time=   0.0s
[CV] END ...........................alpha=1.5406489868843163; total time=   0.0s
[CV] END ...........................alpha=1.5406489868843163; total time=   0.0s
[CV] END ...........................alpha=1.5406489868843163; total time=   0.0s
[CV] END ...........................alpha=1.5406489868843163; total time=   0.0s
[CV] END .....

0.34772938608976184

# Stacking of regressions

In [None]:
# outliers search
lof = LocalOutlierFactor()
labels = lof.fit_predict(X_train, y_train)

np.where(labels == -1)

(array([], dtype=int64),)

In [36]:
best_param_svr
best_param_knn

{'weights': 'distance',
 'n_neighbors': 10,
 'metric': 'cosine',
 'leaf_size': 1,
 'algorithm': 'brute'}

In [None]:
# Declare models
knn_best = KNeighborsRegressor(**best_param_knn)
svr_best = SVR(cache_size=500, **best_param_svr)
#rf_params = {'max_depth': 50, 'n_estimators': 200, 'min_samples_split': 3, 'min_samples_leaf': 3, 'criterion': 'squared_error'} 
#rf = RandomForestRegressor(**rf_params)
svr_linear = SVR(kernel='linear')

# Create stacking models
estimators = [
    ('knn', knn_best),
    #('random forest', rf),
    ('svr', svr_linear)
]

stacking_model = StackingRegressor(estimators=estimators, final_estimator=svr_best)

# Defining parameters for search
param_grid = {
    'knn__n_neighbors': [10,11],
    #'rf': [],
    'svr__C': [0.5, 1, 1.2],
    'svr__epsilon': [0.01, 0.015, 0.02]
}

# GridSearchCV
grid_search = GridSearchCV(stacking_model, param_grid, scoring='neg_mean_absolute_error', cv=5)
grid_search.fit(X_train, y_train)

# Best params
best_params = grid_search.best_params_
print(f'Лучшие параметры: {best_params}')

# Best model
best_model = grid_search.best_estimator_

# Prediction on the test set
y_pred = best_model.predict(X_test)

# Model estimation
mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')


Лучшие параметры: {'knn__n_neighbors': 11, 'svr__C': 0.5, 'svr__epsilon': 0.02}
MAE: 0.11363978408502405


### Train on the entire dataset

In [40]:
df_full = df.copy().iloc[:,1:]
df_full

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,302,303,304,305,306,307,308,309,310,311
0,0.030118,-0.062606,-0.072773,-0.080754,-0.040619,0.003744,-0.033926,-0.056159,-0.019813,0.029078,...,-0.024541,0.038457,0.024300,0.020128,0.023386,-0.039331,-0.026465,-0.054423,0.049977,-0.004566
1,0.044143,-0.010936,0.027072,-0.000567,0.025038,0.010279,-0.038225,-0.064649,-0.002913,0.009922,...,0.069833,0.073745,-0.004040,0.023638,0.004948,0.000697,-0.046012,0.063804,0.019733,-0.015541
2,0.010953,-0.098086,-0.030484,-0.022409,-0.026681,0.033925,-0.022558,-0.018889,-0.019922,0.102335,...,-0.000105,0.014918,0.028374,-0.009994,0.032575,-0.035325,0.013958,0.023161,0.052990,-0.039242
3,0.116889,-0.033393,-0.021454,-0.031841,-0.013085,-0.014946,-0.033964,-0.092080,-0.039061,0.027411,...,-0.030880,-0.012885,-0.020651,0.038166,-0.021528,0.001962,0.039432,0.005102,0.066790,0.038240
4,-0.038224,-0.104240,-0.027412,-0.077668,-0.038780,0.058891,-0.001868,-0.017486,-0.007074,0.038099,...,-0.027721,0.063511,-0.021798,0.012891,0.041124,-0.053688,-0.017922,0.062374,0.014723,-0.061769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4676,0.032339,-0.009164,-0.021189,-0.118866,0.004039,0.027564,-0.066725,-0.043910,0.034784,0.032176,...,0.103271,-0.045889,-0.045334,0.056114,0.020342,-0.011302,0.000263,0.025450,-0.024255,-0.127303
4677,0.050123,-0.014016,-0.027642,-0.073822,0.013764,0.044315,-0.034592,-0.021134,-0.060452,0.056930,...,0.032562,-0.027872,0.006586,0.057930,0.028743,-0.008503,-0.068848,-0.014838,0.010346,-0.085390
4678,-0.023709,-0.065159,-0.077550,-0.070221,0.021685,0.004570,-0.045600,-0.051468,0.020458,-0.014099,...,0.066425,0.008709,-0.038505,0.036229,0.004872,-0.014917,0.030409,-0.055730,0.010802,-0.104871
4679,0.047283,-0.016545,-0.031343,-0.071726,0.000178,0.047561,-0.036022,-0.029623,-0.049365,0.048976,...,0.022659,-0.016612,0.015457,0.060153,0.032717,0.004172,-0.039974,-0.014050,0.002892,-0.089768


In [None]:
# svr
model = SVR(cache_size=500)

hyperparameters = dict(kernel = ['rbf'],
                       C = uniform(loc=0.1, scale=1), 
                       epsilon = uniform(loc=0.01, scale=0.03)
                       )
print('hyperparameters:', hyperparameters)
SVR_RandomSearch = RandomSearch(df_full, impact, model, hyperparameters, n_iter=50)
Prediction_SVR = SVR_RandomSearch.model.predict(df_full)

# save model
#joblib.dump(SVR_RandomSearch.model, 'regress_svr.pkl')

In [None]:
# knn
model = KNeighborsRegressor()
hyperparameters = dict(weights = ['distance'], 
                       metric = ['minkowski', 'cosine'], 
                       algorithm = ['brute'],
                       p = [2,3],
                       n_neighbors = list(range(10,13)),
                       leaf_size = [1, 10, 20]
                       )

print('hyperparameters:', hyperparameters)
KNN_RandomSearch = RandomSearch(df_full, impact, model, hyperparameters, n_iter = 40)
Prediction_KNN = KNN_RandomSearch.model.predict(df_full)

# save model
#joblib.dump(KNN_RandomSearch.model, 'regress_knn.pkl')

hyperparameters: {'weights': ['distance'], 'metric': ['minkowski', 'cosine'], 'algorithm': ['brute'], 'p': [2, 3], 'n_neighbors': [10, 11, 12], 'leaf_size': [1, 10, 20]}
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END algorithm=brute, leaf_size=1, metric=minkowski, n_neighbors=10, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=1, metric=minkowski, n_neighbors=10, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=1, metric=minkowski, n_neighbors=10, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=1, metric=minkowski, n_neighbors=10, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=1, metric=minkowski, n_neighbors=10, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=1, metric=minkowski, n_neighbors=10, p=3, weights=distance; total time=  38.0s
[CV] END algorithm=brute, leaf_size=1, metric=minkowski, n_neighbors=10, p=3, 

In [None]:
# Define models
knn = KNeighborsRegressor(**best_param_knn)
svr_linear = SVR(kernel='linear', C =0.5, epsilon= 0.02)
svr_rbf = SVR(cache_size=500, **best_param_svr)
rf_params = {'max_depth': 10, 'n_estimators': 200, 'min_samples_split': 3, 'min_samples_leaf': 3, 'criterion': 'squared_error'} 
rf = RandomForestRegressor(**rf_params)

# Stacking of regressions
estimators = [
    ('knn', knn),
    ('random forest', rf),
    ('svr', svr_linear),
]

stacking_model = StackingRegressor(estimators=estimators, final_estimator=svr_rbf)
stacking_model_fitted = stacking_model.fit(df_full, impact)

In [None]:
preds = stacking_model_fitted.predict(df_full)
mean_absolute_error(preds, impact)

0.10989070139382384

In [None]:
# Save model
#joblib.dump(best_model, 'StackingReg.pkl')

['StackingReg.pkl']