Skip to content

Commit

Permalink
Merge pull request #19 from Alex-Lekov/develop
Browse files Browse the repository at this point in the history
0.11.24
  • Loading branch information
Alex-Lekov committed Nov 23, 2020
2 parents d4198df + a0d97a0 commit 811c64a
Show file tree
Hide file tree
Showing 12 changed files with 1,928 additions and 287 deletions.
32 changes: 32 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// VS Code dev-container definition: builds the workspace container from the
// repository's own Dockerfile (one directory up), with no extra settings,
// extensions, ports, or mounts enabled by default.
{
"name": "Existing Dockerfile",

// Sets the run context to one level up instead of the .devcontainer folder.
"context": "..",

// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
"dockerFile": "../Dockerfile",

// Set *default* container specific settings.json values on container create.
// null leaves the integrated-terminal shell at the container's default.
"settings": {
"terminal.integrated.shell.linux": null
},

// Add the IDs of extensions you want installed when the container is created.
"extensions": []

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Uncomment the next line to run commands after the container is created - for example installing curl.
// "postCreateCommand": "apt-get update && apt-get install -y curl",

// Uncomment when using a ptrace-based debugger like C++, Go, and Rust
// "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ],

// Uncomment to use the Docker CLI from inside the container. See https://aka.ms/vscode-remote/samples/docker-from-docker.
// "mounts": [ "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" ],

// Uncomment to connect as a non-root user if you've added one. See https://aka.ms/vscode-remote/containers/non-root.
// "remoteUser": "vscode"
}
23 changes: 22 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,33 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [0.11.24]

### Add
- multivariate TPE sampler. This algorithm captures dependencies among hyperparameters better than the previous algorithm

### Fix
- "ValueError non-broadcastable output operand..." in AutoMLRegressor


## [0.10.07]

### Fix
- DataConversionWarning in sklearn_models model.fit(X_train, y_train,)


## [0.10.04]

### Fix
- verbose in LinearRegression


## [0.08.05]

### Fix
- if y_train is not pd.DataFrame


## [0.07.26]

### Add
Expand Down
2 changes: 1 addition & 1 deletion automl_alex/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.08.05"
__version__ = "0.11.24"
173 changes: 93 additions & 80 deletions automl_alex/automl_alex.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .models import *
from .databunch import DataBunch
from .encoders import *
from sklearn.preprocessing import StandardScaler


##################################### BestSingleModel ################################################
Expand Down Expand Up @@ -477,95 +478,107 @@ def opt(self,

###############################################################
# STEP 3
self.stack_models_predicts = pd.concat([predicts_1, predicts_2], ignore_index=True, sort=False)
self.stack_models_cfgs = pd.concat([stack_models_1_cfgs, stack_model_2_cfgs], ignore_index=True, sort=False)

score_mean_stack_models = self.metric(self._data.y_train, self.stack_models_predicts['predict_train'].mean())
if verbose > 0:
print(f'\n Mean Models {self.metric.__name__} Score Train: ', \
round(score_mean_stack_models, self._metric_round))
print('_'*50)
print('Step 2: Stacking')
print('_'*50)
time.sleep(0.1) # clean print
self.stack_models_predicts = pd.concat([predicts_1, predicts_2], ignore_index=True, sort=False)
self.stack_models_cfgs = pd.concat([stack_models_1_cfgs, stack_model_2_cfgs], ignore_index=True, sort=False)

# Stacking
X_train_predicts = pd.DataFrame([*self.stack_models_predicts['predict_train']]).T
X_train_predicts.columns = self.stack_models_predicts.model_name.values
X_test_predicts = pd.DataFrame([*self.stack_models_predicts['predict_test']],).T
X_test_predicts.columns = self.stack_models_predicts.model_name.values

self._data.X_train = X_train_predicts.reset_index(drop=True)
self._data.X_test = X_test_predicts.reset_index(drop=True)
self._data.y_train = self._data.y_train.reset_index(drop=True)

print('New X_train: ', self._data.X_train.shape,
' y_train: ', self._data.y_train.shape,
'| X_test shape: ', self._data.X_test.shape)
score_mean_stack_models = self.metric(self._data.y_train, self.stack_models_predicts['predict_train'].mean())
if verbose > 0:
print('-'*50)
print(f'\n Blend Models {self.metric.__name__} Score Train: ', \
round(score_mean_stack_models, self._metric_round))
print('_'*50)
print('Step 2: Stacking')
print('_'*50)
time.sleep(0.1) # clean print

if self.type_of_estimator == 'regression':
pred_test = (self.stack_models_predicts['predict_test'].mean() * 0.7) + \
(self.predicts_model_1_full_x['predict_test'].mean() * 0.2) + \
(self.predicts_model_0_full_x['predict_test'].mean() * 0.1)

pred_train = self.stack_models_predicts['predict_train'].mean()

self.history_trials = []
self.history_trials_dataframe = pd.DataFrame()

stack_model_1 = BestSingleModel(databunch=self._data,
cv=10,
score_cv_folds = 10,
metric=self.metric,
direction=self.direction,
metric_round=self._metric_round,
combined_score_opt=False,
gpu=self._gpu,
random_state=self._random_state,
type_of_estimator=self.type_of_estimator,)
else:
# Stacking
X_train_predicts = pd.DataFrame([*self.stack_models_predicts['predict_train']]).T
X_train_predicts.columns = self.stack_models_predicts.model_name.values
X_test_predicts = pd.DataFrame([*self.stack_models_predicts['predict_test']],).T
X_test_predicts.columns = self.stack_models_predicts.model_name.values

# Opt
history_stack_model = stack_model_1.opt(
iterations=150,
#timeout=100,
opt_lvl=3,
auto_parameters=False,
cold_start=25,
feature_selection=True,
models_names=['LinearModel',],
verbose= (lambda x: 0 if x <= 1 else 1)(verbose), )
scaler = StandardScaler()

self._data.X_train = pd.DataFrame(scaler.fit_transform(X_train_predicts.reset_index(drop=True)))
self._data.X_test = pd.DataFrame(scaler.transform(X_test_predicts.reset_index(drop=True)))
self._data.y_train = self._data.y_train.reset_index(drop=True)

print('New X_train: ', self._data.X_train.shape,
' y_train: ', self._data.y_train.shape,
'| X_test shape: ', self._data.X_test.shape)

# Predict
history_stack_model = history_stack_model.drop_duplicates(subset=['model_score', 'score_std'], keep='last')
predict_stack_model = stack_model_1.predict(models_cfgs=history_stack_model.head(2))

# Score:
score_final_stack_model = self.metric(self._data.y_train, predict_stack_model['predict_train'].mean())
if verbose > 0:
print(f'\n Stacking model {self.metric.__name__} Score Train: ',
round(score_final_stack_model, self._metric_round))
time.sleep(0.1) # clean print
self.history_trials = []
self.history_trials_dataframe = pd.DataFrame()

pred_test = predict_stack_model['predict_test'].mean()
pred_train = predict_stack_model['predict_train'].mean()

if self.direction == 'maximize':
if score_mean_stack_models >= score_final_stack_model and score_mean_stack_models >= score_mean_models_1:
pred_test = self.stack_models_predicts['predict_test'].mean()
pred_train = self.stack_models_predicts['predict_train'].mean()
if score_mean_models_1 >= score_final_stack_model and score_mean_models_1 >= score_mean_stack_models:
pred_test = predicts_1['predict_test'].mean()
pred_train = predicts_1['predict_train'].mean()
else:
if score_mean_stack_models <= score_final_stack_model and score_mean_stack_models <= score_mean_models_1:
pred_test = self.stack_models_predicts['predict_test'].mean()
pred_train = self.stack_models_predicts['predict_train'].mean()
if score_mean_models_1 <= score_final_stack_model and score_mean_models_1 <= score_mean_stack_models:
pred_test = predicts_1['predict_test'].mean()
pred_train = predicts_1['predict_train'].mean()
stack_model_1 = BestSingleModel(databunch=self._data,
cv=10,
score_cv_folds = 10,
metric=self.metric,
direction=self.direction,
metric_round=self._metric_round,
combined_score_opt=False,
gpu=self._gpu,
random_state=self._random_state,
type_of_estimator=self.type_of_estimator,
clean_and_encod_data=False,)

# Opt
history_stack_model = stack_model_1.opt(
iterations=150,
#timeout=100,
opt_lvl=3,
auto_parameters=False,
cold_start=25,
feature_selection=False,
models_names=['LinearModel', 'MLP',],
verbose= (lambda x: 0 if x <= 1 else 1)(verbose), )

# Predict
history_stack_model = history_stack_model.drop_duplicates(subset=['model_score', 'score_std'], keep='last')
predict_stack_model = stack_model_1.predict(models_cfgs=history_stack_model.head(2), databunch=self._data,)

# Score:
score_final_stack_model = self.metric(self._data.y_train, predict_stack_model['predict_train'].mean())
if verbose > 0:
print(f'\n Stacking model {self.metric.__name__} Score Train: ',
round(score_final_stack_model, self._metric_round))
time.sleep(0.1) # clean print

pred_test = predict_stack_model['predict_test'].mean()
pred_train = predict_stack_model['predict_train'].mean()

if self.direction == 'maximize':
if score_mean_stack_models >= score_final_stack_model and score_mean_stack_models >= score_mean_models_1:
pred_test = self.stack_models_predicts['predict_test'].mean()
pred_train = self.stack_models_predicts['predict_train'].mean()
if score_mean_models_1 >= score_final_stack_model and score_mean_models_1 >= score_mean_stack_models:
pred_test = predicts_1['predict_test'].mean()
pred_train = predicts_1['predict_train'].mean()
else:
if score_mean_stack_models <= score_final_stack_model and score_mean_stack_models <= score_mean_models_1:
pred_test = self.stack_models_predicts['predict_test'].mean()
pred_train = self.stack_models_predicts['predict_train'].mean()
if score_mean_models_1 <= score_final_stack_model and score_mean_models_1 <= score_mean_stack_models:
pred_test = predicts_1['predict_test'].mean()
pred_train = predicts_1['predict_train'].mean()

pred_test = (pred_test * 0.7) + \
(self.predicts_model_1_full_x['predict_test'].mean() * 0.2) + \
(self.predicts_model_0_full_x['predict_test'].mean() * 0.1)

final_score_model = self.metric(self._data.y_train, pred_train)
# print score
if verbose > 0:
final_score_model = self.metric(self._data.y_train, pred_train)
print(f'\n Final Model {self.metric.__name__} Score Train: ', \
round(final_score_model, self._metric_round))
time.sleep(0.1) # clean print

pred_test = (pred_test * 0.7) + \
(self.predicts_model_1_full_x['predict_test'].mean() * 0.2) + \
(self.predicts_model_0_full_x['predict_test'].mean() * 0.1)

return (pred_test, pred_train)

Expand Down
17 changes: 9 additions & 8 deletions automl_alex/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,13 +445,14 @@ def objective(trial, fast_check=True):
self._tqdm_opt_print(pbar)
return score_opt

sampler=optuna.samplers.TPESampler(consider_prior=True,
prior_weight=1.0,
consider_magic_clip=True,
consider_endpoints=False,
sampler=optuna.samplers.TPESampler(#consider_prior=True,
#prior_weight=1.0,
#consider_magic_clip=True,
#consider_endpoints=False,
n_startup_trials=self._cold_start,
n_ei_candidates=50,
seed=self._random_state)
#n_ei_candidates=50,
seed=self._random_state,
multivariate=True,)
if self.study is None:
self.study = optuna.create_study(direction=self.direction, sampler=sampler,)

Expand Down Expand Up @@ -680,8 +681,8 @@ def cross_val(self,
)

folds_scores = []
stacking_y_pred_train = np.zeros(X.shape[0])
stacking_y_pred_test = np.zeros(X_test.shape[0])
stacking_y_pred_train = np.zeros(len(X))
stacking_y_pred_test = np.zeros(len(X_test))
feature_importance_df = pd.DataFrame(np.zeros(len(X.columns)), index=X.columns)

for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
Expand Down
10 changes: 6 additions & 4 deletions automl_alex/models/sklearn_models.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from sklearn import ensemble, neural_network, linear_model, svm, neighbors
from .base import *
import numpy as np

from warnings import simplefilter, filterwarnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
simplefilter("ignore", category=ConvergenceWarning)
filterwarnings("ignore", category=ConvergenceWarning, message="^Maximum number of iteration reached")
filterwarnings("ignore", category=ConvergenceWarning, message="^Liblinear failed to converge")
simplefilter("ignore", category=DataConversionWarning)


################################## LogRegClassifier ##########################################################
################################## LinearModel ##########################################################

class LinearModel(ModelBase):
"""
Expand All @@ -22,7 +24,7 @@ def _init_default_model_param(self,):
"""
Default model_param
"""
model_param = {'verbose':0,}
model_param = {}
return(model_param)

def _init_model(self, model_param=None):
Expand Down Expand Up @@ -80,7 +82,7 @@ def _fit(self, model=None, X_train=None, y_train=None, X_test=None, y_test=None,
model = self
if (X_train is None) or (y_train is None):
X_train = model._data.X_train
y_train = model._data.y_train
y_train = np.array(model._data.y_train.values.ravel())

model.model = model._init_model(model_param=model.model_param)
model.model.fit(X_train, y_train,)
Expand Down

0 comments on commit 811c64a

Please sign in to comment.