In [42]:
import mlflow
import pandas as pd
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [1]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("my-fakenews-exp")

2023/08/03 10:36:07 INFO mlflow.tracking.fluent: Experiment with name 'my-fakenews-exp' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/1', creation_time=1691048167168, experiment_id='1', last_update_time=1691048167168, lifecycle_stage='active', name='my-fakenews-exp', tags={}>

In [20]:
import pandas as pd
import numpy as np
import io

df = pd.read_csv("../data_embeddings.csv",index_col=0)

In [22]:
def convert(item):
    item = item.strip()  # remove spaces at the end
    item = item[1:-1]    # remove `[ ]`
    item = np.fromstring(item, sep=' ')  # convert string to `numpy.array`
    return item

In [23]:
df['vector'] = df['vector'].apply(convert)

In [26]:
df = shuffle(df)

In [39]:
X = df['vector']
y = df['label']

In [44]:
X = np.stack(X)
y = np.stack(y)

In [45]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [46]:
X_train

array([[-1.68420076,  1.45088935,  0.61488074, ..., -2.21393132,
        -2.52793598,  3.36729574],
       [ 1.85769737, -0.08555415, -0.85656333, ..., -0.02918896,
        -2.04332209,  0.4353711 ],
       [ 2.41446924,  1.38886416,  1.1026305 , ...,  0.78477639,
        -4.37773943, -0.21611169],
       ...,
       [ 2.6249728 ,  1.3768976 , -0.15828745, ...,  0.68418068,
        -1.7570008 , -1.0424373 ],
       [-0.7922892 ,  2.16502476,  0.17109841, ..., -2.42561841,
        -2.25083399,  0.85438538],
       [ 1.0791308 ,  3.4378664 , -2.8512676 , ...,  0.86695004,
        -0.85227174,  0.8357892 ]])

In [48]:
with mlflow.start_run():
    mlflow.set_tag("data_scientist", "Rollan")

    mlflow.log_param("data_path", "../data_embeddings.csv")

    C = 0.03
    mlflow.log_param("regularization", C)

    lr = LogisticRegression(C=C)
    lr.fit(X_train, y_train)
 
    y_pred = lr.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mlflow.log_metric("accuracy", rmse)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials  #helps to optimize hyperparameters
from hyperopt.pyll import scope

In [52]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_test, label=y_test)

In [59]:
def obj(params):

    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=30,
            evals=[(valid, "test")],
            early_stopping_rounds=20
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)
    
    return {"loss":rmse, "status":STATUS_OK}


In [60]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 30, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

best_result = fmin(
    fn=obj,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials()
)

[0]	test-rmse:0.47174                                 
[1]	test-rmse:0.44533                                 
[2]	test-rmse:0.42049                                 
[3]	test-rmse:0.39704                                 
[4]	test-rmse:0.37532                                 
[5]	test-rmse:0.35481                                 
[6]	test-rmse:0.33586                                 
[7]	test-rmse:0.31798                                 
[8]	test-rmse:0.30134                                 
[9]	test-rmse:0.28566                                 
[10]	test-rmse:0.27111                                
[11]	test-rmse:0.25742                                
[12]	test-rmse:0.24464                                
[13]	test-rmse:0.23285                                
[14]	test-rmse:0.22174                                
[15]	test-rmse:0.21140                                
[16]	test-rmse:0.20189                                
[17]	test-rmse:0.19291                                
[18]	test-

In [61]:
params = {
    'learning_rate': 0.13998783607276,
    'max_depth': 28,
    'min_child_weight':	0.8037214370553903,
    'objective': 'reg:linear',
    'reg_alpha': 0.007917567259199893,
    'reg_lambda': 0.00982476912100121,
    'seed': 42
}


mlflow.xgboost.autolog()
booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=30,
            evals=[(valid, "test")],
            early_stopping_rounds=20
        )

2023/08/03 17:28:13 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd93c183ac28f4fac8d27ef526b0955fc', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	test-rmse:0.43365
[1]	test-rmse:0.37732
[2]	test-rmse:0.32988
[3]	test-rmse:0.29038
[4]	test-rmse:0.25754
[5]	test-rmse:0.23008
[6]	test-rmse:0.20772
[7]	test-rmse:0.18989
[8]	test-rmse:0.17535
[9]	test-rmse:0.16406
[10]	test-rmse:0.15544
[11]	test-rmse:0.14883
[12]	test-rmse:0.14395
[13]	test-rmse:0.14030
[14]	test-rmse:0.13739
[15]	test-rmse:0.13541
[16]	test-rmse:0.13378
[17]	test-rmse:0.13280
[18]	test-rmse:0.13198
[19]	test-rmse:0.13137
[20]	test-rmse:0.13087
[21]	test-rmse:0.13045
[22]	test-rmse:0.13012
[23]	test-rmse:0.12989
[24]	test-rmse:0.12970
[25]	test-rmse:0.12955
[26]	test-rmse:0.12943
[27]	test-rmse:0.12935
[28]	test-rmse:0.12927
[29]	test-rmse:0.12920


