In [1]:
import mlflow

In [19]:
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
import joblib
# Configure the MLflow tracking URI.
# NOTE(review): the original note says to use the MLflow service name when running
# inside Docker, but the value below points at localhost — confirm which one applies.
mlflow_tracking_uri = "http://localhost:5001"
mlflow.set_tracking_uri(mlflow_tracking_uri)

# Confirm which tracking URI MLflow actually picked up
print(f"🔥 MLflow Tracking URI: {mlflow.get_tracking_uri()}")

# Create the MLflow client
client = MlflowClient()

# Resolve the version of the registered model "Random_forest" that the
# "Staging" alias currently points to
model_version_info = client.get_model_version_by_alias("Random_forest", "Staging")
model_version = model_version_info.version

# Build the registry URI used to download that exact version
model_uri = f"models:/Random_forest/{model_version}"

# Load the registered artifact
model = mlflow.sklearn.load_model(model_uri)
print(model)

# The registered artifact may be a path string pointing at a pickled estimator
# rather than the estimator itself. Only in that case is a second joblib load
# meaningful; formatting an estimator into a path would build a bogus file name.
# SECURITY: joblib.load unpickles arbitrary code — only load trusted files.
if isinstance(model, str):
    model = joblib.load(f".{model}")
    print(model)

print(f"✅ Model 'Random_forest' version {model_version} loaded successfully!")


🔥 MLflow Tracking URI: http://localhost:5001


Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 50.82it/s]

./src/weight/random_forest.pkl





RandomForestClassifier(min_samples_leaf=300, n_estimators=200, n_jobs=-1,
                       random_state=67)
✅ Model 'Random_forest' version 1 loaded successfully!


SyntaxError: invalid syntax (121980175.py, line 1)

In [None]:
import numpy as np
import pandas as pd
from itertools import groupby
from sklearn.model_selection import train_test_split
from pandas.api.types import is_datetime64_ns_dtype

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from imblearn.under_sampling import RandomUnderSampler
from joblib import Parallel, delayed
import gc

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the generated test CSV and write the same table back out as Parquet
# next to it, so it can be loaded with pd.read_parquet.
test = pd.read_csv("../data/raw/Generated_Test_Data.csv")
test.to_parquet("../data/raw/Generated_Test_Data.parquet")

In [None]:
# Display the `test` DataFrame as the cell output
test

In [None]:
# Load the raw training series from Parquet
train = pd.read_parquet("../data/raw/train_series.parquet")

In [None]:
# Spot check: keep only the rows whose `step` equals 592375 and print them
train_filtered = train[train["step"] == 592375]
print(train_filtered)


In [None]:
def feat_eng(df):
    """Build time-based rolling-window features for one series.

    The input must provide the columns ``series_id``, ``timestamp``, ``enmo``
    and ``anglez``; any other columns are carried through unchanged.

    Steps, in order:

    * ``series_id`` is cast to ``category``; ``timestamp`` is parsed and every
      value has its timezone removed with ``tz_localize(None)``; ``hour`` is
      taken from the resulting timestamp.
    * Rows are sorted by ``timestamp``, which becomes the index so that the
      rolling windows below can be expressed in seconds.
    * ``lids``: ``enmo - 0.02`` clipped at 0, centered 600 s rolling sum,
      transformed as ``100 / (sum + 1)``, then a centered 1800 s rolling mean.
    * ``enmo`` is multiplied by 1000 and cast to ``int16``; ``anglez`` is cast
      to ``int16``; ``anglezdiffabs`` is the absolute first difference of
      ``anglez``.
    * For each of ``enmo``, ``anglez``, ``anglezdiffabs`` and each period
      ``n`` in 60, 360, 720, 3600: centered rolling windows of ``n + 5``
      seconds (at least 10 observations) produce median / mean / max / min /
      var, the amplitude (max - min) and its rolling min, and the rolling
      max / mean of the absolute difference of the rolling max. A rolling
      median absolute deviation is added for the largest period only.
    * ``timestamp`` is restored as a column and every row that contains a NaN
      is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single series. It is not modified; the function works on a
        copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the feature columns added. Its index is not reset
        after the NaN rows are dropped, so it can contain gaps.

    Notes
    -----
    NOTE(review): the ``int16`` casts assume ``enmo * 1000`` and ``anglez``
    fit in the int16 range and contain no missing values — confirm against
    the data.
    """
    # Work on a copy so the caller's frame is never mutated
    df = df.copy()

    df['series_id'] = df['series_id'].astype('category')
    # Element-wise tz_localize(None) removes the timezone from every value
    df['timestamp'] = pd.to_datetime(df['timestamp']).apply(lambda t: t.tz_localize(None))
    df['hour'] = df["timestamp"].dt.hour

    # Offset-based ('...s') rolling windows need a sorted datetime index
    df.sort_values(['timestamp'], inplace=True)
    df.set_index('timestamp', inplace=True)

    df['lids'] = np.maximum(0., df['enmo'] - 0.02)
    df['lids'] = df['lids'].rolling(f'{120*5}s', center=True, min_periods=1).agg('sum')
    df['lids'] = 100 / (df['lids'] + 1)
    df['lids'] = df['lids'].rolling(f'{360*5}s', center=True, min_periods=1).agg('mean').astype(np.float32)

    df["enmo"] = (df["enmo"]*1000).astype(np.int16)
    df["anglez"] = df["anglez"].astype(np.int16)
    df["anglezdiffabs"] = df["anglez"].diff().abs().astype(np.float32)

    # Window lengths in seconds; the MAD feature is built for the longest only
    periods = [60, 360, 720, 3600]
    longest_period = max(periods)

    for col in ['enmo', 'anglez', 'anglezdiffabs']:

        for n in periods:

            rol_args = {'window': f'{n+5}s', 'min_periods': 10, 'center': True}

            for agg in ['median', 'mean', 'max', 'min', 'var']:
                df[f'{col}_{agg}_{n}'] = df[col].rolling(**rol_args).agg(agg).astype(np.float32).values

            if n == longest_period:
                df[f'{col}_mad_{n}'] = (df[col] - df[f'{col}_median_{n}']).abs().rolling(**rol_args).median().astype(np.float32)

            df[f'{col}_amplit_{n}'] = df[f'{col}_max_{n}'] - df[f'{col}_min_{n}']
            df[f'{col}_amplit_{n}_min'] = df[f'{col}_amplit_{n}'].rolling(**rol_args).min().astype(np.float32).values

            # Absolute step-to-step change of the rolling max, shared by the two features below
            max_abs_diff = df[f'{col}_max_{n}'].diff().abs()
            df[f'{col}_diff_{n}_max'] = max_abs_diff.rolling(**rol_args).max().astype(np.float32)
            df[f'{col}_diff_{n}_mean'] = max_abs_diff.rolling(**rol_args).mean().astype(np.float32)

            # Collect once per window length
            gc.collect()

    df.reset_index(inplace=True)
    # Window edges (and the first diff) produce NaNs; drop those rows
    df.dropna(inplace=True)

    return df

In [None]:
# Path of the Parquet dataset read by feat_eng_by_id and by the series id listing
file = "../data/processed/merged_data.parquet"


In [None]:
def feat_eng_by_id(idx):
    """Load the rows of one series from `file` and return its engineered features.

    Only the rows whose `series_id` equals `idx` are read. The `event` column
    is cast to int8 before the frame is handed to `feat_eng`.
    """
    # Imported and configured inside the function so the filter is applied
    # wherever this function actually executes
    import warnings
    warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

    series_df = pd.read_parquet(file, filters=[('series_id', '=', idx)])
    series_df['event'] = series_df['event'].astype(np.int8)

    return feat_eng(series_df)


In [None]:
# Development switch: when True, later cells work on a reduced amount of data
DEV = False

# Distinct series ids found in the dataset at `file`
series_id = pd.read_parquet(file, columns=['series_id'])['series_id'].unique()

print(len(series_id))

# In DEV mode keep only every 10th series id
if DEV:
    series_id = series_id[::10]

In [None]:
%%time

# Run feat_eng_by_id for every series id on 6 joblib workers, then stack the
# per-series frames into a single table with a fresh integer index.
train = Parallel(n_jobs=6)(delayed(feat_eng_by_id)(i) for i in series_id)
train = pd.concat(train, ignore_index=True)

In [None]:
# Display the `train` DataFrame as the cell output
train

In [None]:
# Downsample the training data: keep only every `step`-th row
# (every 60th row normally, every 400th row in DEV mode).
step=400 if DEV else 60
train = train.iloc[::step]
# Show the stride that was applied
step

In [None]:
# Identifier / bookkeeping columns that are not used as model inputs
drop_cols = ['series_id', 'step', 'timestamp']

# Target is the `event` column; features are everything except it and drop_cols
y = train['event']
X = train.drop(columns=drop_cols + ['event'])

gc.collect()


In [None]:
# Outside DEV mode, drop the `train` name once X and y have been derived
# from it, then run the garbage collector.
if not DEV:
    del train
    gc.collect()

In [None]:
class EnsembleAvgProba():
    """Ensemble that averages the outputs of several classifiers.

    Every member must expose ``fit(X, y)``, ``predict(X)`` and
    ``predict_proba(X)``.
    """

    def __init__(self, classifiers):
        """Store the list of member classifiers (they are not copied)."""
        self.classifiers = classifiers

    def fit(self, X, y):
        """Fit every member on the same ``X`` and ``y``.

        Returns
        -------
        EnsembleAvgProba
            ``self``, so that calls can be chained.
        """
        for classifier in self.classifiers:
            classifier.fit(X, y)
            # Run the garbage collector between members
            gc.collect()

        return self

    def _mean_over_classifiers(self, method_name, X):
        """Call ``method_name(X)`` on every member and average element-wise.

        Raises ``ValueError`` (from ``np.stack``) when there are no members.
        """
        outputs = [getattr(member, method_name)(X) for member in self.classifiers]
        return np.mean(np.stack(outputs), axis=0)

    def predict_proba(self, X):
        """Return the element-wise mean of the members' ``predict_proba`` outputs."""
        return self._mean_over_classifiers('predict_proba', X)

    def predict(self, X):
        """Return the rounded mean of the members' ``predict`` outputs.

        The result is a float array. The mean is rounded with NumPy's
        ``round``, which rounds halves to the nearest even value, so an exact
        tie between labels 0 and 1 (mean 0.5) yields 0.
        """
        return self._mean_over_classifiers('predict', X).round()

In [None]:
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb

# Keyword arguments for lgb.LGBMClassifier
lgb_params1 = dict(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=6,
    learning_rate=0.03,
    n_estimators=1,
    subsample_for_bin=200000,
    min_child_weight=0.001,
    min_child_samples=20,
    subsample=0.9,
    colsample_bytree=0.7,
    reg_alpha=0.05,
    reg_lambda=0.05,
)

# Keyword arguments for xgb.XGBClassifier
xgb_params = dict(
    n_estimators=1,
    objective="binary:logistic",
    learning_rate=0.02,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.7,
    random_state=42,
)

# Keyword arguments for CatBoostClassifier
cat_params = dict(
    iterations=2,
    learning_rate=0.03,
    depth=6,
    random_state=42,
    verbose=0,  # silences training output; remove or raise it to see progress
)

# Six classifiers combined by EnsembleAvgProba
model = EnsembleAvgProba(
    classifiers=[
        lgb.LGBMClassifier(random_state=42, **lgb_params1),
        GradientBoostingClassifier(
            n_estimators=1, max_depth=5, min_samples_leaf=300, random_state=42
        ),
        RandomForestClassifier(
            n_estimators=1, min_samples_leaf=300, random_state=42, n_jobs=-1
        ),
        xgb.XGBClassifier(**xgb_params),
        CatBoostClassifier(**cat_params),
        ExtraTreesClassifier(
            n_estimators=1, min_samples_leaf=300, random_state=42, n_jobs=-1
        ),
    ]
)



In [None]:
# Fit every classifier of the ensemble on the training features and target
model.fit(X, y)


In [None]:
def get_events(idx, classifier, file='test_series.parquet'):
    """Predict onset / wakeup events for one series.

    Reads the rows of series `idx` from ``../data/raw/{file}``, builds the
    features with `feat_eng`, and scores them with `classifier` (which must
    provide ``predict`` and ``predict_proba``). Relies on the module-level
    `drop_cols` list to separate identifier columns from model inputs.

    Processing:

    * predictions are smoothed with a centered rolling median over 361 rows;
    * where the smoothed prediction is 0 the probability is replaced by
      ``1 - probability``;
    * ``score`` is a centered rolling mean of that probability over
      ``60*12*5`` rows (at least 10 observations), back- then forward-filled;
    * a +1 step in the smoothed prediction is labelled ``'wakeup'``, a -1
      step ``'onset'``;
    * per calendar date, the first wakeup and the last onset are kept.

    Returns
    -------
    pandas.DataFrame
        The selected wakeup and onset rows, sorted by ``timestamp``.
    """
    test = pd.read_parquet(f'../data/raw/{file}',
                           filters=[('series_id', '=', idx)])
    test = feat_eng(test)
    X_test = test.drop(columns=drop_cols)
    # Explicit copy: new columns are assigned below, and writing into a frame
    # derived by selection triggers pandas' chained-assignment warning
    test = test[drop_cols].copy()

    preds = classifier.predict(X_test)
    probs = classifier.predict_proba(X_test)[:, 1]

    test['prediction'] = preds
    # No min_periods is given, so the first and last rows of the smoothed
    # prediction are NaN
    test['prediction'] = test['prediction'].rolling(360+1, center=True).median()
    test['probability'] = probs

    # NOTE(review): this keeps every row whose smoothed prediction is not 2;
    # presumably a no-op for a 0/1 classifier — confirm whether it is still needed
    test = test[test['prediction'] != 2].copy()

    zero_pred = test['prediction'] == 0
    test.loc[zero_pred, 'probability'] = 1 - test.loc[zero_pred, 'probability']
    test['score'] = test['probability'].rolling(60*12*5, center=True, min_periods=10).mean().bfill().ffill()

    test['pred_diff'] = test['prediction'].diff()

    test['event'] = test['pred_diff'].replace({1: 'wakeup', -1: 'onset', 0: np.nan})

    # Group by calendar date: first wakeup and last onset of each date
    test_wakeup = test[test['event'] == 'wakeup'].groupby(test['timestamp'].dt.date).agg('first')
    test_onset = test[test['event'] == 'onset'].groupby(test['timestamp'].dt.date).agg('last')
    test = pd.concat([test_wakeup, test_onset], ignore_index=True).sort_values('timestamp')

    return test

In [None]:
# Columns kept for the submission table
cols_sub = ['series_id', 'step', 'event', 'score']

# Distinct series ids present in the raw test set
series_id = pd.read_parquet(
    '../data/raw/test_series.parquet', columns=['series_id']
)['series_id'].unique()

# One frame of detected events per series
tests = []
for idx in series_id:
    test = get_events(idx, model)
    tests.append(test[cols_sub])

In [None]:
# Stack the per-series event frames and expose the new integer index as a `row_id` column
submission = pd.concat(tests, ignore_index=True).reset_index(names='row_id')


In [None]:
# Display the `submission` DataFrame as the cell output
submission