In [4]:
import numpy as np
import pandas as pd
from lazypredict.Supervised import LazyClassifier
import pickle
from sklearn.model_selection import train_test_split
import pathlib
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder



In [5]:
# Project root: the parent of the current working directory (kept as a plain string).
PROJECT_PATH = os.path.join(os.getcwd(), "..")

# Storage locations for pickled models and for datasets, both under the project root.
MODELS_DIR = pathlib.Path(PROJECT_PATH, "store", "models")
DATASET_DIR = pathlib.Path(PROJECT_PATH, "data")

In [27]:
# Fraction of the training rows kept in the pickled sub-sample.
SAMPLE_SIZE = 0.2


def preprocess(X: pd.DataFrame, y: pd.Series, path: pathlib.Path, sample_size: float = SAMPLE_SIZE):
    """Encode, split and standardise a dataset, and pickle a sub-sample of it.

    Steps:
      1. One-hot encode the object/category columns and keep the numeric
         columns. Columns of any other dtype are dropped whenever at least
         one categorical or numeric column exists; if neither kind exists the
         frame is used unchanged.
      2. Hold out 10% of the rows as the test set (``random_state=42``).
      3. Standardise every feature column (one-hot indicators included) with
         a ``StandardScaler`` fitted on the training rows only.
      4. Draw a ``sample_size`` fraction of the training rows
         (``random_state=42``) and pickle
         ``[X_sample, y_sample, X_test, y_test]`` to
         ``path / f"dataset_{sample_size}.pkl"``.

    Args:
        X: Feature table.
        y: Target values, one per row of ``X``.
        path: Directory the pickle is written to; created if missing.
        sample_size: Passed as ``test_size`` to ``train_test_split`` when
            drawing the sub-sample from the training set.

    Returns:
        ``(X_train, y_train, X_test, y_test)``. ``X_train`` / ``X_test`` are
        the scaler's outputs; ``y_train`` / ``y_test`` are the corresponding
        parts of ``y``.
    """
    # Identify categorical and numeric columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

    # Processed column groups, concatenated side by side below
    processed_columns = []

    # If there are categorical columns, apply one-hot encoding
    if categorical_cols:
        print("Encoding categorical columns...")
        # Keep the encoder's default sparse output and densify it here. The
        # ``sparse=False`` keyword was renamed to ``sparse_output`` in
        # scikit-learn 1.2 and removed in 1.4, so neither spelling works on
        # every version.
        onehot_encoder = OneHotEncoder(categories='auto')
        encoded = onehot_encoder.fit_transform(X[categorical_cols]).toarray()
        X_categorical = pd.DataFrame(encoded,
                                     columns=onehot_encoder.get_feature_names_out(categorical_cols))
        processed_columns.append(X_categorical)

    # Collect the numeric columns
    if numeric_cols:
        print("Scaling numerical columns...")
        # The actual standardisation is done once, after the split, by a
        # scaler fitted on the training rows only. Fitting another scaler on
        # the full dataset here would use test-row statistics and is
        # redundant, because every column is re-standardised below.
        # The index is reset so the rows line up positionally with the
        # one-hot frame, which carries a fresh RangeIndex.
        X_numeric = X[numeric_cols].reset_index(drop=True)
        processed_columns.append(X_numeric)

    # Combine the processed columns
    if processed_columns:
        X_processed = pd.concat(processed_columns, axis=1)
    else:
        X_processed = X.copy()  # If there are no categorical or numeric columns, keep the original dataframe

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.1, random_state=42)

    # Standardise using statistics of the training rows only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Sample a subset of the training data
    _, X_sample, _, y_sample = train_test_split(X_train, y_train, test_size=sample_size, random_state=42)

    # Print the shapes of the datasets
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_train shape:", y_train.shape)
    print("y_test shape:", y_test.shape)
    print("X_sample shape:", X_sample.shape)
    print("y_sample shape:", y_sample.shape)

    # Save the processed data, creating the target directory if needed
    path = pathlib.Path(path)
    path.mkdir(parents=True, exist_ok=True)
    with open(path / f'dataset_{sample_size}.pkl', 'wb') as f:
        pickle.dump([X_sample, y_sample, X_test, y_test], f)

    return X_train, y_train, X_test, y_test

In [43]:
def train_and_save_models(X_train, y_train, X_test, y_test, dataset_name=None, sample_size=None):
    """Benchmark classifiers with ``LazyClassifier`` and pickle a selected subset of them.

    Args:
        X_train, y_train: Training features and labels passed to ``LazyClassifier.fit``.
        X_test, y_test: Held-out features and labels passed to ``LazyClassifier.fit``.
        dataset_name: Prefix of the output file name. Defaults to the
            notebook-level ``DATASET_NAME`` at call time.
        sample_size: Value embedded in the output file name. Defaults to the
            notebook-level ``SAMPLE_SIZE`` at call time.

    Returns:
        The second element of the tuple returned by ``LazyClassifier.fit``
        (in the recorded run, the per-model metrics table).

    Side effects:
        Writes ``MODELS_DIR / f"{dataset_name}_{sample_size}_models.pkl"``
        containing a list of ``(model_name, fitted_model)`` tuples for the
        models named in ``models_to_save``. ``MODELS_DIR`` is created if missing.
    """
    if dataset_name is None:
        dataset_name = DATASET_NAME
    if sample_size is None:
        sample_size = SAMPLE_SIZE

    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    _, predictions = clf.fit(X_train, X_test, y_train, y_test)

    models_to_save = {
        "RandomForestClassifier",
        "BernoulliNB",
        "CalibratedClassifierCV",
        "LinearDiscriminantAnalysis",
        "LogisticRegression",
        "AdaBoostClassifier",
        "ExtraTreesClassifier",
        "XGBClassifier",
        "LGBMClassifier",
    }

    # lazypredict keeps the fitted pipelines in a dict keyed by model name
    # (per the library source; confirm for the installed version). Iterating
    # a dict yields only its keys, so indexing an item with [0] gives the
    # first character of a name, which never equals a full model name and
    # leaves the saved list empty. Iterate (name, model) pairs instead, and
    # also accept an iterable of pairs in case the attribute is not a mapping.
    fitted = clf.models
    named_models = fitted.items() if hasattr(fitted, "items") else fitted
    models = [(name, pipe) for name, pipe in named_models if name in models_to_save]

    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    with open(MODELS_DIR / f'{dataset_name}_{sample_size}_models.pkl', 'wb') as f:
        pickle.dump(models, f)

    return predictions
    

# Gesture Phase

In [8]:
# Active dataset; the name is reused for the CSV file name and the saved-models file name.
DATASET_NAME = "gesture_phase"

# Per-dataset directory under DATASET_DIR, created if it does not exist yet.
DATASET_PATH = DATASET_DIR.joinpath(DATASET_NAME)
DATASET_PATH.mkdir(exist_ok=True, parents=True)

In [18]:
# Read the dataset CSV and discard its first 20 columns.
csv_file = DATASET_PATH / f"{DATASET_NAME}.csv"
dataset = pd.read_csv(csv_file).iloc[:, 20:]
dataset

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,24,25,26,27,28,29,30,31,32,Phase
0,-0.01,-0.00,0.00,0.01,0.01,0.00,-0.00,0.00,-0.00,0.01,...,0.00,0.01,0.01,0.00,0.01,0.00,0.00,0.00,0.00,D
1,0.00,0.00,-0.00,0.00,0.00,0.00,-0.00,-0.00,0.00,0.00,...,-0.00,0.01,0.01,0.00,0.00,0.00,0.00,0.00,0.00,D
2,-0.00,-0.00,0.00,0.00,0.00,0.00,-0.00,0.00,0.00,0.00,...,-0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,D
3,-0.00,-0.00,0.00,0.00,0.00,0.00,-0.00,-0.00,0.00,0.00,...,-0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,D
4,-0.00,-0.00,0.00,0.00,0.00,0.00,-0.00,0.00,0.00,0.00,...,-0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9868,-0.00,-0.01,0.00,-0.00,0.00,-0.00,-0.00,-0.00,0.00,0.00,...,-0.00,0.01,0.00,0.01,0.00,0.00,0.00,0.00,0.00,D
9869,-0.00,0.00,-0.00,0.00,-0.00,0.00,-0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,D
9870,0.00,0.01,-0.00,0.00,-0.00,0.00,0.00,0.01,-0.00,0.00,...,0.00,0.01,0.00,0.01,0.00,0.00,0.00,0.00,0.00,D
9871,0.00,0.01,-0.00,0.00,-0.00,0.00,0.00,0.01,-0.00,0.00,...,0.00,0.01,0.00,0.01,0.00,0.00,0.00,0.00,0.00,D


In [19]:
# Features are every column except the last; the last column is the target.
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

In [20]:
# Map the single-letter phase codes to integer class labels.
phase_codes = {"S": 0, "D": 1, "P": 2, "R": 3, "H": 4}
y = y.replace(phase_codes)
y = y.astype(int)
y.value_counts()

Phase
0    2950
1    2741
2    2097
3    1087
4     998
Name: count, dtype: int64

In [41]:
# Build the scaled train/test split and pickle the sub-sample into the dataset directory.
X_train, y_train, X_test, y_test = preprocess(X=X, y=y, path=DATASET_PATH)

Scaling numerical columns...
X_train shape: (8885, 32)
X_test shape: (988, 32)
y_train shape: (8885,)
y_test shape: (988,)
X_sample shape: (1777, 32)
y_sample shape: (1777,)


In [44]:
# The output file name is built from the notebook-level DATASET_NAME and SAMPLE_SIZE.
train_and_save_models(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

 97%|█████████▋| 28/29 [00:38<00:01,  1.38s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 8885, number of used features: 32
[LightGBM] [Info] Start training from score -1.203785
[LightGBM] [Info] Start training from score -1.283797
[LightGBM] [Info] Start training from score -1.548317
[LightGBM] [Info] Start training from score -2.216888
[LightGBM] [Info] Start training from score -2.288614


100%|██████████| 29/29 [00:39<00:00,  1.37s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.71,0.66,,0.7,1.97
ExtraTreesClassifier,0.72,0.64,,0.7,1.85
LGBMClassifier,0.69,0.63,,0.68,0.98
RandomForestClassifier,0.7,0.62,,0.69,9.24
KNeighborsClassifier,0.63,0.56,,0.62,0.07
LabelSpreading,0.62,0.55,,0.61,3.51
LabelPropagation,0.62,0.55,,0.61,2.61
BaggingClassifier,0.62,0.55,,0.61,4.93
DecisionTreeClassifier,0.54,0.49,,0.54,0.78
ExtraTreeClassifier,0.49,0.47,,0.5,0.03


# Adult

In [6]:
# Switch the active dataset; the name is reused in the saved-models file name.
DATASET_NAME = "adult"

# Per-dataset directory under DATASET_DIR, created if it does not exist yet.
DATASET_PATH = DATASET_DIR.joinpath(DATASET_NAME)
DATASET_PATH.mkdir(exist_ok=True, parents=True)

In [7]:
from ucimlrepo import fetch_ucirepo

# Fetch the dataset registered under id 2.
adult = fetch_ucirepo(id=2)

# Wrap the features and targets in DataFrames.
adult_data = adult.data
X = pd.DataFrame(adult_data.features)
y = pd.DataFrame(adult_data.targets)
  

In [8]:

# Binarise the income labels; the mapping covers each label with and without a trailing period.
income_to_label = {'<=50K': 0, '>50K': 1, "<=50K.": 0, ">50K.": 1}
y['income'] = y['income'].replace(income_to_label)
y.value_counts()

income
0         37155
1         11687
Name: count, dtype: int64

In [None]:
# preprocess() has no default for `path`; pass the dataset directory so the call does not raise TypeError.
X_train, y_train, X_test, y_test = preprocess(X, y, DATASET_PATH)

In [19]:
# Scratch check: one-hot encode a 100-row subset and split it.
# Pick the row positions once and apply them to both X and y. Calling
# .sample(100) on each frame separately draws different random rows, so the
# features and targets would no longer belong to the same records.
sample_rows = np.random.default_rng(42).choice(len(X), size=100, replace=False)
# Default (sparse) encoder output densified with .toarray(); this avoids the
# `sparse=` keyword, which newer scikit-learn releases no longer accept.
X_train, X_test, y_train, y_test = train_test_split(
    OneHotEncoder(categories='auto').fit_transform(X.iloc[sample_rows]).toarray(),
    y.iloc[sample_rows],
    test_size=0.1,
    random_state=42,
)
type(y_train)

pandas.core.frame.DataFrame

In [24]:
# Round-trip X_train through a DataFrame and show the underlying array.
pd.DataFrame(X_train).to_numpy()

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])