In [296]:
import numpy as np
import pandas as pd
import time
import gc
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
gc.enable()

In [38]:
# Load the posture data set.
# - skiprows=[1] drops the first line after the header
#   (presumably a placeholder record -- TODO confirm against the raw file).
# - "?" marks a missing coordinate in the file; declaring it via `na_values`
#   lets pandas parse the affected columns as numeric with NaN directly,
#   instead of reading them as strings and patching the frame in place.
df = pd.read_csv('data/Postures.csv', skiprows=[1], na_values="?")
df.head()

Unnamed: 0,Class,User,X0,Y0,Z0,X1,Y1,Z1,X2,Y2,...,Z8,X9,Y9,Z9,X10,Y10,Z10,X11,Y11,Z11
0,1,0,54.26388,71.466776,-64.807709,76.895635,42.4625,-72.780545,36.621229,81.680557,...,,,,,,,,,,
1,1,0,56.527558,72.266609,-61.935252,39.135978,82.53853,-49.596509,79.223743,43.254091,...,,,,,,,,,,
2,1,0,55.849928,72.469064,-62.562788,37.988804,82.631347,-50.606259,78.451526,43.567403,...,,,,,,,,,,
3,1,0,55.329647,71.707275,-63.688956,36.561863,81.868749,-52.752784,86.32063,68.214645,...,,,,,,,,,,
4,1,0,55.142401,71.435607,-64.177303,36.175818,81.556874,-53.475747,76.986143,42.426849,...,,,,,,,,,,


## Gesture

In [320]:
def reduced_feature_training_posture(X, y, group_feature, model, param_grid, random_state=42):
    """
    Check one group-wise split of the hand posture data for single-class
    test subgroups.

    The rows are split once with GroupShuffleSplit (train_size=0.8) using
    `group_feature` as the groups. The test rows are then partitioned by
    their missing-value pattern, and the function reports whether any
    partition contains exactly one distinct label.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; may contain NaNs.
    y : pandas.Series
        Labels, row-aligned with X.
    group_feature : pandas.Series
        Group label of every row (row-aligned with X) used for the split.
    model, param_grid
        Currently unused; they belong to the reduced-feature training
        pipeline, which is disabled at the moment.
    random_state : int
        Seed forwarded to GroupShuffleSplit.

    Returns
    -------
    int
        1 as soon as a missing-value pattern is found whose test rows all
        share one label, otherwise 0.
    """
    gss = GroupShuffleSplit(n_splits=1, train_size=.8, random_state=random_state)
    # Split on the caller-supplied groups: the function must not depend on a
    # notebook-global `User` happening to exist and to match X.
    other_idx, test_idx = next(gss.split(X, y, group_feature))
    X_other, y_other = X.iloc[other_idx], y.iloc[other_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
    User_other = group_feature.iloc[other_idx]

    ## find all unique patterns of missing value in test set
    mask = X_test.isnull()
    unique_rows = np.unique(mask, axis=0)  # one boolean row per distinct pattern
    all_y_test_pred = pd.DataFrame()

    print('there are', len(unique_rows), 'unique missing value patterns.')

    # The per-pattern training inputs below (and the two containers) are not
    # consumed by the single-class check; they are kept because the disabled
    # training pipeline expects them under these names.
    reduced_model = {"sub_cols": [], "model": []}
    for unique_rows_sub in unique_rows:
        # columns that are observed (not NaN) under this pattern
        sub_cols = X_test.columns[~unique_rows_sub]
        reduced_model['sub_cols'].append(sub_cols)

        # test rows whose NaN mask equals this pattern in every column
        sub_rows = (mask == unique_rows_sub).all(axis=1)
        sub_X_test = X_test.loc[sub_rows, sub_cols]
        sub_y_test = y_test.loc[sub_X_test.index]

        # training rows that are complete on the observed columns,
        # with their labels and groups
        sub_X_other = X_other[sub_cols].dropna()
        sub_y_other = y_other.loc[sub_X_other.index]
        sub_User_other = User_other.loc[sub_X_other.index]

        if len(set(sub_y_test)) == 1:
            return 1
    return 0
#         ## prepare pipe
#         numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
#         preprocessor = ColumnTransformer(remainder="passthrough",
#                                          transformers=[("num", numeric_transformer, sub_cols)])
#         pipe = Pipeline(steps=[("preprocessor", preprocessor),
#                                ("classifier", model)])
#         # GroupKFold, k = min(4, length of unique value)
#         gkf = GroupKFold(n_splits=min(4, len(sub_User_other.unique())))
#         cv_idx = list(gkf.split(sub_X_other, sub_y_other, sub_User_other))
#         # init grid
#         grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv_idx, 
#                             scoring=make_scorer(accuracy_score), 
#                             iid=False, n_jobs=-1)
#         le = LabelEncoder()
#         # label encode y incase there are class missing
#         sub_y_other = le.fit_transform(sub_y_other)
#         ## start training
#         b_time = time.time()
#         grid.fit(sub_X_other, sub_y_other)
#         e_time = time.time()
#         print(f"took {round(e_time - b_time, ndigits=2)}s")
#         # refit using the best hyperparameters
#         pipe.set_params(**grid.best_params_)
#         pipe.fit(sub_X_other, sub_y_other)
#         reduced_model["model"].append(pipe)

#         ## predict y, transform back and save
#         sub_y_test_pred = pipe.predict(sub_X_test)
#         print(f"accuracy: {accuracy_score(le.transform(sub_y_test), sub_y_test_pred)}")
#         sub_y_test_pred = pd.DataFrame(le.inverse_transform(sub_y_test_pred), 
#                                        columns=['sub_y_test_pred'], index=sub_y_test.index)
#         all_y_test_pred = all_y_test_pred.append(sub_y_test_pred)
        
#     all_y_test_pred = all_y_test_pred.sort_index()
#     y_test = y_test.sort_index()
#     total_acc = accuracy_score(y_test, all_y_test_pred)
#     return reduced_model, total_acc

In [327]:
# Posture task: the label is the gesture class, and the user id is the
# grouping variable for the split.
N_SPLITS = 20    # number of differently seeded splits to examine
BASE_SEED = 42   # split i uses random_state = BASE_SEED + i

y = df['Class']
User = df['User']
# NOTE(review): assumes the first two columns of df are Class and User, so
# everything from position 2 onward is a feature -- confirm against df.columns.
X = df.iloc[:, 2:]
# C grid evaluates to [1e3, 1e-1, 1e-5]
param_grid_logi = {'classifier__C': 1/np.logspace(-3,5,3)}
model_logi = LogisticRegression(penalty='l1', solver='saga', max_iter = 10000, multi_class='auto')
count = 0
for s in range(N_SPLITS):
    count += reduced_feature_training_posture(X, y, User, model_logi, param_grid_logi, BASE_SEED+s)
print()
# the total in the message is tied to the loop bound so the two cannot drift apart
print(f"split with only 1 class occurred {count} out of {N_SPLITS}")

there are 10 unique missing value patterns.
there are 9 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 9 unique missing value patterns.
there are 10 unique missing value patterns.
there are 9 unique missing value patterns.
there are 9 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 9 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 9 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 9 unique missing value patterns.
there are 10 unique missing value patterns.

split with only 1 class occurred 16 out of 20


## User

In [322]:
def reduced_feature_training_user(X, y, stratify_ref, model, param_grid, random_state=42):
    """
    Check one stratified split of the hand posture data for single-class
    test subgroups.

    The rows are split once with train_test_split (test_size=0.2),
    stratified on `stratify_ref`. The test rows are then partitioned by
    their missing-value pattern, and the function reports whether any
    partition contains exactly one distinct label.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; may contain NaNs.
    y : pandas.Series
        Labels, sharing X's index.
    stratify_ref : pandas.Series
        Stratification key of every row, sharing X's index.
    model, param_grid
        Currently unused; they belong to the reduced-feature training
        pipeline, which is disabled at the moment.
    random_state : int
        Seed forwarded to train_test_split.

    Returns
    -------
    int
        1 as soon as a missing-value pattern is found whose test rows all
        share one label, otherwise 0.
    """
    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=stratify_ref)
    # X_other.index holds index *labels*, so select by label (.loc).
    # Positional .iloc only coincides with this for a default RangeIndex and
    # fails or picks wrong rows otherwise.
    stratify_kfold_ref = stratify_ref.loc[X_other.index]

    ## find all unique patterns of missing value in test set
    mask = X_test.isnull()
    unique_rows = np.unique(mask, axis=0)  # one boolean row per distinct pattern
    all_y_test_pred = pd.DataFrame()

    print('there are', len(unique_rows), 'unique missing value patterns.')

    # The per-pattern training inputs below (and the two containers) are not
    # consumed by the single-class check; they are kept for the disabled
    # training pipeline.
    reduced_model = {"sub_cols": [], "model": []}
    for unique_rows_sub in unique_rows:
        # columns that are observed (not NaN) under this pattern
        sub_cols = X_test.columns[~unique_rows_sub]
        reduced_model['sub_cols'].append(sub_cols)

        # test rows whose NaN mask equals this pattern in every column
        sub_rows = (mask == unique_rows_sub).all(axis=1)
        sub_X_test = X_test.loc[sub_rows, sub_cols]
        sub_y_test = y_test.loc[sub_X_test.index]

        # training rows that are complete on the observed columns, with
        # their labels and stratification keys
        sub_X_other = X_other[sub_cols].dropna()
        sub_y_other = y_other.loc[sub_X_other.index]
        sub_stratify_kfold_ref = stratify_kfold_ref.loc[sub_X_other.index]

        if len(set(sub_y_test)) == 1:
            return 1
    return 0
        
        ## prepare pipe
#         numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
#         preprocessor = ColumnTransformer(remainder="passthrough",
#                                          transformers=[("num", numeric_transformer, sub_cols)])
#         pipe = Pipeline(steps=[("preprocessor", preprocessor),
#                                ("classifier", model)])
#         # GroupKFold, k = min(4, length of unique value)
#         skf = StratifiedKFold(n_splits= min(4, len(sub_User_other.unique())))
#         cv_idx = list(skf.split(sub_X_other, sub_y_other, sub_User_other))
#         # init grid
#         grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv_idx, 
#                             scoring=make_scorer(accuracy_score), 
#                             iid=False, n_jobs=-1)
#         le = LabelEncoder()
#         # label encode y incase there are class missing
#         sub_y_other = le.fit_transform(sub_y_other)
#         ## start training
#         b_time = time.time()
#         grid.fit(sub_X_other, sub_y_other)
#         e_time = time.time()
#         print(f"took {round(e_time - b_time, ndigits=2)}s")
#         # refit using the best hyperparameters
#         pipe.set_params(**grid.best_params_)
#         pipe.fit(sub_X_other, sub_y_other)
#         reduced_model["model"].append(pipe)

#         ## predict y, transform back and save
#         sub_y_test_pred = pipe.predict(sub_X_test)
#         print(f"accuracy: {accuracy_score(le.transform(sub_y_test), sub_y_test_pred)}")
#         sub_y_test_pred = pd.DataFrame(le.inverse_transform(sub_y_test_pred), 
#                                        columns=['sub_y_test_pred'], index=sub_y_test.index)
#         all_y_test_pred = all_y_test_pred.append(sub_y_test_pred)
        
#     all_y_test_pred = all_y_test_pred.sort_index()
#     y_test = y_test.sort_index()
#     total_acc = accuracy_score(y_test, all_y_test_pred)
#     return reduced_model, total_acc

In [326]:
# User task: the label is the user id; the split is stratified on the
# combined "Class&User" key.
N_SPLITS = 20    # number of differently seeded splits to examine
BASE_SEED = 42   # split i uses random_state = BASE_SEED + i

y = df['User']
# NOTE(review): assumes the first two columns of df are Class and User, so
# everything from position 2 onward is a feature -- confirm against df.columns.
X = df.iloc[:, 2:]
stratify_other_test_ref = df['Class'].astype(str) + "&" + df['User'].astype(str)
# C grid evaluates to [1e3, 1e-1, 1e-5]
param_grid_logi = {'classifier__C': 1/np.logspace(-3,5,3)}
model_logi = LogisticRegression(penalty='l1', solver='saga', max_iter = 10000, multi_class='auto')
count = 0
for s in range(N_SPLITS):
    count += reduced_feature_training_user(X, y, stratify_other_test_ref, model_logi, param_grid_logi, BASE_SEED+s)
print()
# the total in the message is tied to the loop bound so the two cannot drift apart
print(f"split with only 1 class occurred {count} out of {N_SPLITS}")

there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.
there are 10 unique missing value patterns.

split with only 1 class occurred 7 out of 20
