### Dipendenze

In [None]:
from xgboost import XGBClassifier
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import re
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

In [None]:
# Experiment configuration.
# Alternative seed list: RANDOM_STATE_LIST = [int(i*11) for i in range(1,16)]
RANDOM_STATE_LIST = [42, 123, 456]
ROW_TIME = 4  # seconds of data summarised by one dataframe row
SAVE = False

# Output folder for the result CSV files; created on first run.
mypath = os.getcwd() + '/data/data_total/'
os.makedirs(mypath[:-1], exist_ok=True)

DATASET = 'MultiPositionWearable'  # 'MultiPositionWearable', 'selfBACK', '?'

# File and folder names used by each supported dataset.
_NOMI_FILE_PER_DATASET = {
    'MultiPositionWearable': {
        'baseline': 'MultiPosition_wearable_modelXGBtotal_baseline',
        'modello_base': 'MultiPosition_wearable_modelXGBtotal_base',
        'modello_varianza': 'MultiPosition_wearable_modelXGBtotal_varianza',
        'cartella_dati': 'MultiPosition_wearable_processed_data',
    },
    'selfBACK': {
        'baseline': 'selfBACK_modelXGBtotal_baseline',
        'modello_base': 'selfBACK_modelXGBtotal_base',
        'modello_varianza': 'selfBACK_modelXGBtotal_varianza',
        'cartella_dati': 'selfBACK_processed_data',
    },
    '?': {
        'baseline': '?_modelXGBtotal_baseline',
        'modello_base': '?_modelXGBtotal_base',
        'modello_varianza': '?_modelXGBtotal_varianza',
        'cartella_dati': '?_processed_data',
    },
}

# Reject unknown dataset names before anything else runs.
if DATASET not in _NOMI_FILE_PER_DATASET:
    raise ValueError("DATASET non valido. Scegliere tra 'MultiPositionWearable', 'selfBACK' o '?'")

NOMI_FILE = _NOMI_FILE_PER_DATASET[DATASET]

### Carica Dati per Modello

In [None]:
# Load every pre-processed "grouped_data*" file of the selected dataset into
# one dataframe.
mypath_carica = os.getcwd() + '/data/' + NOMI_FILE['cartella_dati'] + '/'
file_pattern = 'grouped_data.*'

# Compile the pattern once instead of once per directory entry.
_pattern_dati = re.compile(file_pattern)

# listdir() returns entries in arbitrary order; sort them so that the row
# order of df_data (and everything derived from it) is reproducible.
file_list = sorted(
    f for f in listdir(mypath_carica)
    if isfile(join(mypath_carica, f)) and _pattern_dati.match(f)
)
if not file_list:
    raise FileNotFoundError(
        f"No file matching '{file_pattern}' found in {mypath_carica}")

# Read all files first and concatenate once: concatenating inside the loop
# copies the accumulated data again at every iteration.
frames = []
for file in file_list:
    df = pd.read_csv(mypath_carica + file, header=0)
    if DATASET == 'default':
        # Legacy branch: drop the first column for the 'default' dataset.
        df = df.iloc[:, 1:]
    frames.append(df)
df_data = pd.concat(frames).reset_index(drop=True)
def set_labels(df):
    """Add an integer ``label`` column that encodes the ``Activity`` column.

    The frame is modified in place and also returned.

    Returns:
        A ``(df, classes)`` tuple where ``classes`` is the ``classes_``
        attribute of the fitted ``LabelEncoder``.
    """
    encoder = LabelEncoder()
    encoder.fit(df['Activity'])
    df['label'] = encoder.transform(df['Activity'])
    return df, encoder.classes_
# Encode the activities; `labels` receives the class names returned by set_labels.
df_data, labels = set_labels(df_data)


### Bilanciamento Dati
Ogni gruppo (utente, posizione, attività) viene ridotto allo stesso numero di righe del gruppo che ne ha meno.

In [None]:
def balance_user_labels(df):
    """Truncate every (Userid, position, label) group to the smallest group size.

    The first rows of each group are kept. The result is ordered by
    (Userid, position, label), rows keep their original relative order inside
    each group, and the index is reset.

    Args:
        df: dataframe with at least the 'Userid', 'position' and 'label'
            columns.

    Returns:
        A new, balanced dataframe containing all the original columns.
    """
    keys = ['Userid', 'position', 'label']
    if df.empty:
        return df.reset_index(drop=True)

    grouped = df.groupby(keys)
    min_count = int(grouped.size().min())

    # GroupBy.head keeps the grouping columns in the result. The previous
    # groupby.apply relied on the grouping columns being passed to the
    # applied function, which pandas has deprecated.
    balanced_df = grouped.head(min_count)

    # head() keeps the original row order; a stable sort reproduces the
    # group-by-group layout that apply() used to return.
    return balanced_df.sort_values(keys, kind='stable').reset_index(drop=True)

# Balance the loaded dataset across (Userid, position, label) groups.
df_data = balance_user_labels(df_data)

In [None]:
def strip_Spaces(df):
    """Strip leading and trailing whitespace from the column names.

    The frame is modified in place and the same object is returned.
    """
    cleaned_names = df.columns.str.strip()
    df.columns = cleaned_names
    return df
# Clean up the column names of the working dataframe.
df_data = strip_Spaces(df_data)

### Per quando uso sensori multipli

In [None]:
def get_features_for_each_sensor(df_data, positions):
    """Place the rows of each sensor position side by side (long to wide).

    For every position the matching rows are taken in order, re-indexed from
    zero, and their columns are suffixed with ``_<position>``. The per-position
    blocks are then joined column-wise, so row ``i`` of one position is paired
    with row ``i`` of every other position.

    Only rows on which all positions carry the same label are kept; the
    per-position label columns are collapsed into a single ``label`` column.
    Rows containing NaN (for example when one position has fewer rows than
    another) are dropped.

    Args:
        df_data: dataframe with the feature columns plus 'position' and 'label'.
        positions: non-empty sequence of position names (strings).

    Returns:
        The wide dataframe with one ``label`` column.

    Raises:
        ValueError: if ``positions`` is empty.
    """
    if len(positions) == 0:
        raise ValueError("positions must contain at least one sensor position")

    df_data = df_data.reset_index(drop=True)

    blocks = []          # feature and label frames, in output column order
    labels_columns = []  # names of the per-position label columns

    for position in positions:
        df_pos = df_data[df_data['position'] == position].reset_index(drop=True)

        label_col_name = f'label_{position}'
        labels_columns.append(label_col_name)

        features_pos = df_pos.drop(columns=['label', 'position']).rename(
            columns=lambda x: x + '_' + position)
        blocks.append(features_pos)
        blocks.append(df_pos['label'].to_frame(name=label_col_name))

    # One column-wise concat instead of growing the frame position by position.
    df_final = pd.concat(blocks, axis=1)

    # Vectorised "all label columns agree" check; NaN never compares equal,
    # so incomplete rows are rejected here as well.
    label_values = df_final[labels_columns]
    mask = label_values.eq(label_values.iloc[:, 0], axis=0).all(axis=1)
    df_filtered = df_final[mask]

    # Keep a single label column.
    df_filtered = df_filtered.drop(columns=labels_columns[1:]).rename(
        columns={labels_columns[0]: 'label'})

    return df_filtered.dropna()


In [None]:
def duplicaRighePesi(df_moved, weight, varianza, random_state=None):
    """Repeat the rows of ``df_moved`` ``int(weight)`` times, optionally jittered.

    Args:
        df_moved: rows to be over-weighted in the training set.
        weight: repetition factor; it is truncated with ``int``.
        varianza: when false, every row is simply repeated ``int(weight)``
            times (copies of one row are adjacent). When true and
            ``weight > 1``, the original rows come first, followed by
            ``int(weight) - 1`` copies of each row whose numeric feature
            columns (everything except 'label') are multiplied by a random
            factor drawn uniformly from [0.99, 1.01).
        random_state: optional seed for the jitter. ``None`` (default) keeps
            using NumPy's global random state, as before.

    Returns:
        A new dataframe with a fresh 0..n-1 index. The only exception is
        ``varianza`` true with ``weight <= 1``, where the input is returned
        unchanged. The input frame is never modified.
    """
    if not varianza:
        return df_moved.loc[np.repeat(df_moved.index, int(weight))].reset_index(drop=True)

    if not weight > 1:
        return df_moved

    n_original = len(df_moved)
    copies = df_moved.loc[np.repeat(df_moved.index, int(weight) - 1)]
    result = pd.concat([df_moved, copies], ignore_index=True)

    # Jitter only numeric feature columns; cast to float first so that the
    # multiplication does not hit integer columns.
    feature_cols = result.drop(columns=['label'], errors='ignore').select_dtypes(
        include=[np.number]).columns
    result[feature_cols] = result[feature_cols].astype(float)

    # Rows after the first n_original are the copies; track them by position
    # instead of adding a helper column to the caller's frame.
    is_copy = np.arange(len(result)) >= n_original
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    noise = rng.uniform(0.99, 1.01, size=(int(is_copy.sum()), len(feature_cols)))
    result.loc[is_copy, feature_cols] *= noise

    return result


In [None]:
def get_train_test_data(df_data, user=None, random_state=42, percentage=None, weight = None, varianza = False):
    """Build the train/test split for one held-out user.

    Rows of every other user form the initial training set. For the held-out
    user, each (label, position) group is split by row order: the first 80%
    becomes a pool of rows that may be moved into training, the last 20% is
    the fixed test set. A fraction ``percentage`` of each pool group is
    sampled (seeded with ``random_state``), expanded by ``duplicaRighePesi``
    and appended to the training set.

    When ``df_data`` contains more than one position, train, test and moved
    rows are converted to one-row-per-sample wide format with
    ``get_features_for_each_sensor``.

    Args:
        df_data: dataframe already filtered to the positions of interest.
        user: ``Userid`` value of the held-out user.
        random_state: seed passed to ``DataFrame.sample``.
        percentage: fraction of each pool group to move into training.
            NOTE(review): the default ``None`` fails in ``len(...) * percentage``;
            callers must pass a number.
        weight: repetition factor forwarded to ``duplicaRighePesi``.
        varianza: forwarded to ``duplicaRighePesi``.

    Returns:
        ``(X_train, X_test, y_train, y_test, righe_mosse)`` where
        ``righe_mosse`` is the number of sampled rows, divided by the number
        of positions when there is more than one.
    """
    # Drop identifiers, user metadata, target columns and magnetometer features.
    all_features = [item for item in df_data.columns if
                    item not in ['Timestamp', 'Userid', 'UserAge', 'UserSex', 'UserHeight', 'UserWeight', 'Activity',
                                 'position', 'label', 'MagnxEnergy', 'MagnyEnergy', 'MagnzEnergy', 'MagnMagnitude',
                                 'MagnMagnitudeMean', 'MagnMagnitudeMin', 'MagnMagnitudeMax', 'MagnMagnitudeStd',
                                 'MagnMagnitudeEnergy']]
    all_features = [item for item in all_features if not re.match(r'.*MagnMagnitude.*', item)]
    # Keep only the columns whose name contains "Magnitude".
    features = [item for item in all_features if re.match(r'.*Magnitude.*', item)]
    # Author's note: in the end only the ACC and GYRO magnitudes are used.
    positions = list(df_data['position'].unique())
    df_train = df_data[df_data['Userid'] != user].reset_index(drop=True)
    df_test = df_data[df_data['Userid'] == user].reset_index(drop=True)

    df_test_list = []
    df_testFISSO_list = []

    # Per (label, position) group: first 80% -> movable pool, last 20% -> fixed test set.
    for (label, position), group in df_test.groupby(['label', 'position']):
        split_idx = int(len(group) * 0.8)
        group = group.sort_index()
        df_test_list.append(group.iloc[:split_idx])
        df_testFISSO_list.append(group.iloc[split_idx:])

    # From here on df_test is only the movable pool, re-indexed from zero;
    # the indices collected below refer to this new index.
    df_test = pd.concat(df_test_list).reset_index(drop=True)
    df_testFISSO = pd.concat(df_testFISSO_list).reset_index(drop=True)




    # Select the rows to move from the pool into the training set.
    moved_indices = []
    for label_value in df_test['label'].unique():
        df_test_label = df_test[df_test['label'] == label_value]
        for position_value in df_test_label['position'].unique():
            df_test_label_position = df_test_label[df_test_label['position'] == position_value]
            num_to_move = int(len(df_test_label_position) * percentage)
            if num_to_move > 0:
                indices_to_move = df_test_label_position.sample(n=num_to_move, random_state=random_state).index.tolist()
                moved_indices.extend(indices_to_move)
    righe_mosse = len(moved_indices)

    if len(positions) > 1:
        # Rows were counted once per position; report them per wide sample.
        righe_mosse = righe_mosse / len(positions)
        df_train = get_features_for_each_sensor(df_train[features + ['position', 'label']], positions)
        df_testFISSO  = get_features_for_each_sensor(df_testFISSO[features + ['position', 'label']], positions)
        print("fisse ",df_testFISSO['label'].value_counts())


    if moved_indices:
        df_moved = df_test.loc[moved_indices].copy()
        if len(positions) > 1:
            df_moved = get_features_for_each_sensor(df_moved[features + ['position', 'label']], positions)
            print("usabili ",df_moved['label'].value_counts())
        # Over-weight the moved rows before adding them to the training set.
        df_moved = duplicaRighePesi(df_moved, weight, varianza)
        df_train = pd.concat([df_train, df_moved], ignore_index=True).reset_index(drop=True)

    # Pool rows that were not sampled are not used any further.
    if len(positions) > 1:
        # Wide frames contain only the suffixed feature columns plus 'label'.
        X_train = df_train.drop(columns=['label'])
        X_test = df_testFISSO.drop(columns=['label'])
    else:
        X_train = df_train[features]
        X_test = df_testFISSO[features]
    y_train = df_train['label']
    y_test  = df_testFISSO['label']

    return X_train, X_test, y_train, y_test, righe_mosse

In [None]:
def prendiMax(df, position, random_states):
    """Return, per random state, the best mean F1 score of a position.

    The rows of ``position`` are averaged per (randomState, timeUsed) pair and
    the highest average of each random state is returned.

    Args:
        df: results dataframe with 'position', 'randomState', 'timeUsed' and
            'f1-score' columns.
        position: position name to select.
        random_states: one random state or a list of them.

    Returns:
        dict mapping each random state found to its maximum mean F1 score.
    """
    states = random_states if isinstance(random_states, list) else [random_states]

    selected = df.loc[(df['position'] == position) & df['randomState'].isin(states)]
    mean_f1 = selected.groupby(['randomState', 'timeUsed'])['f1-score'].mean()
    best_per_state = mean_f1.groupby(level='randomState').max()

    return best_per_state.to_dict()

### Alleno Modello

In [None]:
def train_model(X_train, X_test, y_train, random_state):
    """Fit an XGBoost classifier on the training data and predict the test data.

    Args:
        X_train: training features.
        X_test: features to predict.
        y_train: training labels.
        random_state: seed passed to the classifier.

    Returns:
        The predictions for ``X_test``.
    """
    params = {
        'n_estimators': 150,
        'random_state': random_state,
        'n_jobs': -1,
    }
    classifier = XGBClassifier(**params)
    classifier.fit(X_train, y_train)
    return classifier.predict(X_test)

### Divido i dati

In [None]:
def k_fold_cross_validation(position, df_data, weight_list=None, varianza = False, lista_percentuali = None):
    """Run a leave-one-user-out evaluation and append the metrics to ``df_f1_score``.

    For every random state, weight and percentage, each user is held out in
    turn: the split comes from ``get_train_test_data``, a model is trained
    with ``train_model`` and one result row per training label is appended to
    the global ``df_f1_score`` (label name, seconds of moved data, percentage,
    weight, training time, random state, position and, when the label appears
    in the classification report, f1-score / precision / recall).

    Once the baseline is available (``baseCalcolata`` is true), the percentage
    loop for the current weight stops as soon as the best mean F1 for that
    weight and percentage reaches the baseline maximum stored in
    ``f1_s_max_dict``.

    Args:
        position: list of position names; more than one means "all sensors".
        df_data: full dataset with 'Userid', 'position', 'label' and
            'Activity' columns.
        weight_list: weights to evaluate; defaults to ``[5, 10, 25]``.
        varianza: forwarded to ``get_train_test_data``.
        lista_percentuali: fractions of user data to move into training;
            defaults to ``[1]``.
    """
    global df_f1_score  # results accumulator, rebound below
    # f1_s_max_dict and baseCalcolata are module-level names that are only read.

    all_sensors = len(position) > 1
    position_name = 'all sensors' if all_sensors else position[0]

    # Explicit encoded-label -> activity-name map. Pairing y_train.unique()
    # with df_data['Activity'].unique() by position is only correct when both
    # happen to appear in the same order.
    label_names = df_data.drop_duplicates(subset='label').set_index('label')['Activity'].to_dict()

    if weight_list is None:
        weight_list = [5, 10, 25]
        # Full sweep: [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000, 1500, 2000, 2500, 5000, 7500, 10000]
    if lista_percentuali is None:
        # Full sweep: [i / 100 for i in range(0, 101, 5)]
        lista_percentuali = [1]

    # Loop invariants.
    df_positions = df_data[df_data['position'].isin(position)]
    users = df_data['Userid'].unique()

    for rand_state in RANDOM_STATE_LIST:
        for peso in weight_list:
            for percentuale_nuovo_train in lista_percentuali:
                percentuale_int = int(percentuale_nuovo_train * 100)
                print(",".join(position)+" stato "+str(rand_state)+" "+str(peso)+"w "+str(percentuale_int)+"%")

                # Rows of this configuration; concatenated once after the
                # user loop instead of once per row.
                righe = []
                for k in users:
                    X_train, X_test, y_train, y_test, num_dati_spostati = \
                        get_train_test_data(df_positions, user=k, random_state = rand_state, percentage=float(percentuale_nuovo_train), weight = float(peso), varianza = varianza)

                    start = time.perf_counter()
                    y_pred = train_model(X_train, X_test, y_train,random_state = rand_state)
                    end = time.perf_counter()
                    durata = end - start
                    class_report = classification_report(y_test, y_pred, output_dict=True,zero_division=0)

                    for label_idx in y_train.unique():
                        riga = {
                            'label': label_names.get(label_idx, label_idx),
                            'timeUsed': num_dati_spostati * ROW_TIME,
                            'percentage': percentuale_int,
                            'weight': float(peso),
                            'time': round(durata, 2),
                            'randomState': rand_state,
                            'position': position_name,
                        }

                        # The report key is the string form of the label,
                        # which may be an int or a float depending on dtype.
                        for key in (str(label_idx), str(float(label_idx)), str(int(label_idx))):
                            if key in class_report:
                                metriche = class_report[key]
                                riga['f1-score'] = metriche['f1-score']
                                riga['precision'] = metriche['precision']
                                riga['recall'] = metriche['recall']
                                break

                        righe.append(riga)

                if righe:
                    df_f1_score = pd.concat([df_f1_score, pd.DataFrame(righe)], ignore_index=True)

                # Early stopping needs the baseline maxima; skip it while the
                # baseline itself is being computed.
                if not baseCalcolata:
                    continue
                if all_sensors or position[0] in df_data['position'].unique():
                    df_appena_calcolato = df_f1_score[df_f1_score['weight'] == peso]
                    df_appena_calcolato = df_appena_calcolato[df_appena_calcolato['percentage'] == percentuale_int]

                    f1_s_max = f1_s_max_dict[position_name]

                    f1_s_mifermo = prendiMax(df_appena_calcolato, position_name, rand_state)
                    if f1_s_mifermo[rand_state] >= f1_s_max[rand_state]:
                        print(f"  stop a {percentuale_int}%({num_dati_spostati * ROW_TIME}) per peso {peso} "
                                f"in quanto {f1_s_mifermo[rand_state]} è maggiore del max a peso 1 ({f1_s_max[rand_state]})")
                        break

### Caso Base
Peso 1: fa da riferimento per i modelli veri e propri, calcolati con i vari pesi, che si fermano non appena raggiungono o superano il massimo ottenuto qui.

In [None]:
# Baseline (weight 1): load it from disk if already computed, otherwise compute it.
baseCalcolata = False

df_f1_score = pd.DataFrame()
f1_s_max_dict = {}

mypath = os.getcwd() + '/data/data_total/'
file_pattern = NOMI_FILE['baseline']+'.csv'
# Exact existence check. Using the file name as a regular expression treats
# '.' and '?' as metacharacters and also accepts names that merely start
# with it (e.g. "<name>.csv.bak").
esiste_base = isfile(join(mypath, file_pattern))

if esiste_base:
    df_f1_score = pd.read_csv(mypath + NOMI_FILE['baseline']+'.csv')
else:
    print("Position: all sensors")
    position = list(df_data['position'].unique())
    k_fold_cross_validation(position, df_data, weight_list=[1])

    for position in df_data['position'].unique():
        print("Position: ", position)
        k_fold_cross_validation([position], df_data, weight_list=[1])
    if SAVE:
        df_f1_score.to_csv(mypath + NOMI_FILE['baseline']+'.csv')

# Best mean F1 per random state, for every single position and for all sensors.
for position in df_data['position'].unique():
    f1_s_max_dict[position] = prendiMax(df_f1_score, position, RANDOM_STATE_LIST)
    print("f1-score ", position, ":", f1_s_max_dict[position])

f1_s_max_dict['all sensors'] = prendiMax(df_f1_score, 'all sensors', RANDOM_STATE_LIST)
print("f1-score all sensors:", f1_s_max_dict['all sensors'])


# Keep the baseline rows so later cells can append them to their own results.
pesoBaseData = df_f1_score.copy()

# From now on k_fold_cross_validation applies early stopping against the baseline.
baseCalcolata = True

### Calcolo modello base

In [None]:
# Base model: evaluate the default weights unless the result file already exists.
mypath = os.getcwd() + '/data/data_total/'
file_pattern = NOMI_FILE['modello_base']+'.csv'
# Exact existence check. Using the file name as a regular expression treats
# '.' and '?' as metacharacters and also accepts names that merely start
# with it.
esiste_base = isfile(join(mypath, file_pattern))

if not esiste_base:
    df_f1_score = pd.DataFrame()

    for position in df_data['position'].unique():
        print("Position: ", position)
        k_fold_cross_validation([position], df_data)

    print("Position: all sensors")
    position = list(df_data['position'].unique())
    k_fold_cross_validation(position, df_data)

    if SAVE:
        # Store the new results together with the weight-1 baseline rows.
        baseData = df_f1_score.copy()
        baseData = pd.concat([baseData, pesoBaseData]).reset_index(drop=True)
        baseData.to_csv(mypath + NOMI_FILE['modello_base']+'.csv')

### Calcolo modello con varianza
La varianza consiste nel moltiplicare le feature numeriche di ogni riga duplicata per un fattore casuale compreso tra 0.99 e 1.01.

In [None]:
# Variance model: same evaluation with jittered duplicates, unless the result
# file already exists.
mypath = os.getcwd() + '/data/data_total/'
file_pattern = NOMI_FILE['modello_varianza']+'.csv'
# Exact existence check. Using the file name as a regular expression treats
# '.' and '?' as metacharacters and also accepts names that merely start
# with it.
esiste_base = isfile(join(mypath, file_pattern))

if not esiste_base:

    df_f1_score = pd.DataFrame()

    for position in df_data['position'].unique():
        print("Position: ", position)
        k_fold_cross_validation([position], df_data, varianza = True)

    print("Position: all sensors")
    position = list(df_data['position'].unique())
    k_fold_cross_validation(position, df_data, varianza = True)

    if SAVE:
        # Store the new results together with the weight-1 baseline rows.
        varianzaData = df_f1_score.copy()
        varianzaData = pd.concat([varianzaData, pesoBaseData]).reset_index(drop=True)
        varianzaData.to_csv(mypath + NOMI_FILE['modello_varianza']+'.csv')