In [1]:
import pandas as pd
from sktime.datasets import load_from_arff_to_dataframe, load_from_tsfile_to_dataframe
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectPercentile
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import numpy as np
from scipy.stats import skew, kurtosis
from scipy.signal import find_peaks


# Loading ARFF files
# Every RacketSports dimension is stored in its own TRAIN/TEST pair of ARFF
# files. Each load result is kept whole under the key 'dimension<N>'; the
# code below reads element [0] as the series frame and element [1] as the labels.
dimensions = list(range(1, 7))
train_arff_data = {}
test_arff_data = {}

for dimension in dimensions:
    train_arff_data[f'dimension{dimension}'] = load_from_arff_to_dataframe(f'RacketSports/RacketSportsDimension{dimension}_TRAIN.arff')
    test_arff_data[f'dimension{dimension}'] = load_from_arff_to_dataframe(f'RacketSports/RacketSportsDimension{dimension}_TEST.arff')

# Preprocessing
# train_data / test_data are built further down from per-dimension summary
# features. The raw frames are deliberately not concatenated here: that result
# was overwritten before anything read it.

def extract_features(data):
    """Summarise every cell of ``data`` by its mean and standard deviation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose cells each hold a sequence of numbers (anything that
        ``np.mean`` and ``np.std`` accept).

    Returns
    -------
    pandas.DataFrame
        The per-cell means followed by the per-cell standard deviations,
        placed side by side. Both halves keep the column labels of ``data``,
        so every label of ``data`` appears twice in the result.
    """
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Use the new name when the installed pandas provides it
    # and fall back to applymap on older releases.
    elementwise = data.map if hasattr(data, "map") else data.applymap
    means = elementwise(np.mean)
    stds = elementwise(np.std)
    return pd.concat([means, stds], axis=1)

# Feature matrices: summarise each dimension on its own, then put the
# per-dimension summaries next to each other (one row per instance).
# NOTE(review): extract_features reuses the source column labels for both the
# mean and the std half, so labels repeat in these frames - presumably across
# dimensions as well; confirm before selecting columns by name.
train_data = pd.concat(
    [extract_features(train_arff_data[f'dimension{dim}'][0]) for dim in dimensions],
    axis='columns',
)
test_data = pd.concat(
    [extract_features(test_arff_data[f'dimension{dim}'][0]) for dim in dimensions],
    axis='columns',
)

# Class labels are read from the first dimension's load result only.
train_labels = train_arff_data['dimension1'][1]
test_labels = test_arff_data['dimension1'][1]

# Encoding labels: the encoder learns the classes from the training labels and
# the test labels are mapped with that same fitted encoder.
le = LabelEncoder()
y_train = le.fit_transform(train_labels)
y_test = le.transform(test_labels)

# Standardize the data: statistics come from the training matrix only.
scaler = StandardScaler().fit(train_data)
X_train_scaled = scaler.transform(train_data)
X_test_scaled = scaler.transform(test_data)

# Feature selection: VarianceThreshold with its default threshold, fitted on
# the scaled training matrix and then applied to both splits.
selector = VarianceThreshold().fit(X_train_scaled)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Hyperparameter tuning
# One fixed seed for the stochastic learners, so that re-running this cell
# selects the same hyperparameters and prints the same scores. Results
# recorded from earlier, unseeded runs may differ slightly from a seeded run.
RANDOM_STATE = 42

random_forest_params = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'max_features': ['sqrt', 'log2']
}
svm_params = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10, 100]
}
gradient_boosted_params = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [3, 6, 10, 20],
    'learning_rate': [0.01, 0.1, 0.3]
}

# Exhaustive grid search with 5-fold cross-validation, scored by accuracy.
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=RANDOM_STATE), random_forest_params, scoring='accuracy', cv=5)
grid_search_svm = GridSearchCV(SVC(), svm_params, scoring='accuracy', cv=5)
grid_search_gb = GridSearchCV(XGBClassifier(random_state=RANDOM_STATE), gradient_boosted_params, scoring='accuracy', cv=5)

# Train and evaluate algorithms
models = {
    'RandomForest': grid_search_rf,
    'SVM': grid_search_svm,
    'GradientBoosted': grid_search_gb
}

for model_name, model in models.items():
    # Fit the whole grid on the training split, then predict the held-out
    # test split with the fitted search object.
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_test_selected)
    print(f"Model: {model_name}")
    print("Best Hyperparameters:", model.best_params_)
    # le.classes_ restores the original label names for the encoded classes.
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\n")

Model: RandomForest
Best Hyperparameters: {'max_depth': 10, 'max_features': 'log2', 'n_estimators': 50}
                      precision    recall  f1-score   support

     Badminton_Clear       0.60      0.65      0.62        43
     Badminton_Smash       0.63      0.55      0.59        40
Squash_BackhandBoast       0.90      0.82      0.86        34
Squash_ForehandBoast       0.77      0.86      0.81        35

            accuracy                           0.71       152
           macro avg       0.72      0.72      0.72       152
        weighted avg       0.71      0.71      0.71       152

Confusion Matrix:
 [[28 13  0  2]
 [17 22  0  1]
 [ 0  0 28  6]
 [ 2  0  3 30]]


Model: SVM
Best Hyperparameters: {'C': 1, 'kernel': 'rbf'}
                      precision    recall  f1-score   support

     Badminton_Clear       0.64      0.79      0.71        43
     Badminton_Smash       0.72      0.57      0.64        40
Squash_BackhandBoast       0.91      0.91      0.91        34
Squash_

In [41]:



def extract_all_features(series):
    """Compute 16 summary features for one univariate time series.

    Parameters
    ----------
    series : pandas.Series or array-like
        Either the observations themselves (one-dimensional), or a pandas
        Series whose first element is itself a sequence of observations
        (for example one row of a nested single-column frame). In the nested
        case only that first element is used.

    Returns
    -------
    list
        In order: mean, standard deviation, mean absolute deviation, minimum,
        maximum, range, median, median absolute deviation, interquartile
        range, count of negative values, count of positive values, count of
        values above the mean, number of peaks (``scipy.signal.find_peaks``
        with default arguments), mean of the squared values, skewness and
        kurtosis (``scipy.stats`` defaults).

    Raises
    ------
    ValueError
        If the observations are empty or not one-dimensional.
    """
    # Unwrap a nested row only when its first cell really is a sequence. A flat
    # numeric Series has to stay whole: indexing it would leave a single
    # scalar and every feature below would be computed on that one value.
    if isinstance(series, pd.Series) and len(series) > 0:
        first_cell = series.array[0]
        if np.ndim(first_cell) > 0:
            series = first_cell

    # Lists and tuples do not support the arithmetic used below. Series are
    # left as they are so their reductions behave exactly as they did before.
    if not isinstance(series, pd.Series):
        series = np.asarray(series)

    if series.ndim != 1 or len(series) == 0:
        raise ValueError(
            "extract_all_features expects a non-empty one-dimensional "
            "sequence of observations"
        )

    # Statistics that several features share are computed once.
    mean_value = np.mean(series)
    min_value = np.min(series)
    max_value = np.max(series)
    median_value = np.median(series)
    upper_quartile, lower_quartile = np.percentile(series, [75, 25])

    peaks, _ = find_peaks(series)

    # The order of this list is the contract with the caller's column names.
    return [
        mean_value,
        np.std(series),
        np.mean(np.abs(series - mean_value)),
        min_value,
        max_value,
        max_value - min_value,
        median_value,
        np.median(np.abs(series - median_value)),
        upper_quartile - lower_quartile,
        np.sum(series < 0),
        np.sum(series > 0),
        np.sum(series > mean_value),
        len(peaks),
        np.mean(np.square(series)),
        skew(series),
        kurtosis(series),
    ]

# Column labels for the feature tables, listed in the same order in which
# extract_all_features returns its values.
column_names = [
    "mean", "std_dev", "mean_abs_dev", "min_value",
    "max_value", "range", "median", "median_abs_dev",
    "interquartile_range", "num_neg_values", "num_pos_values", "num_values_above_mean",
    "num_peaks", "signal_energy", "skewness", "kurtosis",
]

# Per-dimension results: 'dimension<N>' -> (feature DataFrame, labels)
train_features_data = {}
test_features_data = {}

for dimension in dimensions:
    dimension_key = f'dimension{dimension}'

    # Note: train_labels / test_labels are rebound on every pass of this loop.
    train_series, train_labels = train_arff_data[dimension_key]
    test_series, test_labels = test_arff_data[dimension_key]

    # One feature vector per row of the loaded frame.
    train_features = [
        extract_all_features(train_series.iloc[row]) for row in range(len(train_series))
    ]
    test_features = [
        extract_all_features(test_series.iloc[row]) for row in range(len(test_series))
    ]

    train_frame = pd.DataFrame(train_features, columns=column_names)
    test_frame = pd.DataFrame(test_features, columns=column_names)
    train_features_data[dimension_key] = (train_frame, train_labels)
    test_features_data[dimension_key] = (test_frame, test_labels)

    # Preview the first rows of both tables for this dimension.
    print(f"Dimension {dimension} Train Features:")
    print(train_frame.head())
    print("\n")

    print(f"Dimension {dimension} Test Features:")
    print(test_frame.head())
    print("\n")

Dimension 1 Train Features:
       mean    std_dev  mean_abs_dev  min_value  max_value      range  \
0 -2.633486   5.774056      3.726880 -22.661740   2.184308  24.846048   
1  2.358459  10.098880      8.227055 -13.186833  27.733688  40.920521   
2  4.966444   6.718684      5.048212  -5.944770  22.322626  28.267396   
3 -4.189178   8.585126      6.155735 -27.778690   5.507538  33.286228   
4  2.990686   8.187165      6.161131 -13.361450  26.323359  39.684809   

     median  median_abs_dev  interquartile_range  num_neg_values  \
0 -0.559298        1.404561             2.391752              20   
1  0.401548        7.256374            14.341795              13   
2  3.597653        2.859003             6.305525               4   
3 -0.902306        2.073702             3.848803              20   
4  0.683699        3.422732             7.262485              13   

   num_pos_values  num_values_above_mean  num_peaks  signal_energy  skewness  \
0              10                     24    