# **Support Vector Machine**

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.under_sampling import RandomUnderSampler
from sklearn import svm, datasets
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

### **Preprocessing**

In [2]:
# Read the combined dataset
combined_df = pd.read_csv("combined_data_with_id_ordered.csv")

# Check for missing values
missing_values = combined_df.isnull().sum()
print("Missing Values:\n", missing_values)


# Convert 'timestamp' column to datetime format
combined_df['timestamp'] = pd.to_datetime(combined_df['timestamp'])

# Perform feature scaling
columns_to_scale = ['back_x', 'back_y', 'back_z', 'thigh_x', 'thigh_y', 'thigh_z']
scaler = StandardScaler()
combined_df[columns_to_scale] = scaler.fit_transform(combined_df[columns_to_scale])


# Display the first few rows of the modified DataFrame
print("\nModified DataFrame:")
print(combined_df.head())

# Function to detect and remove extreme outliers using the IQR method with a higher multiplier
def remove_extreme_outliers_iqr(df, columns, multiplier=3.0):
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - multiplier * IQR
        upper_bound = Q3 + multiplier * IQR
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

# Apply the outlier removal function to the relevant columns with a higher multiplier
columns_to_check = columns_to_scale  # The same columns that were scaled
original_shape = combined_df.shape
cleaned_df = remove_extreme_outliers_iqr(combined_df, columns_to_check, multiplier=3.0)
cleaned_shape = cleaned_df.shape

# Display the first few rows of the cleaned DataFrame
print("\nCleaned DataFrame:")
print(cleaned_df.head())

# Display the number of rows before and after removing outliers
print("\nOriginal number of rows:", original_shape[0])
print("Number of rows after removing outliers:", cleaned_shape[0])





FileNotFoundError: [Errno 2] No such file or directory: 'combined_data_with_id_ordered.csv'

## **SVM with a sample**

In [4]:
num_participants = 22
sample_size =30000

# Calculate the sample size per participant
sample_size_per_participant = sample_size // num_participants

# Perform stratified sampling
sampled_df = cleaned_df.groupby('id', group_keys=False).apply(lambda x: x.sample(min(len(x), sample_size_per_participant)))

# If the total sampled rows are less than the desired sample size, sample the remaining randomly
if len(sampled_df) < sample_size:
    remaining_sample_size = sample_size - len(sampled_df)
    remaining_sample = cleaned_df.drop(sampled_df.index).sample(remaining_sample_size)
    sampled_df = pd.concat([sampled_df, remaining_sample])

# Display the number of rows in the sampled DataFrame
print("\nNumber of rows in the sampled DataFrame:", len(sampled_df))

# Display the number of each class in the sampled DataFrame
if 'label' in sampled_df.columns:
    class_counts = sampled_df['label'].value_counts()
    print("\nNumber of instances in each class after sampling:")
    print(class_counts)
else:
    print("\n'Class' column not found in the DataFrame.")


Number of rows in the sampled DataFrame: 300000

Number of instances in each class after sampling:
label
7      126294
1       73017
6       35175
13      27872
3       11995
2       11090
4        3967
14       3488
5        3270
130      2828
140       601
8         403
Name: count, dtype: int64


In [5]:
# Select features and target
features = ['back_x', 'back_y', 'back_z', 'thigh_x', 'thigh_y', 'thigh_z']
X = sampled_df[features]
y = sampled_df['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### **ovr (One-vs-Rest)**

In [6]:
rbf = svm.SVC(kernel='rbf', gamma=2, C=2, decision_function_shape = "ovr").fit(X_train, y_train)

In [7]:
rbf_pred = rbf.predict(X_test)
rbf_accuracy = accuracy_score(y_test, rbf_pred)

print('Accuracy (RBF Kernel) on ovr: ', "%.2f" % (rbf_accuracy*100))
print("F1 on ovr:", f1_score(y_test, rbf_pred, average='weighted'))
print(classification_report(y_test, rbf_pred))

# 0.5 / 0.1 Accuracy (RBF Kernel):  85.79
# 1 / 1 Accuracy (RBF Kernel):  87.23
# 0.5 / 1 Accuracy (RBF Kernel):  86.28
# 1.5 / 1.5 Accuracy (RBF Kernel):  87.65
# 2 / 2 Accuracy (RBF Kernel):  87.74
# 2.5 / 2.5 Accuracy (RBF Kernel):  87.69
# 2.5 / 2 Accuracy (RBF Kernel):  87.64
# 1.5 / 2 Accuracy (RBF Kernel):  87.68
# 2 / 1.5 Accuracy (RBF Kernel):  87.69
# 2.1 /2.1 87.74
# 2.2 / 2.2 87.67

Accuracy (RBF Kernel) on ovr:  86.42
F1 on ovr: 0.8526058733503068
              precision    recall  f1-score   support

           1       0.80      0.85      0.83     21986
           2       0.83      0.76      0.79      3273
           3       0.41      0.17      0.24      3637
           4       0.42      0.21      0.28      1213
           5       0.29      0.11      0.16      1006
           6       0.74      0.91      0.82     10396
           7       0.99      0.99      0.99     37907
           8       0.93      0.89      0.91       111
          13       0.83      0.86      0.84      8368
          14       0.65      0.50      0.57      1081
         130       0.57      0.53      0.55       839
         140       0.61      0.52      0.56       183

    accuracy                           0.86     90000
   macro avg       0.67      0.61      0.63     90000
weighted avg       0.85      0.86      0.85     90000



### **ovo (One-vs-One)**

In [8]:
rbf = svm.SVC(kernel='rbf', gamma=2, C=20, decision_function_shape = "ovo").fit(X_train, y_train)


In [9]:
rbf_pred = rbf.predict(X_test)
rbf_accuracy = accuracy_score(y_test, rbf_pred)

print('Accuracy (RBF Kernel) on ovo: ', "%.2f" % (rbf_accuracy*100))
print("F1 on ovo :", f1_score(y_test, rbf_pred, average='weighted'))
print(classification_report(y_test, rbf_pred))

Accuracy (RBF Kernel) on ovo:  86.42
F1 on ovo : 0.8526058733503068
              precision    recall  f1-score   support

           1       0.80      0.85      0.83     21986
           2       0.83      0.76      0.79      3273
           3       0.41      0.17      0.24      3637
           4       0.42      0.21      0.28      1213
           5       0.29      0.11      0.16      1006
           6       0.74      0.91      0.82     10396
           7       0.99      0.99      0.99     37907
           8       0.93      0.89      0.91       111
          13       0.83      0.86      0.84      8368
          14       0.65      0.50      0.57      1081
         130       0.57      0.53      0.55       839
         140       0.61      0.52      0.56       183

    accuracy                           0.86     90000
   macro avg       0.67      0.61      0.63     90000
weighted avg       0.85      0.86      0.85     90000



ALMOST SAME RESULT

----------------------------------------------------------------------------------

## **SVM on an undersampled data**

In [None]:
# Select features and target
features = ['back_x', 'back_y', 'back_z', 'thigh_x', 'thigh_y', 'thigh_z']
X = cleaned_df[features]
y = cleaned_df['label']

In [None]:
# Apply RandomUnderSampler to balance the dataset
under_sampler = RandomUnderSampler(random_state=42)
X_usampled, y_usampled = under_sampler.fit_resample(X, y)

class_counts = y_usampled.value_counts()
print(class_counts)


label
1      7808
2      7808
3      7808
4      7808
5      7808
6      7808
7      7808
8      7808
13     7808
14     7808
130    7808
140    7808
Name: count, dtype: int64


In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_usampled, y_usampled, test_size=0.3, random_state=42)

In [None]:
rbf = svm.SVC(kernel='rbf', gamma=5, C=20, decision_function_shape = "ovr").fit(X_train, y_train)


In [None]:
rbf_pred = rbf.predict(X_test)
rbf_accuracy = accuracy_score(y_test, rbf_pred)

print('Accuracy with undersampling (RBF Kernel): ', "%.2f" % (rbf_accuracy*100))
print("F1 Undersampling:", f1_score(y_test, rbf_pred, average='weighted'))
print(classification_report(y_test, rbf_pred))

# 2 /20 Accuracy with undersampling and sampling (RBF Kernel):  69.87 F1 : 0.6964611004613767 WITH good f1 class

Accuracy with undersampling (RBF Kernel):  68.10
F1 Undersampling: 0.6785805810812949
              precision    recall  f1-score   support

           1       0.45      0.41      0.43      2297
           2       0.56      0.80      0.66      2325
           3       0.50      0.48      0.49      2319
           4       0.54      0.51      0.53      2303
           5       0.52      0.45      0.48      2456
           6       0.72      0.79      0.75      2388
           7       0.97      0.98      0.98      2349
           8       0.98      0.96      0.97      2347
          13       0.65      0.61      0.63      2364
          14       0.73      0.65      0.69      2391
         130       0.76      0.72      0.74      2300
         140       0.79      0.80      0.79      2270

    accuracy                           0.68     28109
   macro avg       0.68      0.68      0.68     28109
weighted avg       0.68      0.68      0.68     28109

