--- 

## <center> Project ML for Time Series 
### <center> Feature Selection: A Data Perspective (Part 2)
### <center> Classification Problem
<center>Work done by : 

##### <center> Ali HAIDAR: ali.haidar@polytechnique.edu
##### <center> Maya AWADA: maya.awada@ip-paris.fr 

---

#### Importing Libraries

In [None]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, accuracy_score, recall_score, 
                             precision_score, f1_score, confusion_matrix, mean_absolute_error, 
                             r2_score, mean_squared_error, mean_absolute_percentage_error)
from tabulate import tabulate
import features_selection
import evaluation

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the imported libraries.
warnings.filterwarnings("ignore")


#### Loading and Sampling Data

The MHealth dataset is a multivariate time series dataset for human behavior analysis based on multimodal body sensing. It is available from the UCI Machine Learning Repository and contains body motion and vital sign recordings for 10 volunteers of various profiles performing 12 different physical activities. The goal of this classification problem is to predict which activity the subject is performing, given the following features:

- ACX:	Acceleration From The Chest Sensor (X Axis)
- ACY:	Acceleration From The Chest Sensor (Y Axis)
- ACZ:	Acceleration From The Chest Sensor (Z Axis)
- ES1:	Electrocardiogram Signal (Lead 1)
- ES2:	Electrocardiogram Signal (Lead 2)
- ALX:	Acceleration From The Left-Ankle Sensor (X Axis)
- ALY:	Acceleration From The Left-Ankle Sensor (Y Axis)
- ALZ:	Acceleration From The Left-Ankle Sensor (Z Axis)
- GLX:	Gyroscope From The Left-Ankle Sensor (X Axis)
- GLY:	Gyroscope From The Left-Ankle Sensor (Y Axis)
- GLZ:	Gyroscope From The Left-Ankle Sensor (Z Axis)
- MLX:	Magnetometer From The Left-Ankle Sensor (X Axis)
- MLY:	Magnetometer From The Left-Ankle Sensor (Y Axis)
- MLZ:	Magnetometer From The Left-Ankle Sensor (Z Axis)
- ARX:	Acceleration From The Right-Lower-Arm Sensor (X Axis)
- ARY:	Acceleration From The Right-Lower-Arm Sensor (Y Axis)
- ARZ:	Acceleration From The Right-Lower-Arm Sensor (Z Axis)
- GRX:	Gyroscope From The Right-Lower-Arm Sensor (X Axis)
- GRY:	Gyroscope From The Right-Lower-Arm Sensor (Y Axis)
- GRZ:	Gyroscope From The Right-Lower-Arm Sensor (Z Axis)
- MRX:	Magnetometer From The Right-Lower-Arm Sensor (X Axis)
- MRY:	Magnetometer From The Right-Lower-Arm Sensor (Y Axis)
- MRZ:	Magnetometer From The Right-Lower-Arm Sensor (Z Axis)

In [None]:
# Load the ten per-subject logs and stack a sample of each into a single frame.
subject_frames = []

for i in range(1, 11):
    data = pd.read_csv(f"classification_data/mHealth_subject{i}.log", sep="\t", header=None, encoding="utf-8")
    # Sampling strategy: for each participant, randomly keep at most 50 rows of
    # each physical activity (the activity label is the last column of the log).
    data = data.groupby(data.iloc[:, -1], group_keys=False).apply(lambda x: x.sample(min(len(x), 50)))
    # A scalar is broadcast to every row, tagging the sample with its subject id.
    data = data.assign(subject=i)
    subject_frames.append(data)

# DataFrame.append was removed in pandas 2.0; a single concat is the supported
# replacement and avoids re-copying the accumulated frame on every iteration.
# ignore_index=True gives every row a unique label, since the per-file row
# numbers would otherwise collide across subjects.
df = pd.concat(subject_frames, ignore_index=True)

df.columns = ['acx', 'acy', 'acz', 'es1', 'es2', 'alx', 'aly', 'alz', 'glx', 'gly', 'glz', 'mlx', 'mly', 'mlz', 'arx', 'ary', 'arz', 'grx',
       'gry', 'grz', 'mrx', 'mry', 'mrz', 'Activity', 'Subject']


In [None]:
# Display the assembled dataframe
df

In [None]:
# True if at least one cell of the dataframe is missing
df.isna().to_numpy().any()

In [None]:
# Bar chart of the number of rows available for each activity code.
activity_counts = df['Activity'].value_counts()

activity_fig, activity_ax = plt.subplots(figsize=(5, 3))
activity_ax.set_title('Number of Samples per Activity')
activity_ax.set_ylim(0, 600)
activity_ax.set_xlabel("Activity")
activity_ax.set_ylabel("Count")
# Draw on the axes configured above, after the labels and limits are set.
activity_counts.plot.bar(rot=0, ax=activity_ax)
plt.show()

In [None]:
# Human-readable names for the activity codes; the position in the list is the code.
activity_names = [
    "None",
    "Standing still",
    "Sitting and relaxing",
    "Lying down",
    "Walking",
    "Climbing stairs",
    "Waist bends forward",
    "Frontal elevation of arms",
    "Knees bending",
    "Cycling",
    "Jogging",
    "Running",
    "Jump front & back",
]
activities_label_dict = dict(enumerate(activity_names))

# Swap the numeric codes in the Activity column for their names.
df = df.replace(to_replace={"Activity": activities_label_dict})

In [None]:
# Pie chart of each activity's share of the rows, in percent.
plt.figure(figsize=(8, 5))
activity_share = (df["Activity"].value_counts() / df.shape[0] * 100).round(2)
activity_share.plot.pie(autopct='%2.1f%%', label='')

In [None]:
# Descriptive statistics of the dataframe
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage
df.info()

In [None]:
# Drop the subject id, then encode the activity names as integer class ids.
df = df.drop(columns=['Subject'])
le = LabelEncoder().fit(df['Activity'])
df['Activity'] = le.transform(df['Activity'])

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_fig, corr_ax = plt.subplots(figsize=(20, 15))
correlations = df.corr()
sns.heatmap(correlations, annot=True, ax=corr_ax)

#### Train / Test Splitting

In [None]:
# Separate the target column from the predictors.
y = df['Activity']
x = df.drop(columns=['Activity'])

In [None]:
# Split first, then scale: the scaler must only be fit on training rows,
# otherwise statistics of the held-out rows leak into the training features.
# Splitting row positions (instead of the frames) lets the same partition be
# applied to both x and y after scaling.
row_positions = np.arange(len(x))
train_rows, test_rows = train_test_split(row_positions, test_size=0.3, stratify=y)

# Scaling
ro_scaler = RobustScaler().fit(x.iloc[train_rows])
# Column names are taken from x itself rather than from a second hard-coded list.
x_scaled = pd.DataFrame(ro_scaler.transform(x), columns=x.columns)

# Creating train and test sets
x_train, x_test = x_scaled.iloc[train_rows], x_scaled.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

In [None]:
# Report the dimensions of each partition; the extra '\n' field leaves a blank
# line between the train and test figures.
shape_lines = [
    ('X Train Shape:', x_train.shape),
    ('y Train Shape:', y_train.shape, '\n'),
    ('X Test Shape:', x_test.shape),
    ('y Test Shape:', y_test.shape),
]
for fields in shape_lines:
    print(*fields)

In [None]:
# Classifiers compared throughout the notebook.
models = [LogisticRegression(), KNeighborsClassifier(n_neighbors=5)]
# Per-model record of the score and the features kept by each selection method.
benchmark = dict()

### Without Feature Selection

In [None]:
# Baseline: evaluate every model on all columns of x_train.
all_features = x_train.columns
for model in models:
    evaluation_result = evaluation.evaluate_model(x_train, y_train, model, stratify = y_train ,  test_size=0.33, number_of_states = 20, acc =  evaluation.eval_metrics)
    # Element 1 of the returned value is what gets recorded as the test score.
    baseline_entry = {'test_score': evaluation_result[1], 'features': all_features}
    benchmark[type(model).__name__] = {'w/o FS': baseline_entry}

## Feature Selection

### Greedy epsilon sequential feature selection

In [None]:
# Run the greedy selection once for all models; results are indexed per model.
best_score, best_features = features_selection.greedy_features_selection(x_train, y_train, models, stratify = np.array(y_train),  number_of_states = 20, metric = evaluation.eval_metrics)

# Record and print the outcome for each model under the 'gsfs' key.
for idx, model in enumerate(models):
    model_name = type(model).__name__
    benchmark[model_name]['gsfs'] = {'test_score': best_score[idx], 'features': best_features[idx]}
    print("The best_score of " + model_name + " = ", best_score[idx])
    print("The best_features of " + model_name + " = ", best_features[idx])


### Feature Selection Using Laplace Score

In [None]:
# Run the Laplace-score selection once for all models; results are indexed per model.
best_score, best_features = features_selection.laplace_features_selection(x_train, y_train, x_test, models, y_train,  number_of_states = 20, metric = evaluation.eval_metrics)

# Record and print the outcome for each model under the 'Laplace' key.
for idx, model in enumerate(models):
    model_name = type(model).__name__
    benchmark[model_name]['Laplace'] = {'test_score': best_score[idx], 'features': best_features[idx]}
    print("The best_score of " + model_name + " = ", best_score[idx])
    print("The best_features of " + model_name + " = ", best_features[idx])


##### Feature Selection Using Fisher Score

In [None]:
# Run the Fisher-score selection once for all models; results are indexed per model.
best_score, best_features = features_selection.fisher_feature_selection(x_train, y_train, x_test, models, y_train,  number_of_states = 20, metric = evaluation.eval_metrics)

# Record and print the outcome for each model under the 'Fisher' key.
for idx, model in enumerate(models):
    model_name = type(model).__name__
    benchmark[model_name]['Fisher'] = {'test_score': best_score[idx], 'features': best_features[idx]}
    print("The best_score of " + model_name + " = ", best_score[idx])
    print("The best_features of " + model_name + " = ", best_features[idx])

##### Feature Selection Using MRMR

In [None]:
# Run the MRMR selection once for all models; results are indexed per model.
best_score, best_features = features_selection.MRMR_features_selection(x_train, y_train, x_test, models, y_train,  number_of_states = 20, metric = evaluation.eval_metrics)

# Record and print the outcome for each model under the 'MRMR' key.
for idx, model in enumerate(models):
    model_name = type(model).__name__
    benchmark[model_name]['MRMR'] = {'test_score': best_score[idx], 'features': best_features[idx]}
    print("The best_score of " + model_name + " = ", best_score[idx])
    print("The best_features of " + model_name + " = ", best_features[idx])

##### Feature Selection Using CIFE

In [None]:
# Run the CIFE selection once for all models; results are indexed per model.
best_score, best_features = features_selection.CIFE_features_selection(x_train, y_train, x_test, models, y_train,  number_of_states = 20, metric = evaluation.eval_metrics)

# Record and print the outcome for each model under the 'CIFE' key.
for idx, model in enumerate(models):
    model_name = type(model).__name__
    benchmark[model_name]['CIFE'] = {'test_score': best_score[idx], 'features': best_features[idx]}
    print("The best_score of " + model_name + " = ", best_score[idx])
    print("The best_features of " + model_name + " = ", best_features[idx])



### Results

In [None]:
# One row per model, one column per selection method; a method that was not
# run for a model is reported as NaN.
cols = ['w/o FS', 'Laplace', 'MRMR', 'Fisher', 'CIFE', 'gsfs']
results = pd.DataFrame(columns=['model', *cols])
for model_name, per_method in benchmark.items():
    row = [model_name]
    for method in cols:
        # Only the first element of the stored test score goes into the table.
        row.append(per_method[method]['test_score'][0] if method in per_method else np.nan)
    results.loc[len(results)] = row

In [29]:
# Render the results frame as a boxed text table.
results_table = tabulate(results, headers='keys', tablefmt='fancy_grid')
print(results_table)

╒════╤══════════════════════╤══════════╤═══════════╤══════════╤══════════╤══════════╤══════════╕
│    │ model                │   w/o FS │   Laplace │     MRMR │   Fisher │     CIFE │     gsfs │
╞════╪══════════════════════╪══════════╪═══════════╪══════════╪══════════╪══════════╪══════════╡
│  0 │ LogisticRegression   │ 0.608589 │  0.60982  │ 0.608589 │ 0.609088 │ 0.608589 │ 0.599414 │
├────┼──────────────────────┼──────────┼───────────┼──────────┼──────────┼──────────┼──────────┤
│  1 │ KNeighborsClassifier │ 0.723469 │  0.746205 │ 0.789148 │ 0.814647 │ 0.723469 │ 0.812088 │
╘════╧══════════════════════╧══════════╧═══════════╧══════════╧══════════╧══════════╧══════════╛


In [None]:
# For every model and selection method, the percentage of x_train's columns
# that the method kept.
freq = {}
n_columns = x_train.shape[1]
for model_name, per_method in benchmark.items():
    for method, entry in per_method.items():
        kept_share = (len(entry['features']) / n_columns) * 100
        model_freq = freq.setdefault(model_name, {})
        model_freq[method] = model_freq.get(method, 0) + kept_share


In [28]:
# Models as rows, selection methods as columns.
freq_data = pd.DataFrame(freq).transpose()
freq_data

Unnamed: 0,w/o FS,gsfs,Laplace,Fisher,MRMR,CIFE
LogisticRegression,100.0,60.869565,95.652174,95.652174,100.0,100.0
KNeighborsClassifier,100.0,60.869565,91.304348,60.869565,26.086957,100.0
