In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
sns.set()

In [96]:
#import data

data = pd.read_csv('processed_data/fe_data.csv')
data_leg = pd.read_csv('processed_data/fe_legsensor_data.csv')
data_arm = pd.read_csv('processed_data/fe_armsensor_data.csv')

In [97]:
#drop 'index col'
data = data.iloc[:,1:]
data_leg = data_leg.iloc[:,1:]
data_arm = data_arm.iloc[:,1:]

# 2. Sex Differentiation - Machine Learning Approach

In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5767 entries, 0 to 5766
Data columns (total 64 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ax_mean       5767 non-null   float64
 1   ay_mean       5767 non-null   float64
 2   az_mean       5767 non-null   float64
 3   aT_mean       5767 non-null   float64
 4   ax_ad_mean    5767 non-null   float64
 5   ay_ad_mean    5767 non-null   float64
 6   az_ad_mean    5767 non-null   float64
 7   aT_ad_mean    5767 non-null   float64
 8   ax_min        5767 non-null   float64
 9   ay_min        5767 non-null   float64
 10  az_min        5767 non-null   float64
 11  aT_min        5767 non-null   float64
 12  ax_max        5767 non-null   float64
 13  ay_max        5767 non-null   float64
 14  az_max        5767 non-null   float64
 15  aT_max        5767 non-null   float64
 16  ax_std        5767 non-null   float64
 17  ay_std        5767 non-null   float64
 18  az_std        5767 non-null 

In [99]:
#Whole data
sd_x_data = data.drop('sex', axis=1)
sd_y_data =  data['sex']

#Leg data
sd_x_data_leg = data_leg.drop('sex', axis=1)
sd_y_data_leg =  data_leg['sex']

#arm data
sd_x_data_arm = data_arm.drop('sex', axis=1)
sd_y_data_arm =  data_arm['sex']

In [100]:
#one hot encoding for 'activity' data
activity_encoder = OneHotEncoder()
activity_reshaped = np.array(sd_x_data['activity']).reshape(-1,1)
activity_data = activity_encoder.fit_transform(activity_reshaped)

activity_df = pd.DataFrame(activity_data.toarray(), columns=['jog', 'run', 'stand', 'walk'])
activity_df

Unnamed: 0,jog,run,stand,walk
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
5762,0.0,1.0,0.0,0.0
5763,0.0,1.0,0.0,0.0
5764,0.0,1.0,0.0,0.0
5765,0.0,1.0,0.0,0.0


In [101]:
#one hot encoding-> leg sensor data
activity_reshaped_leg = np.array(sd_x_data_leg['activity']).reshape(-1,1)
activity_data_leg = activity_encoder.fit_transform(activity_reshaped_leg)
activity_df_leg = pd.DataFrame(activity_data_leg.toarray(), columns=['jog', 'run', 'stand', 'walk'])
activity_df_leg

Unnamed: 0,jog,run,stand,walk
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
2851,0.0,1.0,0.0,0.0
2852,0.0,1.0,0.0,0.0
2853,0.0,1.0,0.0,0.0
2854,0.0,1.0,0.0,0.0


In [102]:
#one hot encoding-> arm sensor data
activity_reshaped_arm = np.array(sd_x_data_arm['activity']).reshape(-1,1)
activity_data_arm = activity_encoder.fit_transform(activity_reshaped_arm)
activity_df_arm = pd.DataFrame(activity_data_arm.toarray(), columns=['jog', 'run', 'stand', 'walk'])
activity_df_arm

Unnamed: 0,jog,run,stand,walk
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
2906,0.0,1.0,0.0,0.0
2907,0.0,1.0,0.0,0.0
2908,0.0,1.0,0.0,0.0
2909,0.0,1.0,0.0,0.0


In [103]:
sd_x_data = sd_x_data.drop('activity', axis=1)
sd_x_data = pd.concat([sd_x_data,activity_df], axis=1)

sd_x_data_leg = sd_x_data_leg.drop('activity', axis=1)
sd_x_data_leg = pd.concat([sd_x_data_leg,activity_df_leg], axis=1)

sd_x_data_arm = sd_x_data_arm.drop('activity', axis=1)
sd_x_data_arm = pd.concat([sd_x_data_arm,activity_df_arm], axis=1)

In [104]:
pos_encoder = LabelEncoder()
pos_encoder.fit(sd_x_data['sensor_pos'])

LabelEncoder()

In [105]:
#sensor encoding
sd_x_data['sensor_pos'] = pos_encoder.transform(sd_x_data['sensor_pos'])

sd_x_data_leg['sensor_pos'] = pos_encoder.transform(sd_x_data_leg['sensor_pos'])

sd_x_data_arm['sensor_pos'] = pos_encoder.transform(sd_x_data_arm['sensor_pos'])

In [106]:
sd_x_data = sd_x_data.drop('id', axis=1)
sd_x_data_leg = sd_x_data_leg.drop('id', axis=1)
sd_x_data_arm = sd_x_data_arm.drop('id', axis=1)

In [107]:
X_train, X_val, y_train, y_val = train_test_split(sd_x_data, sd_y_data, test_size= 0.2, random_state= 24)
X_train_leg, X_val_leg, y_train_leg, y_val_leg = train_test_split(sd_x_data_leg, sd_y_data_leg, test_size= 0.2, random_state= 24)
X_train_arm, X_val_arm, y_train_arm, y_val_arm = train_test_split(sd_x_data_arm, sd_y_data_arm, test_size= 0.2, random_state= 24)

In [108]:
#standardizing into scaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_sd = scaler.transform(X_train)
X_val_sd = scaler.transform(X_val)

X_train_leg_sd = scaler.transform(X_train_leg)
X_val_leg_sd = scaler.transform(X_val_leg)

X_train_arm_sd = scaler.transform(X_train_arm)
X_val_arm_sd = scaler.transform(X_val_arm)

# Logistic Regression

In [109]:
#Logistical Regression

lr_model = LogisticRegression(random_state=21)
lr_model.fit(X_train_sd,y_train)

lr_model_leg = LogisticRegression(random_state=21)
lr_model_leg.fit(X_train_leg_sd,y_train_leg)

lr_model_arm = LogisticRegression(random_state=21)
lr_model_arm.fit(X_train_arm_sd,y_train_arm)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression(random_state=21)

 - LR on whole FE dataset

In [110]:
y_pred = lr_model.predict(X_val_sd)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           F       0.68      0.68      0.68       572
           M       0.68      0.68      0.68       582

    accuracy                           0.68      1154
   macro avg       0.68      0.68      0.68      1154
weighted avg       0.68      0.68      0.68      1154



 - LR on Leg Sensor FE dataset

In [111]:
#leg
y_pred_leg = lr_model_leg.predict(X_val_leg_sd)
print(classification_report(y_val_leg, y_pred_leg))

              precision    recall  f1-score   support

           F       0.81      0.83      0.82       282
           M       0.83      0.81      0.82       290

    accuracy                           0.82       572
   macro avg       0.82      0.82      0.82       572
weighted avg       0.82      0.82      0.82       572



 - LR on Arm Sensor FE dataset

In [112]:
#arm
y_pred_arm = lr_model_arm.predict(X_val_arm_sd)
print(classification_report(y_val_arm, y_pred_arm))

              precision    recall  f1-score   support

           F       0.74      0.72      0.73       288
           M       0.73      0.76      0.75       295

    accuracy                           0.74       583
   macro avg       0.74      0.74      0.74       583
weighted avg       0.74      0.74      0.74       583



# Random Forests

In [113]:
#Random Forests

clf_model = RandomForestClassifier()
clf_model.fit(X_train_sd,y_train)

clf_model_leg = RandomForestClassifier()
clf_model_leg.fit(X_train_leg_sd,y_train_leg)

clf_model_arm = RandomForestClassifier()
clf_model_arm.fit(X_train_arm_sd,y_train_arm)

RandomForestClassifier()

 - LR on whole FE dataset

In [114]:
y_pred = clf_model.predict(X_val_sd)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           F       0.93      0.93      0.93       572
           M       0.93      0.93      0.93       582

    accuracy                           0.93      1154
   macro avg       0.93      0.93      0.93      1154
weighted avg       0.93      0.93      0.93      1154



 - LR on Leg Sensor FE dataset

In [115]:
#leg
y_pred_leg = clf_model_leg.predict(X_val_leg_sd)
print(classification_report(y_val_leg, y_pred_leg))

              precision    recall  f1-score   support

           F       0.95      0.97      0.96       282
           M       0.97      0.95      0.96       290

    accuracy                           0.96       572
   macro avg       0.96      0.96      0.96       572
weighted avg       0.96      0.96      0.96       572



 - LR on Arm Sensor FE dataset

In [116]:
#arm
y_pred_arm = clf_model_arm.predict(X_val_arm_sd)
print(classification_report(y_val_arm, y_pred_arm))

              precision    recall  f1-score   support

           F       0.92      0.94      0.93       288
           M       0.94      0.92      0.93       295

    accuracy                           0.93       583
   macro avg       0.93      0.93      0.93       583
weighted avg       0.93      0.93      0.93       583

