In [1]:
# DESCRIPTION

# You are supposed to detect whether the person is running or walking based on the sensor data collected from 
# iOS device. The dataset contains a single file which represents sensor data samples collected from accelerometer 
# and gyroscope from iPhone 5c in 10 seconds interval and ~5.4/second frequency.

# Objective: Practice classification based on Naive Bayes algorithm. Identify the predictors that can be influential.

In [2]:
# 1.Load the kinematics dataset as measured on mobile sensors from the file “run_or_walk.csv.”
# 2.List the columns in the dataset.
# 3.Let the target variable “y” be the activity, and assign all the columns after it to “x.”
# 4.Using Scikit-learn, fit a Gaussian Naive Bayes model and observe the accuracy.
# 5.Generate a classification report using Scikit-learn.
# 6.Repeat the model once using only the acceleration values as predictors and then using only the gyro values as predictors.
# 7.Comment on the difference in accuracy between both models.

In [3]:
# IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# 1.Load the kinematics dataset as measured on mobile sensors from the file “run_or_walk.csv.”

# DATASET
df = pd.read_csv('run_or_walk.csv')

In [7]:
# DATA EXPLORATION
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88588 entries, 0 to 88587
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            88588 non-null  object 
 1   time            88588 non-null  object 
 2   username        88588 non-null  object 
 3   wrist           88588 non-null  int64  
 4   activity        88588 non-null  int64  
 5   acceleration_x  88588 non-null  float64
 6   acceleration_y  88588 non-null  float64
 7   acceleration_z  88588 non-null  float64
 8   gyro_x          88588 non-null  float64
 9   gyro_y          88588 non-null  float64
 10  gyro_z          88588 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 7.4+ MB


In [8]:
df.describe()

Unnamed: 0,wrist,activity,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z
count,88588.0,88588.0,88588.0,88588.0,88588.0,88588.0,88588.0,88588.0
mean,0.52217,0.500801,-0.074811,-0.562585,-0.313956,0.00416,0.037203,0.022327
std,0.499511,0.500002,1.009299,0.658458,0.486815,1.253423,1.198725,1.914423
min,0.0,0.0,-5.3505,-3.299,-3.7538,-4.4306,-7.4647,-9.48
25%,0.0,0.0,-0.3818,-1.0335,-0.376,-0.9207,-0.644825,-1.345125
50%,1.0,1.0,-0.0595,-0.7591,-0.221,0.0187,0.0393,0.0069
75%,1.0,1.0,0.3555,-0.241775,-0.0859,0.8888,0.7337,1.3982
max,1.0,1.0,5.6033,2.668,1.6403,4.8742,8.498,11.2662


In [57]:
df.head()

Unnamed: 0,date,time,username,wrist,activity,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z
0,2017-6-30,13:51:15:847724020,viktor,0,0,0.265,-0.7814,-0.0076,-0.059,0.0325,-2.9296
1,2017-6-30,13:51:16:246945023,viktor,0,0,0.6722,-1.1233,-0.2344,-0.1757,0.0208,0.1269
2,2017-6-30,13:51:16:446233987,viktor,0,0,0.4399,-1.4817,0.0722,-0.9105,0.1063,-2.4367
3,2017-6-30,13:51:16:646117985,viktor,0,0,0.3031,-0.8125,0.0888,0.1199,-0.4099,-2.9336
4,2017-6-30,13:51:16:846738994,viktor,0,0,0.4814,-0.9312,0.0359,0.0527,0.4379,2.4922


In [58]:
# 2.List the columns in the dataset.
df.columns

Index(['date', 'time', 'username', 'wrist', 'activity', 'acceleration_x',
       'acceleration_y', 'acceleration_z', 'gyro_x', 'gyro_y', 'gyro_z'],
      dtype='object')

In [21]:
# DATA WRANGLING
df.isna().any()

date              False
time              False
username          False
wrist             False
activity          False
acceleration_x    False
acceleration_y    False
acceleration_z    False
gyro_x            False
gyro_y            False
gyro_z            False
dtype: bool

In [50]:
# 3.Let the target variable “y” be the activity, and assign all the columns after it to “x.”
y = df['activity']
x = df.drop(['activity', 'date'], axis = 1)

x.head()

Unnamed: 0,time,username,wrist,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z
0,13:51:15:847724020,viktor,0,0.265,-0.7814,-0.0076,-0.059,0.0325,-2.9296
1,13:51:16:246945023,viktor,0,0.6722,-1.1233,-0.2344,-0.1757,0.0208,0.1269
2,13:51:16:446233987,viktor,0,0.4399,-1.4817,0.0722,-0.9105,0.1063,-2.4367
3,13:51:16:646117985,viktor,0,0.3031,-0.8125,0.0888,0.1199,-0.4099,-2.9336
4,13:51:16:846738994,viktor,0,0.4814,-0.9312,0.0359,0.0527,0.4379,2.4922


In [38]:
# LABEL ENCODING
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
uname = df['username']
lb.fit(uname)
label = lb.transform(uname)

In [51]:
x['username'] = label

In [41]:
x = x.drop('time', axis = 1)

In [42]:
# SPLIT THE DATA
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)

In [43]:
# INIT OUR MODEL

# 4.Using Scikit-learn, fit a Gaussian Naive Bayes model and observe the accuracy.
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(x_train, y_train)

GaussianNB()

In [45]:
# PREDICTION TIME
y_pred = model.predict(x_test)

In [61]:
# ACCURACY CHECK
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# 5.Generate a classification report using Scikit-learn.
print(classification_report(y_test, y_pred, target_names = ['Walk', 'Run']))

0.9547352974376341
[[8799   99]
 [ 703 8117]]
              precision    recall  f1-score   support

        Walk       0.93      0.99      0.96      8898
         Run       0.99      0.92      0.95      8820

    accuracy                           0.95     17718
   macro avg       0.96      0.95      0.95     17718
weighted avg       0.96      0.95      0.95     17718



In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88588 entries, 0 to 88587
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            88588 non-null  object 
 1   time            88588 non-null  object 
 2   username        88588 non-null  object 
 3   wrist           88588 non-null  int64  
 4   activity        88588 non-null  int64  
 5   acceleration_x  88588 non-null  float64
 6   acceleration_y  88588 non-null  float64
 7   acceleration_z  88588 non-null  float64
 8   gyro_x          88588 non-null  float64
 9   gyro_y          88588 non-null  float64
 10  gyro_z          88588 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 7.4+ MB


In [64]:
# 6.Repeat the model once using only the acceleration values as predictors and then using only the gyro values as predictors
# acceleration
x, y = df.iloc[:, [5, 6, 7]].values, df.iloc[:, 4].values

In [67]:
# SPLIT DATA
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=101)

In [68]:
# INIT MODEL
model.fit(x_train, y_train)

GaussianNB()

In [69]:
# PREDICTION USING ACCELRATION
y_pred_acc = model.predict(x_test)

In [76]:
# ACCURACY CHECK ACCELERATION
print(accuracy_score(y_test, y_pred_acc))
print(confusion_matrix(y_test, y_pred_acc))
print(classification_report(y_test, y_pred_acc, target_names = ['Walk', 'Run']))

0.4985325657523422
[[4634 4039]
 [4846 4199]]
              precision    recall  f1-score   support

        Walk       0.49      0.53      0.51      8673
         Run       0.51      0.46      0.49      9045

    accuracy                           0.50     17718
   macro avg       0.50      0.50      0.50     17718
weighted avg       0.50      0.50      0.50     17718



In [72]:
# gyro
x, y = df.iloc[:, [8,9,10]].values, df.iloc[:, 4].values

In [73]:
# SPLIT THE DATA
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [74]:
# INIT MODEL
model.fit(x_train, y_train)

GaussianNB()

In [77]:
# PREDICTION USING GYRO
y_pred_gyro = model.predict(x_test)

In [78]:
# ACCURACY 
print(accuracy_score(y_test, y_pred_gyro))
print(confusion_matrix(y_test, y_pred_gyro))
print(classification_report(y_test, y_pred_gyro, target_names = ['Walk', 'Run']))

0.6475335816683598
[[6528 2145]
 [4100 4945]]
              precision    recall  f1-score   support

        Walk       0.61      0.75      0.68      8673
         Run       0.70      0.55      0.61      9045

    accuracy                           0.65     17718
   macro avg       0.66      0.65      0.64     17718
weighted avg       0.66      0.65      0.64     17718

