## Fitbit data pre-processing and training

In [1]:
# importing the needed packages

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [2]:
# Load the refined Fitbit data from 'data_refined.csv' into a DataFrame.
# The 'Unnamed: 0' column is used as the row index
# (presumably the index written by an earlier to_csv call -- TODO confirm).

FBdata = pd.read_csv('data_refined.csv', index_col='Unnamed: 0')

In [3]:
# Preview the first rows of the loaded data to see the columns and typical values

FBdata.head()

Unnamed: 0,Id,Steps,Distance,High_Activity_Dist,Mod_Activity_dist,Light_Activity_dist,High_Activity_Min,Mod_Activity_Min,Light_Activity_Min,Sed_Min,Calories
0,1503960366,13162.0,8.5,1.88,0.55,6.06,25.0,13.0,328.0,728.0,1985.0
1,1503960366,10735.0,6.97,1.57,0.69,4.71,21.0,19.0,217.0,776.0,1797.0
2,1503960366,10460.0,6.74,2.44,0.4,3.91,30.0,11.0,181.0,1218.0,1776.0
3,1503960366,9762.0,6.28,2.14,1.26,2.83,29.0,34.0,209.0,726.0,1745.0
4,1503960366,12669.0,8.16,2.71,0.41,5.04,36.0,10.0,221.0,773.0,1863.0


In [4]:
# Inspect the dtype pandas inferred for each column to make sure they are all numeric

FBdata.dtypes

Id                       int64
Steps                  float64
Distance               float64
High_Activity_Dist     float64
Mod_Activity_dist      float64
Light_Activity_dist    float64
High_Activity_Min      float64
Mod_Activity_Min       float64
Light_Activity_Min     float64
Sed_Min                float64
Calories               float64
dtype: object

In [5]:
# The user ID is not needed as a predictor for our model, at least not with this small dataset.

FBdata = FBdata.drop(columns=['Id'])

In [6]:
# Preview the frame again to confirm the Id column was dropped

FBdata.head()

Unnamed: 0,Steps,Distance,High_Activity_Dist,Mod_Activity_dist,Light_Activity_dist,High_Activity_Min,Mod_Activity_Min,Light_Activity_Min,Sed_Min,Calories
0,13162.0,8.5,1.88,0.55,6.06,25.0,13.0,328.0,728.0,1985.0
1,10735.0,6.97,1.57,0.69,4.71,21.0,19.0,217.0,776.0,1797.0
2,10460.0,6.74,2.44,0.4,3.91,30.0,11.0,181.0,1218.0,1776.0
3,9762.0,6.28,2.14,1.26,2.83,29.0,34.0,209.0,726.0,1745.0
4,12669.0,8.16,2.71,0.41,5.04,36.0,10.0,221.0,773.0,1863.0


In [7]:
# Separating the target (the Calories column) from the predictors (every other column)

Y = FBdata['Calories']
X = FBdata.drop(columns=['Calories'])

In [8]:
# A look at the first rows of our predictor columns (X)

X.head()

Unnamed: 0,Steps,Distance,High_Activity_Dist,Mod_Activity_dist,Light_Activity_dist,High_Activity_Min,Mod_Activity_Min,Light_Activity_Min,Sed_Min
0,13162.0,8.5,1.88,0.55,6.06,25.0,13.0,328.0,728.0
1,10735.0,6.97,1.57,0.69,4.71,21.0,19.0,217.0,776.0
2,10460.0,6.74,2.44,0.4,3.91,30.0,11.0,181.0,1218.0
3,9762.0,6.28,2.14,1.26,2.83,29.0,34.0,209.0,726.0
4,12669.0,8.16,2.71,0.41,5.04,36.0,10.0,221.0,773.0


In [9]:
# A look at the first values of our target variable (Y, the Calories column)

Y.head()

0    1985.0
1    1797.0
2    1776.0
3    1745.0
4    1863.0
Name: Calories, dtype: float64

In [10]:
# Checking the shape of X as (rows, columns); the row count should match the length of Y

X.shape


(844, 9)

In [11]:
# Checking the shape of Y; its length should equal the number of rows in X
Y.shape

(844,)

In [12]:
# Creating our train and test sets, setting our test size at 25%.
# train_test_split returns the pieces grouped per input array, i.e.
# (X_train, X_test, Y_train, Y_test), so the names must be unpacked in that order;
# unpacking as (X_train, Y_train, X_test, Y_test) silently mislabels the splits.

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=63)

# Sanity check: every feature split must line up row-for-row with its target split.
assert len(X_train) == len(Y_train) and len(X_test) == len(Y_test)

In [13]:
# Standardising our predictor variables: the scaler is fitted on the training
# rows and those same rows are rescaled in a single step.
# NOTE(review): any held-out features should later go through std.transform
# (without re-fitting) so they are scaled with the training statistics.

std = StandardScaler()
std_X = std.fit_transform(X_train)

In [14]:
# Writing the predictors (X) and the target (Y) to disk via to_csv.
# NOTE(review): these are the full, unscaled X and Y -- not the train/test splits or std_X --
# and the file names carry no .csv extension; confirm that is what the consumer expects.
X.to_csv('X_data')
Y.to_csv('Y_data')