## Random Forest Analysis

In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [92]:
# put data together
act = pd.read_table('/Users/amybrown/Thinkful/Unit_4/Lesson_2/activity_labels.txt', header=None, sep=' ', names=('ID','Activity'))
type(act)

act.columns

features = pd.read_table('/Users/amybrown/Thinkful/Unit_4/Lesson_2/features.txt', sep=' ', header=None, names=('ID','Sensor'))
features.head()
features.info

testSub = pd.read_table('/Users/amybrown/Thinkful/Unit_4/Lesson_2/test/subject_test.txt',  header=None, names=['SubjectID'])
testSub.shape
testSub.head()

testX = pd.read_table('/Users/amybrown/Thinkful/Unit_4/Lesson_2/test/X_test.txt', sep='\s+', header=None)
testX.head()
testX.shape

testY = pd.read_table('/Users/amybrown/Thinkful/Unit_4/Lesson_2/test/y_test.txt', sep=' ', header=None)
testY.head()
testY.tail()
testY.columns = ['ActivityID']

trainSub = pd.read_table('/Users/amybrown/Thinkful/Unit_4/Lesson_2/train/subject_train.txt', header=None, names=['SubjectID'])
trainSub.shape

trainX = pd.read_table('/Users/amybrown/Thinkful/Unit_4/Lesson_2/train/X_train.txt', sep='\s+', header=None)
trainX.shape

trainY = pd.read_table('/Users/amybrown/Thinkful/Unit_4/Lesson_2/train/y_train.txt', sep=' ', header=None, names=['ActivityID'])
trainY.shape

sensorNames = features['Sensor']

testX.columns = sensorNames
trainX.columns = sensorNames

testX = pd.concat([testX, testSub], axis=1)
trainX = pd.concat([trainX, trainSub], axis=1)

for i in act['ID']:
    activity = act[act['ID'] == i]['Activity']  # get activity cell given ID
    trainY = trainY.replace({i: activity.iloc[0]})  # replace this ID with activity string
    
for i in act['ID']:
    activity = act[act['ID'] == i]['Activity']  # get activity cell given ID
    testY = testY.replace({i: activity.iloc[0]})  # replace this ID with activity string

testY.columns = ['Activity']
trainY.columns = ['Activity']

In [93]:
# data cleaning: per Thinkful lesson instructions
cols = [col for col in testX.columns if 'band' not in col]
testX=testX[cols]
cols2 = [col for col in testX.columns if 'Mag' not in col]
testX=testX[cols2]

testX.columns = [c_name.replace("()", "") for c_name in testX.columns.values.tolist()]
testX.columns = [c_name.replace(")", "") for c_name in testX.columns.values.tolist()]
testX.columns = [c_name.replace("(", "") for c_name in testX.columns.values.tolist()]
testX.columns = [c_name.replace("-", "") for c_name in testX.columns.values.tolist()]
testX.columns = [c_name.replace(",", "") for c_name in testX.columns.values.tolist()]
testX.columns = [c_name.replace("BodyBody", "") for c_name in testX.columns.values.tolist()]
testX.columns = [c_name.replace("Body", "") for c_name in testX.columns.values.tolist()]
testX=testX.T.drop_duplicates().T



names = trainX.columns.values

cols = [col for col in trainX.columns if 'band' not in col]
trainX=trainX[cols]
cols2 = [col for col in trainX.columns if 'Mag' not in col]
trainX=trainX[cols2]

trainX.columns = [c_name.replace("()", "") for c_name in trainX.columns.values.tolist()]
trainX.columns = [c_name.replace(")", "") for c_name in trainX.columns.values.tolist()]
trainX.columns = [c_name.replace("(", "") for c_name in trainX.columns.values.tolist()]
trainX.columns = [c_name.replace("-", "") for c_name in trainX.columns.values.tolist()]
trainX.columns = [c_name.replace(",", "") for c_name in trainX.columns.values.tolist()]
trainX.columns = [c_name.replace("BodyBody", "") for c_name in trainX.columns.values.tolist()]
trainX.columns = [c_name.replace("Body", "") for c_name in trainX.columns.values.tolist()]

In [94]:
rf = RandomForestClassifier(n_estimators=500, bootstrap=True)
rf.fit(trainX, trainY)

  from ipykernel import kernelapp as app


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [95]:
importances = rf.feature_importances_
importances = pd.DataFrame(importances, index=trainX.columns, columns=["Importance"])

In [96]:
importances["Std"] = np.std([tree.feature_importances_
                            for tree in rf.estimators_], axis=0)

In [97]:
x = range(importances.shape[0])
y = importances.ix[:, 0]
yerr = importances.ix[:, 1]

In [98]:
plt.bar(x, y, yerr=yerr, align="center")

In [99]:
importances.sort_values(by='Importance', ascending=0).head(10)
# figure out how to plot feature importances by relative importance (see yhat blog)

Unnamed: 0,Importance,Std
tGravityAccmeanX,0.039295,0.075028
tGravityAccenergyX,0.032632,0.067056
tGravityAccmaxY,0.031771,0.048781
angleXgravityMean,0.029894,0.064283
tGravityAccminX,0.028906,0.061365
tGravityAccmeanY,0.028539,0.043495
angleYgravityMean,0.027861,0.043627
tGravityAccminY,0.025344,0.037687
tGravityAccmaxX,0.024385,0.056138
tAccmaxX,0.020027,0.044946


In [100]:
results = rf.predict(testX)
#testY['predictions'] = results

In [101]:
# mean accuracy score on testing data
rf.score(testX, testY)

0.92331184255174759

In [102]:
y_true = np.array(testY['Activity'])
y_pred = results

In [103]:
precision_recall_fscore_support(y_true, y_pred, average='macro')

(0.92512495775812231, 0.92041561671497141, 0.92175491806906296, None)

In [104]:
precision_recall_fscore_support(y_true, y_pred, average='micro')

(0.92331184255174759, 0.92331184255174759, 0.92331184255174759, None)

In [105]:
precision_recall_fscore_support(y_true, y_pred, average='weighted')

(0.92479888478393235, 0.92331184255174759, 0.92314217433206625, None)

In [107]:
confusion_matrix(y_true, y_pred)

array([[537,   0,   0,   0,   0,   0],
       [  0, 430,  61,   0,   0,   0],
       [  0,  48, 484,   0,   0,   0],
       [  0,   0,   0, 481,   4,  11],
       [  0,   0,   0,  18, 357,  45],
       [  0,   0,   0,  34,   5, 432]])