# Decision Tree "Rough Draft"
##### Getting a decision tree output so we have some results to evaluate

### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
df = pd.read_csv('Event Recordings/vol1.csv')
# Display a row with a NaN value
print(df.iloc[2249])

Timestamp      68.228505
X                    373
Y                   1481
BTN_TOUCH            NaN
TOUCH_MAJOR         -420
TOUCH_MINOR         -420
TRACKING_ID         2594
PRESSURE              29
FINGER                 1
Name: 2249, dtype: object


In [3]:
df = df.dropna()
# Print same row as before, see that it has been removed
print(df.iloc[2249])

Timestamp      68.278493
X                    371
Y                   1481
BTN_TOUCH           HELD
TOUCH_MAJOR         -420
TOUCH_MINOR         -420
TRACKING_ID         2594
PRESSURE              29
FINGER                 1
Name: 2256, dtype: object


In [4]:
df.head()

Unnamed: 0,Timestamp,X,Y,BTN_TOUCH,TOUCH_MAJOR,TOUCH_MINOR,TRACKING_ID,PRESSURE,FINGER
0,0.0,350,1404,DOWN,11,-420,2526,38,0
1,0.032676,348,1406,HELD,11,-420,2526,38,0
2,0.042109,347,1408,HELD,11,-420,2526,38,0
3,0.049881,344,1411,HELD,11,-420,2526,38,0
4,0.058581,340,1416,HELD,11,-420,2526,38,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59435 entries, 0 to 59564
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Timestamp    59435 non-null  float64
 1   X            59435 non-null  int64  
 2   Y            59435 non-null  int64  
 3   BTN_TOUCH    59435 non-null  object 
 4   TOUCH_MAJOR  59435 non-null  int64  
 5   TOUCH_MINOR  59435 non-null  int64  
 6   TRACKING_ID  59435 non-null  int64  
 7   PRESSURE     59435 non-null  int64  
 8   FINGER       59435 non-null  int64  
dtypes: float64(1), int64(7), object(1)
memory usage: 4.5+ MB


##### We need a way to indicate to the model which data is the the correct user's (i.e., create a dummy variable for correct user). Then we must pass the rest of the user's data into the model to get our results.

##### For the sake of time, we are going to just compare volunteer 1's data to volunteer 2's data

In [6]:
# We need to create a dummy variable for correct user (1 for true, 2 for false)
# This should be applied to every row of current vol1 file, then we append vol2 and set these all to 0

# Add new column CORRECT_USER
df['CORRECT_USER'] = 1
df.head()

Unnamed: 0,Timestamp,X,Y,BTN_TOUCH,TOUCH_MAJOR,TOUCH_MINOR,TRACKING_ID,PRESSURE,FINGER,CORRECT_USER
0,0.0,350,1404,DOWN,11,-420,2526,38,0,1
1,0.032676,348,1406,HELD,11,-420,2526,38,0,1
2,0.042109,347,1408,HELD,11,-420,2526,38,0,1
3,0.049881,344,1411,HELD,11,-420,2526,38,0,1
4,0.058581,340,1416,HELD,11,-420,2526,38,0,1


In [7]:
# Check the tail as well
df.tail()

Unnamed: 0,Timestamp,X,Y,BTN_TOUCH,TOUCH_MAJOR,TOUCH_MINOR,TRACKING_ID,PRESSURE,FINGER,CORRECT_USER
59560,898.745893,710,1825,HELD,-420,-420,3509,23,1,1
59561,898.779799,677,1824,HELD,9,3,4294967295,31,0,1
59562,898.846378,677,1823,HELD,9,3,4294967295,31,0,1
59563,898.871367,677,303,HELD,9,3,4294967295,31,0,1
59564,898.904673,677,304,HELD,9,3,4294967295,31,0,1


# Create a dummy variable to change BTN_TOUCH to a numerical value

In [8]:
df = pd.get_dummies(df,columns=['BTN_TOUCH'], dtype=int, drop_first=True)

df.tail()

Unnamed: 0,Timestamp,X,Y,TOUCH_MAJOR,TOUCH_MINOR,TRACKING_ID,PRESSURE,FINGER,CORRECT_USER,BTN_TOUCH_HELD,BTN_TOUCH_UP
59560,898.745893,710,1825,-420,-420,3509,23,1,1,1,0
59561,898.779799,677,1824,9,3,4294967295,31,0,1,1,0
59562,898.846378,677,1823,9,3,4294967295,31,0,1,1,0
59563,898.871367,677,303,9,3,4294967295,31,0,1,1,0
59564,898.904673,677,304,9,3,4294967295,31,0,1,1,0


In [9]:
# Read in volunteer 2's data
df2 = pd.read_csv('Event Recordings/vol2.csv')
# Drop all rows with NaN values
df2 = df2.dropna()

# Set CORRECT_USER column
df2['CORRECT_USER'] = 0
df2 = pd.get_dummies(df2,columns=['BTN_TOUCH'],dtype=int, drop_first=True)
df2.tail()

Unnamed: 0,Timestamp,X,Y,TOUCH_MAJOR,TOUCH_MINOR,TRACKING_ID,PRESSURE,FINGER,CORRECT_USER,BTN_TOUCH_HELD,BTN_TOUCH_UP
39900,900.494168,422,336,11,-420,4423,24,1,0,1,0
39901,900.502887,300,1588,10,3,4422,27,0,0,1,0
39902,900.502887,419,336,11,-420,4423,24,1,0,1,0
39903,900.511139,301,1583,10,3,4422,27,0,0,1,0
39904,900.511139,417,336,11,-420,4423,24,1,0,1,0


In [10]:
# Append user 1 and user 2's dataframes
df3 = pd.concat([df, df2], ignore_index=True)

# Display the tail to ensure CORRECT_USER isn't preset
df3.tail()

Unnamed: 0,Timestamp,X,Y,TOUCH_MAJOR,TOUCH_MINOR,TRACKING_ID,PRESSURE,FINGER,CORRECT_USER,BTN_TOUCH_HELD,BTN_TOUCH_UP
99253,900.494168,422,336,11,-420,4423,24,1,0,1,0
99254,900.502887,300,1588,10,3,4422,27,0,0,1,0
99255,900.502887,419,336,11,-420,4423,24,1,0,1,0
99256,900.511139,301,1583,10,3,4422,27,0,0,1,0
99257,900.511139,417,336,11,-420,4423,24,1,0,1,0


### Now, train test split
##### We need to train the data of only the correct user (70% of their data) and test the data with the remaining data (30% both correct user and incorrect users).

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Set x data to everything but the target (of correct users)
X = df.drop('CORRECT_USER', axis=1)

# Use the variable below to test against all users
X1 = df3.drop('CORRECT_USER', axis=1)

In [13]:
# Do the same for y
y = df['CORRECT_USER']

# Use the variable below to test against all users
y1 = df3['CORRECT_USER']

# **I THINK THE PROBLEM IS HERE**

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.3)

'''
# Data to be trained (70% of user 1's data)
X_train = train_test_split(X, y, train_size=0.7, random_state=101)
y_train = train_test_split(X, y, train_size=0.7, random_state=101)

# Data to be tested (30% of both correct and incorrect data)
X_test = train_test_split(X1, y1, test_size=0.3, random_state=101)
y_test = train_test_split(X1, y1, test_size=0.3, random_state=101)
'''

"\n# Data to be trained (70% of user 1's data)\nX_train = train_test_split(X, y, train_size=0.7, random_state=101)\ny_train = train_test_split(X, y, train_size=0.7, random_state=101)\n\n# Data to be tested (30% of both correct and incorrect data)\nX_test = train_test_split(X1, y1, test_size=0.3, random_state=101)\ny_test = train_test_split(X1, y1, test_size=0.3, random_state=101)\n"

# Implementing the Decision Tree
##### Now that we've created our train test split, we can implement the Decision Tree

In [32]:
from sklearn.tree import DecisionTreeClassifier

In [33]:
dtree = DecisionTreeClassifier()

In [34]:
dtree.fit(X_train, y_train)

In [35]:
predictions=dtree.predict(X_test)

In [36]:
from sklearn.metrics import classification_report, confusion_matrix

In [37]:
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

[[11901    52]
 [   56 17769]]


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11953
           1       1.00      1.00      1.00     17825

    accuracy                           1.00     29778
   macro avg       1.00      1.00      1.00     29778
weighted avg       1.00      1.00      1.00     29778

