In [22]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier


data = pd.read_csv('anom5j.csv')
series = data['ADATE'].str.split('/')
data[['AMONTH','ADAY','AYEAR']] = pd.DataFrame(series.values.tolist(), index= data.index)
data['AMONTH'] = pd.to_numeric(data['AMONTH'])
data['ADAY'] = pd.to_numeric(data['ADAY'])
data['AYEAR'] = pd.to_numeric(data['AYEAR'])

data_tree = data.drop(['EDATE', 'ADATE', 'ACOMMENT', 'BIRD'], axis=1)

labels = data_tree['ORBIT'].unique().tolist()
mapping = dict( zip(labels,range(len(labels))) )
data_tree.replace({'ORBIT': mapping},inplace=True)

labels = data_tree['NS'].unique().tolist()
mapping = dict( zip(labels,range(len(labels))) )
data_tree.replace({'NS': mapping},inplace=True)

labels = data_tree['EW'].unique().tolist()
mapping = dict( zip(labels,range(len(labels))) )
data_tree.replace({'EW': mapping},inplace=True)

labels = data_tree['ATYPE'].unique().tolist()
mapping = dict( zip(labels,range(len(labels))) )
data_tree.replace({'ATYPE': mapping},inplace=True)

labels = data_tree['ADIAG'].unique().tolist()
mapping = dict( zip(labels,range(len(labels))) )
data_tree.replace({'ADIAG': mapping},inplace=True)

labels = data_tree['SPIN'].unique().tolist()
mapping = dict( zip(labels,range(len(labels))) )
data_tree.replace({'SPIN': mapping},inplace=True)

num_bounds = 5
data['STIMEU'] = pd.cut(x=data['STIMEU'], bins = np.linspace(data['STIMEU'].min(), \
                                        data['STIMEU'].max(), num_bounds), include_lowest=True)
data['STIMEL'] = pd.cut(x=data['STIMEL'], bins = np.linspace(data['STIMEL'].min(), \
                                        data['STIMEL'].max(), num_bounds), include_lowest=True)
data['LAT'] = pd.cut(x=data['LAT'], bins = np.linspace(data['LAT'].min(), \
                                        data['LAT'].max(), num_bounds), include_lowest=True)
data['LON'] = pd.cut(x=data['LON'], bins = np.linspace(data['LON'].min(), \
                                        data['LON'].max(), num_bounds), include_lowest=True)
data['ALT'] = pd.cut(x=data['ALT'], bins = np.linspace(data['ALT'].min(), \
                                        data['ALT'].max(), num_bounds), include_lowest=True)
data['SVE'] = pd.cut(x=data['SVE'], bins = np.linspace(data['SVE'].min(), \
                                        data['SVE'].max(), num_bounds), include_lowest=True)

data_tree = data_tree.fillna(0)
print(data_tree.head())

   VER  STIMEU  STIMEQ  DUR  STIMEL  ORBIT  NS  LAT  LATQ  EW  LON  LONQ  \
0  5.0    2000     0.0    0  1632.0      0   0    0   0.0   0  308   0.0   
1  5.0     838     0.0    0  2138.0      0   0    0   0.0   0  195   0.0   
2  5.0    1408     0.0    0   120.0      1   1    5   0.0   0  168   0.0   
3  5.0    1915     0.0    0  1219.0      0   1    0   0.0   1  104   0.0   
4  5.0     338     0.0    0  2042.0      0   1    0   0.0   1  104   0.0   

     ALT  ATYPE  ADIAG  SVE  SPIN  AMONTH  ADAY  AYEAR  
0  35784      0      0  0.0     0       9    11   1990  
1  35784      0      0  0.0     0       4    15   1992  
2  54810      0      0  0.0     0      10    27   1987  
3  35784      1      1  0.0     0      10     4   1978  
4  35784      1      1  0.0     0       6    15   1974  


In [18]:
#get feature list and train/test data split

#features = ['STIMEU', 'DUR', 'STIMEL', 'ORBIT', 'NS', 'LAT', 'EW', 'LON', 'ALT', 'SVE', 'AMONTH', 'ADAY', 'AYEAR']
#above is 82% used for midterm report
features = ['ORBIT', 'NS', 'LAT', 'EW', 'LON', 'ALT', 'SVE', 'AMONTH'] #84%
#features = ['ORBIT', 'LAT', 'LON', 'SVE', 'ALT'] #82%
#label = 'ATYPE' #used for midterm report
label = 'ATYPE'
xtrain, xtest, ytrain, ytest = train_test_split(data_tree[features], data_tree[label], test_size=0.20, random_state=42)
#x - feature lists
#y - anomaly type lists

In [19]:
#decision tree
clf=tree.DecisionTreeClassifier()
clf=clf.fit(xtrain,ytrain)

test_predicted = clf.predict(xtest)
test_actual = list(ytest.values)
train_predicted = clf.predict(xtrain)
train_actual = list(ytrain.values)
score_test = 0
score_train = 0
for i in range(len(test_predicted)):
    if test_predicted[i] == test_actual[i]:
        score_test = score_test + 1
    if train_predicted[i] == train_actual[i]:
        score_train = score_train + 1
score_test = score_test / len(test_predicted)
score_train = score_train / len(train_predicted)
print('Test Accuracy: %.6f' %score_test)
print('Train Accuracy: %.6f' %score_train)

Test Accuracy: 0.823237
Train Accuracy: 0.224789


In [23]:
#Bagged decision tree

clf_bt=BaggingClassifier()
clf_bt=clf_bt.fit(xtrain,ytrain)
test_predicted = clf_bt.predict(xtest)
test_actual = list(ytest.values)
train_predicted = clf_bt.predict(xtrain)
train_actual = list(ytrain.values)
score_test = 0
score_train = 0
for i in range(len(test_predicted)):
    if test_predicted[i] == test_actual[i]:
        score_test = score_test + 1
    if train_predicted[i] == train_actual[i]:
        score_train = score_train + 1
score_test = score_test / len(test_predicted)
score_train = score_train / len(train_predicted)
print('Test Accuracy: %.6f' %score_test)
print('Train Accuracy: %.6f' %score_train)

Test Accuracy: 0.825223
Train Accuracy: 0.224540
