In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import minkowski

In [97]:
data = pd.read_csv('normalized_new_handpd.csv')
data.columns

Index(['_ID_EXAM', 'GENDER_MALE', 'GENDER_FEMALE', 'RIGHT_HANDED',
       'LEFT_HANDED', 'AGE', 'RMS_sp1', 'MAX_BETWEEN_ET_HT_sp1',
       'MIN_BETWEEN_ET_HT_sp1', 'STD_DEVIATION_ET_HT_sp1', 'MRT_sp1',
       'MAX_HT_sp1', 'MIN_HT_sp1', 'STD_HT_sp1',
       'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT_sp1', 'RMS_sp2',
       'MAX_BETWEEN_ET_HT_sp2', 'MIN_BETWEEN_ET_HT_sp2',
       'STD_DEVIATION_ET_HT_sp2', 'MRT_sp2', 'MAX_HT_sp2', 'MIN_HT_sp2',
       'STD_HT_sp2', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT_sp2',
       'RMS_sp3', 'MAX_BETWEEN_ET_HT_sp3', 'MIN_BETWEEN_ET_HT_sp3',
       'STD_DEVIATION_ET_HT_sp3', 'MRT_sp3', 'MAX_HT_sp3', 'MIN_HT_sp3',
       'STD_HT_sp3', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT_sp3',
       'RMS_sp4', 'MAX_BETWEEN_ET_HT_sp4', 'MIN_BETWEEN_ET_HT_sp4',
       'STD_DEVIATION_ET_HT_sp4', 'MRT_sp4', 'MAX_HT_sp4', 'MIN_HT_sp4',
       'STD_HT_sp4', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT_sp4'],
      dtype='object')

In [98]:
data = data[['RMS_sp1', 'MAX_BETWEEN_ET_HT_sp1',
       'MIN_BETWEEN_ET_HT_sp1', 'STD_DEVIATION_ET_HT_sp1', 'MRT_sp1',
       'MAX_HT_sp1', 'MIN_HT_sp1', 'STD_HT_sp1',
       'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT_sp1', 'RMS_sp2',
       'MAX_BETWEEN_ET_HT_sp2', 'MIN_BETWEEN_ET_HT_sp2',
       'STD_DEVIATION_ET_HT_sp2', 'MRT_sp2', 'MAX_HT_sp2', 'MIN_HT_sp2',
       'STD_HT_sp2', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT_sp2',
       'RMS_sp3', 'MAX_BETWEEN_ET_HT_sp3', 'MIN_BETWEEN_ET_HT_sp3',
       'STD_DEVIATION_ET_HT_sp3', 'MRT_sp3', 'MAX_HT_sp3', 'MIN_HT_sp3',
       'STD_HT_sp3', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT_sp3',
       'RMS_sp4', 'MAX_BETWEEN_ET_HT_sp4', 'MIN_BETWEEN_ET_HT_sp4',
       'STD_DEVIATION_ET_HT_sp4', 'MRT_sp4', 'MAX_HT_sp4', 'MIN_HT_sp4',
       'STD_HT_sp4', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT_sp4']]

In [89]:
# Standardising the features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)
data_scaled = pd.DataFrame(data_scaled)
data_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,4.910936,3.351743,1.721493,-0.187262,2.37889,1.467755,-0.188338,3.552752,-2.149706,-0.061344,...,0.993062,-0.43596,0.809375,0.533053,-0.170826,0.910936,-5.622769,2.082373,-0.356899,-0.464726
1,0.408014,1.27423,0.303622,-0.231224,0.856592,-0.997284,1.128409,0.118947,-1.082981,2.354906,...,-1.049388,-0.491865,-0.52569,-0.060985,-0.170809,-0.399451,-0.725062,0.040779,-0.4151,1.263709
2,3.543266,2.82224,1.42562,-0.368022,1.433463,1.446263,-0.291539,2.752666,-2.085237,0.997473,...,-0.394509,-0.435328,-0.65416,0.310038,-0.080939,0.203972,-0.874329,2.11596,-0.410889,2.028205
3,-0.238,-1.228131,-0.860849,-0.275869,0.146151,2.289735,0.29388,-0.527026,-1.297871,2.798733,...,-0.186847,0.683161,1.650287,0.267437,-0.161579,0.131527,0.48016,0.518607,0.81705,1.076459
4,0.093062,-0.672381,-1.00115,-0.283071,1.525219,-4.735578,-0.188113,0.439269,-0.453266,0.030668,...,-0.31383,-0.96322,-1.15043,-1.330182,-0.064633,-0.29838,0.423434,-0.872998,-0.348015,-0.589023


In [102]:
# Extracting the patient and healthy data
data_healthy = data.head(35)
data_patient = data.tail(31)

In [103]:
# Splitting the data into training and testing
data_healthy_train = data_healthy.head(25)
data_healthy_test = data_healthy.tail(10)

In [2]:
class pos_sa:
    '''
    Positive Selection Algorithm takes the features of the healthy patients and create a healthy sphere around the self points to cover the self space. 
    The points which are not covered by the healthy sphere are considered as the outliers i.e. unhealthy datapoints 
    '''
    
    def __init__(self, radius):
        self.radius = radius
        self.detectors = []
        
    def fit(self, data):
        for subject in data:
            self.detectors.append(subject)
            
    def predict(self, data):
        predictions = []
        min_dis = []
        for subject in data:
            distances = []
            for detector in self.detectors:
                distance = minkowski(subject, detector, p=5) # 5 is for the time series data, 37 is for Sahil's data
                distances.append(distance)
                
            min_dis.append(min(distances))
            if min(distances) <= self.radius:
                predictions.append(1)
                continue
            else:
                predictions.append(0)
                continue
        
        return predictions, min_dis

In [145]:
data_healthy_train_np = data_healthy_train.to_numpy()

In [146]:
data_healthy_test['flag'] = 1
data_patient['flag'] = 0
data_test = pd.concat([data_healthy_test, data_patient], axis=0)
data_test_np = data_test.drop(['flag'], axis=1).to_numpy()
data_test_np.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_healthy_test['flag'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_patient['flag'] = 0


(41, 36)

In [139]:
model = pos_sa(5.5)
model.fit(data_healthy_train_np)
predictions, distances = model.predict(data_test_np)
actual = data_test['flag'].to_numpy()

comparison = pd.DataFrame({'Actual': actual, 'Predicted': predictions, 'Distance': distances, 'Result': np.equal(actual, predictions)})
comparison_healthy = comparison.head(10)
comparison_patient = comparison.tail(31)
comparison_healthy

ValueError: All arrays must be of the same length

In [123]:
patient_results = comparison_patient[comparison_patient['Result'] == False]
100 - len(patient_results)/31*100

77.41935483870968

#### Time Series Data

In [3]:
healthy_ts = pd.read_csv('healthy_pca.csv')
patient_ts = pd.read_csv('patient_pca.csv')

In [5]:
# Standardising the data between 0 and 1
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
healthy_ts_scaled = pd.DataFrame(scaler.fit_transform(healthy_ts))
patient_ts_scaled = pd.DataFrame(scaler.fit_transform(patient_ts))

healthy_ts_scaled

# Shuffling the healthy dataset
healthy_ts_scaled = healthy_ts_scaled.sample(frac=1).reset_index(drop=True)
healthy_ts_scaled


Unnamed: 0,0,1,2,3,4
0,0.27027,0.593182,0.549283,0.645278,0.255129
1,0.0,0.647318,0.670518,0.615606,0.303339
2,1.0,1.0,0.338926,0.698686,0.0
3,0.274681,0.492204,0.793217,0.654517,0.370108
4,0.680824,0.378997,0.276456,0.491828,0.500479
5,0.565596,0.91107,0.299419,0.238117,0.615774
6,0.931533,0.355642,0.582989,1.0,0.515251
7,0.582356,0.554111,0.0,0.575558,0.54701
8,0.247617,0.0,0.255332,0.429234,0.581781
9,0.640756,0.86476,1.0,0.76534,0.490224


In [6]:
healthy_ts_train = healthy_ts_scaled.head(12).to_numpy()
healthy_ts_test = healthy_ts_scaled.tail(5).to_numpy()
patient_ts = patient_ts_scaled.to_numpy()


In [34]:
model_ts = pos_sa(0.28)
model_ts.fit(healthy_ts_train)
predictions_ts, distances_ts = model_ts.predict(patient_ts)
predictions_healthy_ts, distances_healthy_ts = model_ts.predict(healthy_ts_test)

In [35]:
result_healthy = pd.DataFrame({'Predicted': predictions_healthy_ts, 'Distance': distances_healthy_ts, 'Actual': np.ones(5), 'Result': np.equal(np.ones(5), predictions_healthy_ts)})
result_healthy

Unnamed: 0,Predicted,Distance,Actual,Result
0,0,0.481723,1.0,False
1,1,0.205688,1.0,True
2,0,0.386445,1.0,False
3,1,0.252109,1.0,True
4,0,0.298705,1.0,False


In [38]:
result_patient = pd.DataFrame({'Predicted': predictions_ts, 'Distance': distances_ts, 'Actual': np.zeros(26), 'Result': np.equal(np.zeros(26), predictions_ts)})
# Finding the percentage of False results
patient_results = result_patient[result_patient['Result'] == True]
print(f"Accuracy = {len(patient_results)/26*100}%")
result_patient

Accuracy = 73.07692307692307%


Unnamed: 0,Predicted,Distance,Actual,Result
0,0,0.281607,0.0,True
1,0,0.317994,0.0,True
2,0,0.451467,0.0,True
3,1,0.278736,0.0,False
4,0,0.289075,0.0,True
5,0,0.286387,0.0,True
6,0,0.306258,0.0,True
7,1,0.206616,0.0,False
8,0,0.52093,0.0,True
9,0,0.320468,0.0,True


In [11]:
check, distance_check = model_ts.predict(healthy_ts_train)
check_table = pd.DataFrame({'Predicted': check, 'Distance': distance_check, 'Actual': np.ones(12)})
check_table

Unnamed: 0,Predicted,Distance,Actual
0,1,0.0,1.0
1,1,0.0,1.0
2,1,0.0,1.0
3,1,0.0,1.0
4,1,0.0,1.0
5,1,0.0,1.0
6,1,0.0,1.0
7,1,0.0,1.0
8,1,0.0,1.0
9,1,0.0,1.0
