In [137]:
import pandas as pd
import numpy as np
import ast
import re

In [138]:
# Read the embedded diabetic dataset (path is relative to the working directory).
# This frame is kept unmodified; later cells derive `df` from it and index it
# again to display the rows flagged as outliers.
df_original = pd.read_csv(filepath_or_buffer='diabetic_data_embed_8.csv')

In [139]:
# Columns that must not reach the model: raw diagnosis codes and their text
# descriptions, plus the encounter/patient identifiers.
non_feature_columns = [
    'diag_1', 'diag_2', 'diag_3',
    'desc_diag_1', 'desc_diag_2', 'desc_diag_3',
    'encounter_id', 'patient_nbr',
]
# The CSV also carries an "Unnamed: 0" column (a row index written out when the
# file was saved). It is a row counter, not a measurement, so it is removed as
# well; detecting it by prefix keeps this cell working if the file is ever
# re-exported without that column.
index_artifact_columns = [
    col for col in df_original.columns if str(col).startswith('Unnamed:')
]
df = df_original.drop(columns=non_feature_columns + index_artifact_columns)

In [140]:
def str_to_array(s):
    """Parse the printed form of a numeric array into a float32 NumPy array.

    The expected input is the bracketed text NumPy produces when an array is
    converted to a string, e.g. ``"[ 1.5 -2.   0.25]"``, possibly wrapped over
    several lines. Comma-separated text (``"[1.5, -2.0]"``) and nested brackets
    are accepted too, and surrounding whitespace is ignored.

    Parameters
    ----------
    s : str, list, tuple or numpy.ndarray
        Text to parse. A value that is already array-like is converted to
        float32 directly, so applying this function twice to the same column
        (for example when a notebook cell is re-run) does not fail.

    Returns
    -------
    numpy.ndarray
        Array with dtype float32.

    Raises
    ------
    TypeError
        If ``s`` is neither a string nor a list/tuple/ndarray (for example a
        missing value).
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the text is not a valid
        bracketed list of numbers.
    """
    if isinstance(s, (list, tuple, np.ndarray)):
        return np.asarray(s, dtype=np.float32)
    if not isinstance(s, str):
        raise TypeError(
            f"str_to_array expects a string or an array-like, got {type(s).__name__}"
        )

    # Strip first: a trailing blank would otherwise become a trailing comma and
    # make literal_eval return a one-element tuple wrapping the list.
    text = s.strip()
    # Collapse every run of whitespace and/or commas into a single comma so
    # both "1 2" and "1, 2" end up as "1,2".
    text = re.sub(r'[\s,]+', ',', text)
    # Padding right inside the brackets ("[ 1 2 ]") leaves stray commas there.
    text = text.replace('[,', '[').replace(',]', ']')
    return np.array(ast.literal_eval(text), dtype=np.float32)

# Replace the text stored in each reduced-embedding column with the array
# that str_to_array parses from it.
for embedding_column in ("red_emb_daig_1", "red_emb_daig_2", "red_emb_daig_3"):
    df[embedding_column] = df[embedding_column].apply(str_to_array)

In [143]:
# Spot-check one parsed embedding: row label 2 of the first embedding column.
df.loc[2, 'red_emb_daig_1']

array([ 2.6285348 ,  8.045067  , -2.203428  ,  1.3610619 ,  0.09296808,
        2.1891315 ,  1.5816176 ,  4.4955897 ], dtype=float32)

In [144]:
from sklearn.ensemble import IsolationForest

def _expand_embedding(frame, column, prefix):
    """Spread one column of per-row arrays into one column per component.

    The new columns are named ``prefix`` followed by the component position,
    and the result keeps ``frame``'s index so it can be concatenated back.
    """
    components = frame[column].tolist()
    return pd.DataFrame(components, index=frame.index).add_prefix(prefix)


emb1_df = _expand_embedding(df, 'red_emb_daig_1', "emb1_")
emb2_df = _expand_embedding(df, 'red_emb_daig_2', "emb2_")
emb3_df = _expand_embedding(df, 'red_emb_daig_3', "emb3_")

# Swap the three array-valued columns for their expanded component columns.
scalar_part = df.drop(columns=['red_emb_daig_1','red_emb_daig_2','red_emb_daig_3'])
df_flat = pd.concat([scalar_part, emb1_df, emb2_df, emb3_df], axis=1)

# Fixed random_state so repeated runs build the same forest.
iso_forest = IsolationForest(random_state=42, contamination='auto')
# Despite the name, this holds the per-row labels returned by fit_predict;
# the cells below treat -1 as "outlier".
anomaly_score = iso_forest.fit_predict(df_flat)

print(anomaly_score)


[1 1 1 ... 1 1 1]


In [145]:
# Count the rows whose label is -1, i.e. the ones reported as outliers.
is_outlier = anomaly_score == -1
num_outliers = int(is_outlier.sum())
print('Num outliers: ', num_outliers)

Num outliers:  269


In [147]:
# Display the untouched source rows at the positions labelled -1.
df_original.loc[anomaly_score == -1]

Unnamed: 0,encounter_id,patient_nbr,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,metformin-rosiglitazone_No,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,desc_diag_1,desc_diag_2,desc_diag_3,red_emb_daig_1,red_emb_daig_2,red_emb_daig_3
370,3557748,1582326,2,3,48,1,10,0,0,0,...,1,0,1,0,{'specific': 'Abscess of anal and rectal regio...,"{'specific': 'Diabetes with ketoacidosis, type...","{'specific': 'Overweight, obesity and other hy...",[-3.6183654e-03 5.1756101e+00 -5.4701777e+00 ...,[-0.54608184 0.01942276 -2.4937382 2.921827...,[ 1.4890056 -4.163954 -1.5180945 6.406319 ...
417,3828228,2798721,5,3,52,0,7,0,0,0,...,1,0,1,0,"{'specific': 'Episodic mood disorders', 'gener...",{'specific': 'Diabetes mellitus without mentio...,"{'specific': 'Essential hypertension', 'genera...",[-2.2130122 0.1036889 3.8040411 4.769157...,[-1.4277354 0.91866505 -2.2971203 3.576293...,[-6.243382 5.167587 -1.1617622 -3.284549...
1117,8107908,107892387,9,9,67,1,22,0,0,0,...,1,0,1,0,{'specific': 'Diabetes with other specified ma...,{'specific': 'Diabetes with peripheral circula...,"{'specific': 'Atherosclerosis', 'general': 'Di...",[ 5.0958676 -1.6136944 -1.4674602 3.629662...,[ 0.19818313 1.1759032 -2.1685846 4.538285...,[-0.99798894 4.012953 -1.4307932 -5.138595...
1581,10736988,1181394,6,12,77,3,16,0,0,0,...,1,0,1,0,"{'specific': 'Intracerebral hemorrhage', 'gene...",{'specific': 'Pneumonitis due to solids and li...,{'specific': 'Other disorders of urethra and u...,[-3.2421668 -8.212658 -6.4231668 -4.226085...,[ 1.5001694 -9.006839 4.149265 6.2186136 ...,[ 2.6443021 3.9256835 0.09089799 -5.592125...
2576,16711068,474318,8,12,72,6,38,0,0,0,...,1,0,1,0,"{'specific': 'Acute myocardial infarction', 'g...","{'specific': 'Cardiac dysrhythmias', 'general'...","{'specific': 'Diabetes mellitus', 'general': '...",[ 6.7098236 -7.2191997 -3.8937728 -0.926594...,[ -0.20285891 -0.58339536 -3.6657057 -13.16...,[-1.4334682 -0.46425417 -4.716138 0.460702...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97714,392021612,179033729,7,7,58,2,38,1,0,0,...,1,0,1,0,{'specific': 'Arterial embolism and thrombosis...,"{'specific': 'Atherosclerosis', 'general': 'Di...","{'specific': 'Atherosclerosis', 'general': 'Di...",[-0.9058573 -6.875967 -5.3150334 -0.646919...,[-0.99798894 4.012953 -1.4307932 -5.138595...,[-0.99798894 4.012953 -1.4307932 -5.138595...
98667,403770110,138996032,7,9,58,0,25,0,0,0,...,1,0,1,0,"{'specific': 'Chronic bronchitis', 'general': ...","{'specific': 'Pneumonia, organism unspecified'...",{'specific': 'Other forms of chronic ischemic ...,[10.242852 2.2308917 -0.94504124 4.535682...,[ 3.724545 0.94338727 -10.897309 1.31...,[-4.5678825 7.033985 -1.4268639 -0.7593923 ...
100786,430820624,123302993,6,12,87,6,39,0,0,0,...,1,0,1,0,"{'specific': 'Empyema', 'general': 'Other Dise...","{'specific': 'Pneumonia, organism unspecified'...",{'specific': 'Other complications of procedure...,[ 0.67785007 2.0353396 -7.1886764 -6.318625...,[ 3.724545 0.94338727 -10.897309 1.31...,[-3.1170726 -3.1941028 -0.6447397 0.223207...
101228,436694120,71373429,8,14,81,4,23,0,0,1,...,1,0,1,0,{'specific': 'Intestinal obstruction without m...,"{'specific': 'Heart failure', 'general': 'Othe...",{'specific': 'Other disorders of urethra and u...,[ 2.0383792 3.0220017 -7.227021 -6.6406827 ...,[-3.8041878 7.104007 -2.5381896 4.574203...,[ 2.6443021 3.9256835 0.09089799 -5.592125...
