<font size="5">Naive_Bayes

In [1]:
from csv import reader
from math import sqrt
from math import exp
from math import pi
import pandas as pd

In [2]:
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

In [3]:
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

In [4]:
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

In [5]:
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if (class_value not in separated):
            separated[class_value] = [vector[:-1]]
        else:
            separated[class_value] += [vector[:-1]]
    return separated

In [6]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))

In [7]:
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
    return sqrt(variance)

In [8]:
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[-1])
    return summaries

In [9]:
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries

In [10]:
def calculate_probability(x, mean, stdev):
    exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

In [11]:
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
    return probabilities

In [12]:
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

<font size="5">Medication as Prediction

In [29]:
# Make a prediction with Naive Bayes on Iris Dataset

df = pd.read_csv('Q1NumericalValues2.csv')
df

Unnamed: 0,Patient.ID,FRDPersonnelID,Shift,UnitId,FireStation,Fire.Station.Name,Station.Address,Station.City,Stattion.State,Station.Zip.Code,...,Dim_Medication_PK,Medication.Prescribed,Medication_Given_RXCUI_Code,Medication_Given_Description,FRDPersonnelID...Medication,Medication_Administered_Date_Time,Maleinteraction,Femaleinteraction,Mixedinteraction,Interaction
0,506561,C60CADD0-7A1F-42D7-B90E-687F0364A81E,A - Shift,M438,38,West Centreville,6001 ODay Drive,Centreville,Virginia,20120-1612,...,,No Medication Prescribed,,10,,,10,20,10,FemaleToFemale
1,530126,D2106DC5-10B0-4901-A713-2E08DCEF544B,A - Shift,M438,38,West Centreville,6001 ODay Drive,Centreville,Virginia,20120-1612,...,,No Medication Prescribed,,10,,,10,20,10,FemaleToFemale
2,530212,D2106DC5-10B0-4901-A713-2E08DCEF544B,A - Shift,M438,38,West Centreville,6001 ODay Drive,Centreville,Virginia,20120-1612,...,,No Medication Prescribed,,10,,,10,20,10,FemaleToFemale
3,510809,B0D3C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M438,38,West Centreville,6001 ODay Drive,Centreville,Virginia,20120-1612,...,,No Medication Prescribed,,10,,,10,20,10,FemaleToFemale
4,1342992,BDD8C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M438,38,West Centreville,6001 ODay Drive,Centreville,Virginia,20120-1612,...,433776.0,Medication Presribed,125464.0,25,0A89F8EB-32DE-4505-9DCA-CF9C68E09DCE,9/9/2020 10:52,10,20,10,FemaleToFemale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795189,898016,62D3C99E-9E01-E211-B5F5-78E7D18CFD3C,C - Shift,M423,23,West Annandale,8914 Little River Turnpike,Fairfax,Virginia,22031-3123,...,,No Medication Prescribed,,10,,,20,10,10,MaleToMale
795190,899635,62D3C99E-9E01-E211-B5F5-78E7D18CFD3C,C - Shift,M423,23,West Annandale,8914 Little River Turnpike,Fairfax,Virginia,22031-3123,...,402822.0,Medication Presribed,4337.0,205,62D3C99E-9E01-E211-B5F5-78E7D18CFD3C,4/21/2019 6:37,20,10,10,MaleToMale
795191,899635,62D3C99E-9E01-E211-B5F5-78E7D18CFD3C,C - Shift,M423,23,West Annandale,8914 Little River Turnpike,Fairfax,Virginia,22031-3123,...,402822.0,Medication Presribed,4337.0,205,62D3C99E-9E01-E211-B5F5-78E7D18CFD3C,4/21/2019 6:37,20,10,10,MaleToMale
795192,899644,62D3C99E-9E01-E211-B5F5-78E7D18CFD3C,C - Shift,M423,23,West Annandale,8914 Little River Turnpike,Fairfax,Virginia,22031-3123,...,,No Medication Prescribed,,10,,,20,10,10,MaleToMale


In [30]:
print('The shape of our features is:', df.shape)

The shape of our features is: (795194, 40)


In [31]:
df = df[['Procedure_Performed_Description', 'Maleinteraction', 'Femaleinteraction', 'Mixedinteraction', 'Medication_Given_Description']]
df

Unnamed: 0,Procedure_Performed_Description,Maleinteraction,Femaleinteraction,Mixedinteraction,Medication_Given_Description
0,10,10,20,10,10
1,10,10,20,10,10
2,10,10,20,10,10
3,40,10,20,10,10
4,70,10,20,10,25
...,...,...,...,...,...
795189,10,20,10,10,10
795190,40,20,10,10,205
795191,130,20,10,10,205
795192,10,20,10,10,10


In [32]:
dataset = df.values.tolist()

In [33]:
model = summarize_by_class(dataset)
model

{10: [(101.8655266510781, 198.57042181897117, 409702),
  (14.125510737072311, 4.922938927356818, 409702),
  (10.85034976641559, 2.789340769659506, 409702)],
 25: [(218.96317576390703, 274.5007638845799, 45948),
  (15.045268564464177, 4.999849479498948, 45948),
  (10.66357621659267, 2.4890889114874937, 45948)],
 40: [(131.06528593951555, 171.69501538562753, 27908),
  (13.334527733983087, 4.714551788811254, 27908),
  (11.08427690984664, 3.1092537827952134, 27908)],
 55: [(237.88776225569825, 296.7230709193509, 58483),
  (14.583041225655318, 4.982626806180061, 58483),
  (10.773558128003009, 2.6715746561535694, 58483)],
 70: [(216.63511594888783, 249.15932088353554, 4226),
  (13.902035021296735, 4.878535101180385, 4226),
  (10.88973024136299, 2.847384943412317, 4226)],
 85: [(199.25233644859813, 262.5697043443521, 16371),
  (13.894080996884735, 4.876309632203291, 16371),
  (10.960234561114165, 2.946324058052037, 16371)],
 100: [(204.10808063696425, 266.0940205874224, 11806),
  (13.72522446

<font size="5">Remove the classes with zero in interactions

In [35]:
del model[475]
del model[490]

<font size="5">Select 20% as test dataset 

In [36]:
m=0.2*len(df)
test = df.sample(n=int(m))
test

Unnamed: 0,Procedure_Performed_Description,Maleinteraction,Femaleinteraction,Mixedinteraction,Medication_Given_Description
570714,670,20,10,10,10
54075,130,10,10,20,295
328660,10,10,20,10,10
164709,40,20,10,10,10
547984,10,20,10,10,10
...,...,...,...,...,...
124932,130,10,10,20,205
81847,40,10,10,20,190
172248,10,20,10,10,10
271848,40,10,10,20,10


In [37]:
test = test.values.tolist()

<font size="5"> Medication as Prediction Accuracy, got 52.9% accuracy

In [38]:
count = 0
s = len(test)
for row in test:
    label = predict(model, row[:-1])
    if label == row[-1]:
        count+=1
print('Accuracy: %s' % (count/s))

Accuracy: 0.5289364805895447


<font size="5"> Procedure as Prediction

In [39]:
df = df[['Medication_Given_Description', 'Maleinteraction', 'Femaleinteraction', 'Mixedinteraction', 'Procedure_Performed_Description']]
df

Unnamed: 0,Medication_Given_Description,Maleinteraction,Femaleinteraction,Mixedinteraction,Procedure_Performed_Description
0,10,10,20,10,10
1,10,10,20,10,10
2,10,10,20,10,10
3,10,10,20,10,40
4,25,10,20,10,70
...,...,...,...,...,...
795189,10,20,10,10,10
795190,205,20,10,10,40
795191,205,20,10,10,130
795192,10,20,10,10,10


In [40]:
dataset = df.values.tolist()

In [41]:
model = summarize_by_class(dataset)
model

{10: [(17.99398735074733, 38.31936313748388, 211554),
  (13.950480728324683, 4.888621714282824, 211554),
  (10.860867674447187, 2.804927945814861, 211554)],
 40: [(63.40537063396474, 81.5319217794772, 182660),
  (14.49523705244717, 4.974469804805277, 182660),
  (10.804554910763166, 2.7199781260472053, 182660)],
 70: [(54.48772678762006, 76.32516004143713, 9370),
  (13.811099252934898, 4.856854179380652, 9370),
  (11.15154749199573, 3.1922563901465537, 9370)],
 100: [(64.70793787748059, 86.8184283453927, 11590),
  (14.482312338222606, 4.973342294420438, 11590),
  (10.849870578084555, 2.788744663678172, 11590)],
 130: [(81.27538428110601, 90.81958825086913, 179491),
  (14.363059986294578, 4.959278621153358, 179491),
  (10.788228936269785, 2.6946289095446123, 179491)],
 160: [(125.91312340153452, 81.99823498061716, 12512),
  (15.49792199488491, 4.975344421433795, 12512),
  (10.549072890025576, 2.2780831057096353, 12512)],
 190: [(124.48575949367088, 75.50034829735631, 5688),
  (15.6100562

In [42]:
del model[1360]
del model[1690]
del model[1810]
del model[1930]
del model[1990]
del model[2020]

In [43]:
m=0.2*len(df)
test = df.sample(n=int(m))
test

Unnamed: 0,Medication_Given_Description,Maleinteraction,Femaleinteraction,Mixedinteraction,Procedure_Performed_Description
682652,10,20,10,10,10
458523,10,10,10,20,10
680365,10,20,10,10,130
620307,10,10,20,10,10
222997,10,10,10,20,10
...,...,...,...,...,...
42569,10,20,10,10,130
351311,160,20,10,10,370
104392,10,10,10,20,10
133922,10,10,10,20,130


In [44]:
test = test.values.tolist()

<font size="5"> Procedurer as Prediction Accuracy is about 33%. 

In [45]:
count = 0
s = len(test)
for row in test:
    label = predict(model, row[:-1])
    if label == row[-1]:
        count+=1
print('Accuracy: %s' % (count/s))

Accuracy: 0.33032357046743543


<font size="5"> Gender interaction as Prediction

In [46]:
df = pd.read_csv('Q1NumericalValues2.csv')
df

Unnamed: 0,Patient.ID,FRDPersonnelID,Shift,UnitId,FireStation,Fire.Station.Name,Station.Address,Station.City,Stattion.State,Station.Zip.Code,...,Dim_Medication_PK,Medication.Prescribed,Medication_Given_RXCUI_Code,Medication_Given_Description,FRDPersonnelID...Medication,Medication_Administered_Date_Time,Maleinteraction,Femaleinteraction,Mixedinteraction,Interaction
0,506561,C60CADD0-7A1F-42D7-B90E-687F0364A81E,A - Shift,M438,38,West Centreville,6001 ODay Drive,Centreville,Virginia,20120-1612,...,,No Medication Prescribed,,10,,,10,20,10,FemaleToFemale
1,530126,D2106DC5-10B0-4901-A713-2E08DCEF544B,A - Shift,M438,38,West Centreville,6001 ODay Drive,Centreville,Virginia,20120-1612,...,,No Medication Prescribed,,10,,,10,20,10,FemaleToFemale
2,530212,D2106DC5-10B0-4901-A713-2E08DCEF544B,A - Shift,M438,38,West Centreville,6001 ODay Drive,Centreville,Virginia,20120-1612,...,,No Medication Prescribed,,10,,,10,20,10,FemaleToFemale
3,510809,B0D3C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M438,38,West Centreville,6001 ODay Drive,Centreville,Virginia,20120-1612,...,,No Medication Prescribed,,10,,,10,20,10,FemaleToFemale
4,1342992,BDD8C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M438,38,West Centreville,6001 ODay Drive,Centreville,Virginia,20120-1612,...,433776.0,Medication Presribed,125464.0,25,0A89F8EB-32DE-4505-9DCA-CF9C68E09DCE,9/9/2020 10:52,10,20,10,FemaleToFemale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795189,898016,62D3C99E-9E01-E211-B5F5-78E7D18CFD3C,C - Shift,M423,23,West Annandale,8914 Little River Turnpike,Fairfax,Virginia,22031-3123,...,,No Medication Prescribed,,10,,,20,10,10,MaleToMale
795190,899635,62D3C99E-9E01-E211-B5F5-78E7D18CFD3C,C - Shift,M423,23,West Annandale,8914 Little River Turnpike,Fairfax,Virginia,22031-3123,...,402822.0,Medication Presribed,4337.0,205,62D3C99E-9E01-E211-B5F5-78E7D18CFD3C,4/21/2019 6:37,20,10,10,MaleToMale
795191,899635,62D3C99E-9E01-E211-B5F5-78E7D18CFD3C,C - Shift,M423,23,West Annandale,8914 Little River Turnpike,Fairfax,Virginia,22031-3123,...,402822.0,Medication Presribed,4337.0,205,62D3C99E-9E01-E211-B5F5-78E7D18CFD3C,4/21/2019 6:37,20,10,10,MaleToMale
795192,899644,62D3C99E-9E01-E211-B5F5-78E7D18CFD3C,C - Shift,M423,23,West Annandale,8914 Little River Turnpike,Fairfax,Virginia,22031-3123,...,,No Medication Prescribed,,10,,,20,10,10,MaleToMale


In [47]:
df = df[['Medication_Given_Description', 'Procedure_Performed_Description', 'Interaction']]
df

Unnamed: 0,Medication_Given_Description,Procedure_Performed_Description,Interaction
0,10,10,FemaleToFemale
1,10,10,FemaleToFemale
2,10,10,FemaleToFemale
3,10,40,FemaleToFemale
4,25,70,FemaleToFemale
...,...,...,...
795189,10,10,MaleToMale
795190,205,40,MaleToMale
795191,205,130,MaleToMale
795192,10,10,MaleToMale


In [48]:
dataset = df.values.tolist()

In [49]:
model = summarize_by_class(dataset)
model

{'FemaleToFemale': [(57.45525103080281, 78.32643356777355, 61845)],
 'MaleToMale': [(71.47505517522526, 85.48403337077711, 355685)],
 'Mixed': [(60.6614212633452, 79.71946605614195, 377664)]}

In [50]:
m=0.2*len(df)
test = df.sample(n=int(m))
test

Unnamed: 0,Medication_Given_Description,Procedure_Performed_Description,Interaction
682999,10,10,MaleToMale
72300,10,10,Mixed
455109,10,130,Mixed
240964,10,10,FemaleToFemale
212548,10,130,MaleToMale
...,...,...,...
225133,10,40,Mixed
710845,10,40,Mixed
373446,115,310,Mixed
32650,55,790,Mixed


In [51]:
test = test.values.tolist()

In [52]:
count = 0
s = len(test)
for row in test:
    label = predict(model, row[:-1])
    if label == row[-1]:
        count+=1
print('Accuracy: %s' % (count/s))

Accuracy: 0.4838906424879589
