In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
import numpy as np

In [3]:
from sklearn.naive_bayes import BernoulliNB

In [4]:
from math import exp

## The Data
This dataset is from [Kaggle: Diagnose Specific Language Impairment in Children](https://www.kaggle.com/dgokeeffe/specific-language-impairment/data). It is a collection of three separate datasets which all consist of narratives from a child (adolescent and younger) attempting to complete a wordless picture task. Based on the narratives, features have been given to describe the fluency of their response. The goal is to be able to automate the diagnosis of "Specific Language Impairment" given these features. Specific Language Impairment (SLI) is a condition that effects 7% of 5-year-old children and is characterized by a lack of language ability in comparison to peers but with no obvious mental or physical disability

The dataset contains 1163 instances and 64 features (60 numerical, 4 categorical). A decription of these features can be found [here](https://www.kaggle.com/dgokeeffe/specific-language-impairment).

Class Lables:
0 = No/Subject does not have SLI
1 = Yes/Subject has SLI

In [5]:
df = pd.read_csv("~/Downloads/all_data_R.csv")

In [6]:
df = df.drop('group', axis=1)
df = df.drop('corpus', axis=1)
df = df.drop('filename', axis=1)
df = df.drop('age', axis=1)

### Binarizing dataset

In [7]:
gender_to_int = {"male":0, "female":1}
df["sex"] =df["sex"].replace(gender_to_int)

In [8]:
for i in list(df)[1:]:
    mean = df[i].mean()
    df[i] = np.where(df[i]> mean, 1, 0)

In [9]:
targets = df['Y']
df.drop('Y',axis=1)
train_x, test_x, train_y, test_y = train_test_split(df, targets, test_size = 0.33)
print("TRAINING SET\n")
print(train_x.info(verbose=False), "\n\n")
print("TESTING SET\n")
print(test_x.info(verbose=False))

TRAINING SET

<class 'pandas.core.frame.DataFrame'>
Int64Index: 779 entries, 676 to 329
Columns: 60 entries, Y to total_error
dtypes: int64(60)
memory usage: 371.2 KB
None 


TESTING SET

<class 'pandas.core.frame.DataFrame'>
Int64Index: 384 entries, 804 to 868
Columns: 60 entries, Y to total_error
dtypes: int64(60)
memory usage: 183.0 KB
None


In [10]:
clf1 = BernoulliNB()

In [11]:
clf1 = clf1.fit(train_x,train_y)

In [12]:
tester = clf1.predict(test_x)
print("Accuracy on training set:", accuracy_score(test_y,tester)*100)


Accuracy on training set: 85.9375


In [13]:
mostPosNeg = clf1.predict_proba(test_x)

In [14]:
mostPos = 0
mostPosIndx = 0 
mostNeg = 0
mostNegIndx = 0
for i in range(len(mostPosNeg)):
    if mostPosNeg[i][1] > mostPos:
        mostPos = mostPosNeg[i][1]
        mostPosIndx = i        
    elif mostPosNeg[i][0] > mostNeg:
        mostNeg = mostPosNeg[i][0]
        mostNegIndx = i 

print("The most positive object is ",mostPosIndx)
print("The most negative object is ",mostNegIndx)

The most positive object is  274
The most negative object is  102


In [15]:
xTyT = []
xTyF = []
xFyT = []
xFyF = []
for i in range(len(clf1.feature_log_prob_[1])):
    xTyT.append((exp(clf1.feature_log_prob_[1][i])))
for i in range(len(clf1.feature_log_prob_[0])):
    xTyF.append((exp(clf1.feature_log_prob_[0][i])))
for i in range(len(xTyT)):
    xFyT.append(1 - xTyT[i])
for i in range(len(xTyT)):
    xFyF.append(1 - xTyF[i])

pos_class_ratio = []
for i in range(len(xTyT)):
    pos_class_ratio.append(np.log(xTyT[i]/xFyT[i]))
neg_class_ratio = []
for i in range(len(xTyF)):
    neg_class_ratio.append(np.log(xFyT[i]/xFyF[i]))
total_ratio = [neg_class_ratio,pos_class_ratio]

## 1. The most positive object with respect to the probabilities.

#### a) the total positive log evidence

#### b) the total negative log evidence

In [16]:
positive_log_evidence_1 = 0
positive_evidence_1 = []
negative_log_evidence_1 = 0
negative_evidence_1 = []
for i, objFeatureValue in enumerate(df.loc[mostPosIndx]):
    if (total_ratio[objFeatureValue][i] > 0):
        positive_log_evidence_1 =+ total_ratio[objFeatureValue][i]
        positive_evidence_1.append((i,total_ratio[objFeatureValue][i]))
    else:
        negative_log_evidence_1 =+ total_ratio[objFeatureValue][i]
        negative_evidence_1.append((i,total_ratio[objFeatureValue][i]))
    

In [17]:
print("Total positive log evidence for the most positive object is", positive_log_evidence_1) 
print("Total negative log evidence for the most positive object is", negative_log_evidence_1)

Total positive log evidence for the most positive object is 0.286197519209
Total negative log evidence for the most positive object is -1.01554568997


#### c) probabilty distribution

In [18]:
print("Probability distribution is", mostPosNeg[mostPosIndx])

Probability distribution is [  4.76717253e-10   1.00000000e+00]


#### d) top 3 features values that contribute most to the positive evidence

In [19]:
indx1 = 0
indx2 = 0
indx3 = 0
first = 0
second = 0
third = 0

for i in range(len(positive_evidence_1)):
    if (positive_evidence_1[i][1] > third and positive_evidence_1[i][1] > second and positive_evidence_1[i][1] > first):
        third = second 
        indx3 = indx2
        second = first
        indx2 = indx1
        first = positive_evidence_1[i][1]
        indx1 = positive_evidence_1[i][0]
    elif (positive_evidence_1[i][1] > third and positive_evidence_1[i][1] > second):
        third = second 
        indx3 = indx2
        second = positive_evidence_1[i][1]
        indx2 = positive_evidence_1[i][0]
    elif(positive_evidence_1[i][1] > third):
        third = positive_evidence_1[i][1]
        indx3 = positive_evidence_1[i][0]
        
print("First top feature for positive evidence:" ,list(df)[indx1])
print("Second top feature for positive evidence:" ,list(df)[indx2])
print("Third top feature for positive evidence:" ,list(df)[indx3])

First top feature for positive evidence: mlu_morphemes
Second top feature for positive evidence: verb_utt
Third top feature for positive evidence: dss


#### e) top 3 feature values that contribute the most to the negative evidence.

In [20]:
indx1 = 0
indx2 = 0
indx3 = 0
first = 0
second = 0
third = 0

for i in range(len(negative_evidence_1)):
    if (abs(negative_evidence_1[i][1]) > third and abs(negative_evidence_1[i][1]) > second and abs(negative_evidence_1[i][1]) > first):
        third = second 
        indx3 = indx2
        second = first
        indx2 = indx1
        first = negative_evidence_1[i][1]
        indx1 = negative_evidence_1[i][0]
    elif (abs(negative_evidence_1[i][1]) > third and abs(negative_evidence_1[i][1]) > second):
        third = second 
        indx3 = indx2
        second = negative_evidence_1[i][1]
        indx2 = negative_evidence_1[i][0]
    elif(abs(negative_evidence_1[i][1]) > third):
        third = negative_evidence_1[i][1]
        indx3 = negative_evidence_1[i][0]

print("First top feature for negative evidence:" ,list(df)[indx1])
print("Second top feature for negative evidence:" ,list(df)[indx2])
print("Third top feature for negative evidence:" ,list(df)[indx3])

First top feature for negative evidence: total_error
Second top feature for negative evidence: pro_aux
Third top feature for negative evidence: det_pl_n


## 2. The most negative object with respect to the probabilities.

#### a) the total positive log evidence

#### b) the total negative log evidence

In [21]:
positive_log_evidence_2 = 0
positive_evidence_2 = []
negative_log_evidence_2 = 0
negative_evidence_2 = []
for i, objFeatureValue in enumerate(df.loc[mostNegIndx]):
    if (total_ratio[objFeatureValue][i] > 0):
        positive_log_evidence_2 =+ total_ratio[objFeatureValue][i]
        positive_evidence_2.append((i,total_ratio[objFeatureValue][i]))
    else:
        negative_log_evidence_2 =+ total_ratio[objFeatureValue][i]
        negative_evidence_2.append((i,total_ratio[objFeatureValue][i]))

In [22]:
print("Total positive log evidence for the most negative object is", positive_log_evidence_2) 
print("Total negative log evidence for the most negative object is", negative_log_evidence_2)

Total positive log evidence for the most negative object is 0.215476490795
Total negative log evidence for the most negative object is -0.00196136181343


#### c) probabilty distribution

In [23]:
print("Probability distribution is", mostPosNeg[mostNegIndx])

Probability distribution is [  1.00000000e+00   4.31237495e-12]


#### d) top 3 features values that contribute most to the positive evidence

In [24]:
indx1 = 0
indx2 = 0
indx3 = 0
first = 0
second = 0
third = 0

for i in range(len(positive_evidence_2)):
    if (positive_evidence_2[i][1] > third and positive_evidence_2[i][1] > second and positive_evidence_2[i][1] > first):
        third = second 
        indx3 = indx2
        second = first
        indx2 = indx1
        first = positive_evidence_2[i][1]
        indx1 = positive_evidence_2[i][0]
    elif (positive_evidence_2[i][1] > third and positive_evidence_2[i][1] > second):
        third = second 
        indx3 = indx2
        second = positive_evidence_2[i][1]
        indx2 = positive_evidence_2[i][0]
    elif(positive_evidence_2[i][1] > third):
        third = positive_evidence_2[i][1]
        indx3 = positive_evidence_2[i][0]
        
print("First top feature for positive evidence:" ,list(df)[indx1])
print("Second top feature for positive evidence:" ,list(df)[indx2])
print("Third top feature for positive evidence:" ,list(df)[indx3])

First top feature for positive evidence: ipsyn_total
Second top feature for positive evidence: dss
Third top feature for positive evidence: f_k


#### e) top 3 feature values that contribute the most to the negative evidence.

In [25]:
indx1 = 0
indx2 = 0
indx3 = 0
first = 0
second = 0
third = 0

for i in range(len(negative_evidence_2)):
    if (abs(negative_evidence_2[i][1]) > third and abs(negative_evidence_2[i][1]) > second and abs(negative_evidence_2[i][1]) > first):
        third = second 
        indx3 = indx2
        second = first
        indx2 = indx1
        first = negative_evidence_2[i][1]
        indx1 = negative_evidence_2[i][0]
    elif (abs(negative_evidence_2[i][1]) > third and abs(negative_evidence_2[i][1]) > second):
        third = second 
        indx3 = indx2
        second = negative_evidence_2[i][1]
        indx2 = negative_evidence_2[i][0]
    elif(abs(negative_evidence_2[i][1]) > third):
        third = negative_evidence_2[i][1]
        indx3 = negative_evidence_2[i][0]

print("First top feature for negative evidence:" ,list(df)[indx1])
print("Second top feature for negative evidence:" ,list(df)[indx2])
print("Third top feature for negative evidence:" ,list(df)[indx3])

First top feature for negative evidence: pro_aux
Second top feature for negative evidence: det_pl_n
Third top feature for negative evidence: det_n_pl


## 3. The object that has the largest positive evidence.

In [26]:
positive_log_evidence_tot = []
negative_log_evidence_tot = []
positive_log_evidence_t = 0
negative_log_evidence_t = 0

test = pd.DataFrame(test_x)
for obj in test_x.iterrows():
    for i, objFeatureValue in enumerate(df.loc[obj[0]]):
        if (total_ratio[objFeatureValue][i] > 0):
            positive_log_evidence_t =+ total_ratio[objFeatureValue][i]
        else:
            negative_log_evidence_t =+ total_ratio[objFeatureValue][i]
    positive_log_evidence_tot.append((obj[0],positive_log_evidence_t))
    negative_log_evidence_tot.append((obj[0],negative_log_evidence_t))
    positive_log_evidence_t = 0
    negative_log_evidence_t = 0

most_pos_evidence = 0
most_pos_evidence_indx = 0
most_neg_evidence = 0
most_neg_evidence_indx = 0

index=0
for i in positive_log_evidence_tot:
    index =+1
    if i[1] > most_pos_evidence:
        most_pos_evidence_indx = test.index.get_loc(i[0])
        most_pos_evidence = i[1]

index=0
for i in negative_log_evidence_tot:
    index =+1
    if i[1] < most_neg_evidence:
        most_neg_evidence_indx = test.index.get_loc(i[0])
        most_neg_evidence = i[1]

print("Object with most positive log evidence is ",most_pos_evidence_indx)
print("Object with most negative log evidence is ",most_neg_evidence_indx)

Object with most positive log evidence is  64
Object with most negative log evidence is  110


#### a) the total positive log evidence

#### b) the total negative log evidence

In [27]:
positive_log_evidence_3 = 0
positive_evidence_3 = []
negative_log_evidence_3 = 0
negative_evidence_3 = []
for i, objFeatureValue in enumerate(df.loc[most_pos_evidence_indx]):
    if (total_ratio[objFeatureValue][i] > 0):
        positive_log_evidence_3 =+ total_ratio[objFeatureValue][i]
        positive_evidence_3.append((i,total_ratio[objFeatureValue][i]))
    else:
        negative_log_evidence_3 =+ total_ratio[objFeatureValue][i]
        negative_evidence_3.append((i,total_ratio[objFeatureValue][i]))
    

In [28]:
print("Total positive log evidence for the most negative object is", positive_log_evidence_3) 
print("Total negative log evidence for the most negative object is", negative_log_evidence_3)

Total positive log evidence for the most negative object is 0.215476490795
Total negative log evidence for the most negative object is -0.00196136181343


#### c) probabilty distribution

In [29]:
print("Probability distribution is", mostPosNeg[most_pos_evidence_indx])

Probability distribution is [  9.99999320e-01   6.79714297e-07]


#### d) top 3 features values that contribute most to the positive evidence

In [30]:
indx1 = 0
indx2 = 0
indx3 = 0
first = 0
second = 0
third = 0

for i in range(len(positive_evidence_3)):
    if (positive_evidence_3[i][1] > third and positive_evidence_3[i][1] > second and positive_evidence_3[i][1] > first):
        third = second 
        indx3 = indx2
        second = first
        indx2 = indx1
        first = positive_evidence_3[i][1]
        indx1 = positive_evidence_3[i][0]
    elif (positive_evidence_3[i][1] > third and positive_evidence_3[i][1] > second):
        third = second 
        indx3 = indx2
        second = positive_evidence_3[i][1]
        indx2 = positive_evidence_3[i][0]
    elif(positive_evidence_3[i][1] > third):
        third = positive_evidence_3[i][1]
        indx3 = positive_evidence_3[i][0]
        
print("First top feature for positive evidence:" ,list(df)[indx1])
print("Second top feature for positive evidence:" ,list(df)[indx2])
print("Third top feature for positive evidence:" ,list(df)[indx3])

First top feature for positive evidence: ipsyn_total
Second top feature for positive evidence: regular_past_ed
Third top feature for positive evidence: num_pos_tags


#### e) top 3 feature values that contribute the most to the negative evidence.

In [31]:
indx1 = 0
indx2 = 0
indx3 = 0
first = 0
second = 0
third = 0

for i in range(len(negative_evidence_3)):
    if (abs(negative_evidence_3[i][1]) > third and abs(negative_evidence_3[i][1]) > second and abs(negative_evidence_3[i][1]) > first):
        third = second 
        indx3 = indx2
        second = first
        indx2 = indx1
        first = negative_evidence_3[i][1]
        indx1 = negative_evidence_3[i][0]
    elif (abs(negative_evidence_3[i][1]) > third and abs(negative_evidence_3[i][1]) > second):
        third = second 
        indx3 = indx2
        second = negative_evidence_3[i][1]
        indx2 = negative_evidence_3[i][0]
    elif(abs(negative_evidence_3[i][1]) > third):
        third = negative_evidence_3[i][1]
        indx3 = negative_evidence_3[i][0]

print("First top feature for negative evidence:" ,list(df)[indx1])
print("Second top feature for negative evidence:" ,list(df)[indx2])
print("Third top feature for negative evidence:" ,list(df)[indx3])

First top feature for negative evidence: pro_aux
Second top feature for negative evidence: det_pl_n
Third top feature for negative evidence: n_v


## 4. The object that has the largest (in magnitude) negative evidence.

#### a) the total positive log evidence

#### b) the total negative log evidence

In [32]:
positive_log_evidence_4 = 0
positive_evidence_4 = []
negative_log_evidence_4 = 0
negative_evidence_4 = []
for i, objFeatureValue in enumerate(df.loc[most_neg_evidence_indx]):
    if (total_ratio[objFeatureValue][i] > 0):
        positive_log_evidence_4 =+ total_ratio[objFeatureValue][i]
        positive_evidence_4.append((i,total_ratio[objFeatureValue][i]))
    else:
        negative_log_evidence_4 =+ total_ratio[objFeatureValue][i]
        negative_evidence_4.append((i,total_ratio[objFeatureValue][i]))
    

In [33]:
print("Total positive log evidence for the most negative object is", positive_log_evidence_4) 
print("Total negative log evidence for the most negative object is", negative_log_evidence_4)

Total positive log evidence for the most negative object is 0.215476490795
Total negative log evidence for the most negative object is -0.00196136181343


#### c) probabilty distribution

In [34]:
print("Probability distribution is", mostPosNeg[most_neg_evidence_indx])

Probability distribution is [  9.99998620e-01   1.38041818e-06]


#### d) top 3 features values that contribute most to the positive evidence

In [35]:
indx1 = 0
indx2 = 0
indx3 = 0
first = 0
second = 0
third = 0

for i in range(len(positive_evidence_4)):
    if (positive_evidence_4[i][1] > third and positive_evidence_4[i][1] > second and positive_evidence_4[i][1] > first):
        third = second 
        indx3 = indx2
        second = first
        indx2 = indx1
        first = positive_evidence_4[i][1]
        indx1 = positive_evidence_4[i][0]
    elif (positive_evidence_4[i][1] > third and positive_evidence_4[i][1] > second):
        third = second 
        indx3 = indx2
        second = positive_evidence_4[i][1]
        indx2 = positive_evidence_4[i][0]
    elif(positive_evidence_4[i][1] > third):
        third = positive_evidence_4[i][1]
        indx3 = positive_evidence_4[i][0]
        
print("First top feature for positive evidence:" ,list(df)[indx1])
print("Second top feature for positive evidence:" ,list(df)[indx2])
print("Third top feature for positive evidence:" ,list(df)[indx3])

First top feature for positive evidence: ipsyn_total
Second top feature for positive evidence: dss
Third top feature for positive evidence: regular_past_ed


#### e) top 3 feature values that contribute the most to the negative evidence.

In [36]:
indx1 = 0
indx2 = 0
indx3 = 0
first = 0
second = 0
third = 0

for i in range(len(negative_evidence_4)):
    if (abs(negative_evidence_4[i][1]) > third and abs(negative_evidence_4[i][1]) > second and abs(negative_evidence_4[i][1]) > first):
        third = second 
        indx3 = indx2
        second = first
        indx2 = indx1
        first = negative_evidence_4[i][1]
        indx1 = negative_evidence_4[i][0]
    elif (abs(negative_evidence_4[i][1]) > third and abs(negative_evidence_4[i][1]) > second):
        third = second 
        indx3 = indx2
        second = negative_evidence_4[i][1]
        indx2 = negative_evidence_4[i][0]
    elif(abs(negative_evidence_4[i][1]) > third):
        third = negative_evidence_4[i][1]
        indx3 = negative_evidence_4[i][0]

print("First top feature for negative evidence:" ,list(df)[indx1])
print("Second top feature for negative evidence:" ,list(df)[indx2])
print("Third top feature for negative evidence:" ,list(df)[indx3])

First top feature for negative evidence: pro_aux
Second top feature for negative evidence: det_pl_n
Third top feature for negative evidence: n_v


### 5. The most uncertain object (the probabilities are closest to 0.5)

In [37]:
mostUncertain = 1
mostUncertainIndx = 0 
for i in range(len(mostPosNeg)):
    if abs(mostPosNeg[i][1] - mostPosNeg[i][0]) < abs(mostUncertain):
        mostUncertain = mostPosNeg[i][1] - mostPosNeg[i][0]
        mostUncertainIndx = i        

print("The most uncertain object is ",mostUncertainIndx)

The most uncertain object is  185


#### a) the total positive log evidence

#### b) the total negative log evidence

In [38]:
positive_log_evidence_5 = 0
positive_evidence_5 = []
negative_log_evidence_5 = 0
negative_evidence_5 = []
for i, objFeatureValue in enumerate(df.loc[mostUncertainIndx]):
    if (total_ratio[objFeatureValue][i] > 0):
        positive_log_evidence_5 =+ total_ratio[objFeatureValue][i]
        positive_evidence_5.append((i,total_ratio[objFeatureValue][i]))
    else:
        negative_log_evidence_5 =+ total_ratio[objFeatureValue][i]
        negative_evidence_5.append((i,total_ratio[objFeatureValue][i]))
    

In [39]:
print("Total positive log evidence for the most negative object is", positive_log_evidence_5) 
print("Total negative log evidence for the most negative object is", negative_log_evidence_5)

Total positive log evidence for the most negative object is 0.286197519209
Total negative log evidence for the most negative object is -1.01554568997


#### c) probabilty distribution

In [40]:
print("Probability distribution is", mostPosNeg[mostUncertainIndx])

Probability distribution is [ 0.50186898  0.49813102]


#### d) top 3 features values that contribute most to the positive evidence

In [41]:
indx1 = 0
indx2 = 0
indx3 = 0
first = 0
second = 0
third = 0

for i in range(len(positive_evidence_5)):
    if (positive_evidence_5[i][1] > third and positive_evidence_5[i][1] > second and positive_evidence_5[i][1] > first):
        third = second 
        indx3 = indx2
        second = first
        indx2 = indx1
        first = positive_evidence_5[i][1]
        indx1 = positive_evidence_5[i][0]
    elif (positive_evidence_5[i][1] > third and positive_evidence_5[i][1] > second):
        third = second 
        indx3 = indx2
        second = positive_evidence_5[i][1]
        indx2 = positive_evidence_5[i][0]
    elif(positive_evidence_5[i][1] > third):
        third = positive_evidence_5[i][1]
        indx3 = positive_evidence_5[i][0]
        
print("First top feature for positive evidence:" ,list(df)[indx1])
print("Second top feature for positive evidence:" ,list(df)[indx2])
print("Third top feature for positive evidence:" ,list(df)[indx3])

First top feature for positive evidence: Y
Second top feature for positive evidence: z_mlu_sli
Third top feature for positive evidence: z_mlu_td


In [42]:
indx1 = 0
indx2 = 0
indx3 = 0
first = 0
second = 0
third = 0

for i in range(len(negative_evidence_5)):
    if (abs(negative_evidence_5[i][1]) > third and abs(negative_evidence_5[i][1]) > second and abs(negative_evidence_5[i][1]) > first):
        third = second 
        indx3 = indx2
        second = first
        indx2 = indx1
        first = negative_evidence_5[i][1]
        indx1 = negative_evidence_5[i][0]
    elif (abs(negative_evidence_5[i][1]) > third and abs(negative_evidence_5[i][1]) > second):
        third = second 
        indx3 = indx2
        second = negative_evidence_5[i][1]
        indx2 = negative_evidence_5[i][0]
    elif(abs(negative_evidence_5[i][1]) > third):
        third = negative_evidence_5[i][1]
        indx3 = negative_evidence_5[i][0]

print("First top feature for negative evidence:" ,list(df)[indx1])
print("Second top feature for negative evidence:" ,list(df)[indx2])
print("Third top feature for negative evidence:" ,list(df)[indx3])

First top feature for negative evidence: total_error
Second top feature for negative evidence: pro_aux
Third top feature for negative evidence: det_pl_n
