In [26]:
import pandas as pd
import re
import json

## Load the JSON Lines data and keep only the needed columns

In [38]:
# Path of the file that the loading cell opens and parses one JSON object per line.
file_name = "output.json"

In [39]:
# Rows collected from the file: [source, class-name, confidence, human-annotated].
json_read = []

# The file is read as JSON Lines (one JSON object per line).  JSON interchange
# text is UTF-8, so the encoding is stated instead of relying on the platform
# default.
with open(file_name, encoding="utf-8") as f:
    for line in f:
        # A blank (or trailing empty) line is not a record; json.loads would
        # raise on it, so skip it.
        if not line.strip():
            continue
        data = json.loads(line)

        # Find the annotation entry by content -- the first dict value that
        # carries 'class-name' -- instead of assuming it is always the third
        # key of the record.
        task_name = next(
            (key for key, value in data.items()
             if isinstance(value, dict) and 'class-name' in value),
            None,
        )
        if task_name is None:
            raise KeyError(
                "no entry with a 'class-name' field in record: %r" % (sorted(data),)
            )

        annotation = data[task_name]
        json_read.append([data['source'],
                          annotation['class-name'],
                          annotation['confidence'],
                          annotation['human-annotated']])

## Create a DataFrame and name the columns

In [40]:
# Column labels, listed in the same order as the fields collected per record.
col_names = [
    'source',
    'is_insincere',
    'confidence',
    'human_annotated',
]
df = pd.DataFrame(data=json_read, columns=col_names)

In [41]:
# Size check: json_read gets one entry per parsed line, so rows == records read.
df.shape

(200, 4)

In [42]:
# Preview the first rows of the parsed annotations.
df.head()

Unnamed: 0,source,is_insincere,confidence,human_annotated
0,"""is this question insincere? [question: As a ...",sincere,0.93,yes
1,"""is this question insincere? [question: If yo...",insincere,0.78,yes
2,is this question insincere? [question: Are co...,insincere,0.95,yes
3,is this question insincere? [question: Do you...,sincere,0.95,yes
4,is this question insincere? [question: Why he...,insincere,0.95,yes


## Parse the ID out to match the original data

In [48]:
# NOTE: `id` shadows the built-in id(); the name is kept because the following
# cells (`len(id)` and `df['id'] = id`) read it.
# The pattern captures the text between "[id: " and " ]" in the source string.
pattern = r'.*\[id: (.*) \].*'

# Walk the column's values instead of looking rows up by label with
# df['source'][i]: label lookup raises KeyError as soon as the index is not
# exactly 0..n-1.  Rows where the pattern does not match get '' so the list
# stays aligned with the rows of df.
id = []
for source_text in df['source']:
    extraction_id = re.findall(pattern, source_text)
    id.append(extraction_id[0] if extraction_id else '')


In [49]:
# One entry (possibly '') is appended per row, so this should equal df.shape[0].
len(id)

200

In [50]:
# Attach the extracted ids as a new 'id' column (modifies df in place).
df['id'] = id

In [51]:
# Work on a copy so that later changes to df_label do not alter df.
df_label = df.copy()

In [52]:
# Preview: the 'id' column should now appear next to the annotation fields.
df_label.head()

Unnamed: 0,source,is_insincere,confidence,human_annotated,id
0,"""is this question insincere? [question: As a ...",sincere,0.93,yes,4fdccb3dfb686f3c3ecd
1,"""is this question insincere? [question: If yo...",insincere,0.78,yes,175b23a9b8015fb287c0
2,is this question insincere? [question: Are co...,insincere,0.95,yes,2efa3ddc4b94bb610008
3,is this question insincere? [question: Do you...,sincere,0.95,yes,2039fe8508d2e5faa5a5
4,is this question insincere? [question: Why he...,insincere,0.95,yes,4f8f18e80d3b3c7e4227


## Load the original dataset that has labels

In [53]:
# NOTE: despite its name, df_unlabel holds the original data including the
# 'target' label column.
# qid is the join key, so read it as text up front rather than leaving it to
# type inference; an identifier must never be reinterpreted as a number.
df_unlabel = pd.read_csv('train_quora.csv', dtype={'qid': str})

In [54]:
# Preview the first rows of the original dataset.
df_unlabel.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0.0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0.0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0.0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0.0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0.0


In [55]:
# Size of the original dataset before the join key column is added.
df_unlabel.shape

(444896, 3)

In [56]:
# Mirror qid into a string-typed 'id' column so it can be used as the join key
# against the ids parsed from the annotation sources.
df_unlabel['id'] = df_unlabel['qid'].astype(str)

In [57]:
# Same row count expected, with one more column ('id') than before.
df_unlabel.shape

(444896, 4)

In [58]:
# Preview: 'id' should repeat the qid values.
df_unlabel.head()

Unnamed: 0,qid,question_text,target,id
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0.0,00002165364db923c7e6
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0.0,000032939017120e6e44
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0.0,0000412ca6e4628ce2cf
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0.0,000042bf85aa498cd78e
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0.0,0000455dfa3e01eae3af


## Left Outer join df_label and df_unlabel 

In [59]:
# Left join: keep every annotated row and pull in the matching original row
# (qid, question_text, target) by 'id'; rows without a match get NaN there.
df_merge = pd.merge(df_label, df_unlabel, on='id', how='left')


In [60]:
# Preview the joined rows: annotation fields followed by the original columns.
df_merge.head()

Unnamed: 0,source,is_insincere,confidence,human_annotated,id,qid,question_text,target
0,"""is this question insincere? [question: As a ...",sincere,0.93,yes,4fdccb3dfb686f3c3ecd,4fdccb3dfb686f3c3ecd,"As a South African Cricket fan, what would be ...",0.0
1,"""is this question insincere? [question: If yo...",insincere,0.78,yes,175b23a9b8015fb287c0,175b23a9b8015fb287c0,"If you are a ""content writer"", exactly what ar...",0.0
2,is this question insincere? [question: Are co...,insincere,0.95,yes,2efa3ddc4b94bb610008,2efa3ddc4b94bb610008,Are conservatives just unintelligent liberals?,1.0
3,is this question insincere? [question: Do you...,sincere,0.95,yes,2039fe8508d2e5faa5a5,2039fe8508d2e5faa5a5,Do you still like the Overwatch now?,0.0
4,is this question insincere? [question: Why he...,insincere,0.95,yes,4f8f18e80d3b3c7e4227,4f8f18e80d3b3c7e4227,Why he wanted PM to have sex with pig?,1.0


In [61]:
# A left join keeps every df_label row; a row count above df_label's would
# mean duplicated ids on the right-hand side.
df_merge.shape

(200, 8)

In [73]:
# Discard every row that has a missing value in any column, then renumber the
# rows 0..n-1 without keeping the old index as a column.
df_final = df_merge.dropna().reset_index(drop=True)

In [74]:
# Preview after dropping incomplete rows and renumbering the index.
df_final.head()

Unnamed: 0,source,is_insincere,confidence,human_annotated,id,qid,question_text,target
0,"""is this question insincere? [question: As a ...",sincere,0.93,yes,4fdccb3dfb686f3c3ecd,4fdccb3dfb686f3c3ecd,"As a South African Cricket fan, what would be ...",0.0
1,"""is this question insincere? [question: If yo...",insincere,0.78,yes,175b23a9b8015fb287c0,175b23a9b8015fb287c0,"If you are a ""content writer"", exactly what ar...",0.0
2,is this question insincere? [question: Are co...,insincere,0.95,yes,2efa3ddc4b94bb610008,2efa3ddc4b94bb610008,Are conservatives just unintelligent liberals?,1.0
3,is this question insincere? [question: Do you...,sincere,0.95,yes,2039fe8508d2e5faa5a5,2039fe8508d2e5faa5a5,Do you still like the Overwatch now?,0.0
4,is this question insincere? [question: Why he...,insincere,0.95,yes,4f8f18e80d3b3c7e4227,4f8f18e80d3b3c7e4227,Why he wanted PM to have sex with pig?,1.0


## Encode the label columns 'is_insincere' and 'target' as 0 and 1

In [77]:
# Encode the annotated class explicitly: 'sincere' -> 0, 'insincere' -> 1.
# pd.factorize(..., sort=False) numbers labels in order of first appearance,
# so it only produced this encoding because the first row happened to be
# 'sincere'; with a different row order the two classes would be swapped.
# The 0/1 identity entries make re-running this cell harmless.
label_codes = {'sincere': 0, 'insincere': 1, 0: 0, 1: 1}
unexpected_labels = set(df_final['is_insincere'].unique()) - set(label_codes)
if unexpected_labels:
    # Fail loudly rather than silently turning unknown classes into NaN.
    raise ValueError(
        "unexpected is_insincere labels: %r" % (sorted(unexpected_labels, key=str),)
    )
df_final['is_insincere'] = df_final['is_insincere'].map(label_codes).astype('int64')

In [78]:
# Convert the 0.0 / 1.0 float labels to integers by value.
# pd.factorize(..., sort=True) assigns codes by rank among the values that are
# present, so a frame containing only 1.0 would have been coded as 0.
# Rows with a missing target were already removed by dropna(), so the cast is
# safe.
df_final['target'] = df_final['target'].astype('int64')

In [79]:
# Preview: both label columns should now hold integer codes.
df_final.head()

Unnamed: 0,source,is_insincere,confidence,human_annotated,id,qid,question_text,target
0,"""is this question insincere? [question: As a ...",0,0.93,yes,4fdccb3dfb686f3c3ecd,4fdccb3dfb686f3c3ecd,"As a South African Cricket fan, what would be ...",0
1,"""is this question insincere? [question: If yo...",1,0.78,yes,175b23a9b8015fb287c0,175b23a9b8015fb287c0,"If you are a ""content writer"", exactly what ar...",0
2,is this question insincere? [question: Are co...,1,0.95,yes,2efa3ddc4b94bb610008,2efa3ddc4b94bb610008,Are conservatives just unintelligent liberals?,1
3,is this question insincere? [question: Do you...,0,0.95,yes,2039fe8508d2e5faa5a5,2039fe8508d2e5faa5a5,Do you still like the Overwatch now?,0
4,is this question insincere? [question: Why he...,1,0.95,yes,4f8f18e80d3b3c7e4227,4f8f18e80d3b3c7e4227,Why he wanted PM to have sex with pig?,1


## Load sklearn metrics to Evaluate Data Quality

In [80]:
from sklearn.metrics import confusion_matrix

# Ground truth comes from the original dataset, predictions from the annotations.
actual = df_final.target
predicted = df_final.is_insincere

# Pin the label set so the matrix is always 2x2.  Without `labels`, data that
# contains a single class yields a 1x1 matrix and the four-way unpack fails.
# ravel() flattens row by row: (tn, fp, fn, tp) for labels [0, 1].
tn, fp, fn, tp = confusion_matrix(actual, predicted, labels=[0, 1]).ravel()

In [81]:
# Show the full matrix: rows are actual classes, columns are predicted classes,
# i.e. [[tn, fp], [fn, tp]] as unpacked in the previous cell.
confusion_matrix(actual, predicted)

array([[140,  10],
       [ 13,  37]])

In [82]:
# Report the four confusion-matrix cells, one per line.
for caption, count in (
    ('True Positive: ', tp),
    ('False Positive: ', fp),
    ('True Negative: ', tn),
    ('False Negative: ', fn),
):
    print(caption, count)


True Positive:  37
False Positive:  10
True Negative:  140
False Negative:  13


In [83]:
# Denominators built from the confusion-matrix cells.
total_count = tp + fp + tn + fn
predicted_positive = tp + fp
actual_positive = tp + fn
actual_negative = fp + tn

# Share of all rows where annotation and original label agree.
accuracy = (tp + tn) / total_count
# Of the rows annotated positive, the share that are truly positive.
precision = tp / predicted_positive
# Of the truly positive rows, the share that were annotated positive.
recall = tp / actual_positive
# Harmonic mean of precision and recall.
f1 = 2 * (precision * recall) / (precision + recall)
# Error rates within the actual-positive and actual-negative groups.
false_negative_rate = fn / actual_positive
false_positive_rate = fp / actual_negative

In [85]:
# Report every derived metric, one per line.
for caption, value in (
    ('Accuracy Score :', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 Score:', f1),
    ('False Negative Rate:', false_negative_rate),
    ('False Positive Rate:', false_positive_rate),
):
    print(caption, value)

Accuracy Score : 0.885
Precision: 0.7872340425531915
Recall: 0.74
F1 Score: 0.7628865979381443
False Negative Rate: 0.26
False Positive Rate: 0.06666666666666667
