# Analysing Dataset

In [1]:
import pandas as pd
import pandas_profiling
import requests
import os
import json
import time
import datetime

In [28]:
directory_liar_dataset = "../liar_dataset"
directory_statements = f"{directory_liar_dataset}/statements"
directory_visualizations = "../visualizations"

In [3]:
df_train = pd.read_csv(f"{directory_liar_dataset}/train.tsv", sep='\t')
df_train.columns = ['statement_id', 'label', 'statement', 'subject', 'speaker', 'speakers_job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']

In [4]:
df_valid = pd.read_csv(f"{directory_liar_dataset}/valid.tsv", sep='\t')
df_valid.columns = ['statement_id', 'label', 'statement', 'subject', 'speaker', 'speakers_job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']

In [5]:
df = pd.concat([df_train, df_valid],ignore_index=True)
df.statement_id = df.statement_id.apply(lambda x: x[:-5])  # remove .json and get just ID

In [6]:
df.head(3)

Unnamed: 0,statement_id,label,statement,subject,speaker,speakers_job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,10540,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
1,324,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
2,1123,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release


## One row analysis

Let's analyse first row, statement with id `1`. What is the information we get there?

In [17]:
sid = '1'

In [18]:
df[df.statement_id == sid]

Unnamed: 0,statement_id,label,statement,subject,speaker,speakers_job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
9403,1,pants-fire,The attorney general requires that rape victim...,"crime,women",barbara-ann-radnofsky,,,democrat,0.0,0.0,0.0,0.0,1.0,in a Web site video


In [19]:
with open(f"{directory_statements}/{sid}.json", "r") as f:
    data = json.load(f)
data

{'art': [{'brightcove': '',
   'caption': "We're Going to Need a Bigger List",
   'id': 257,
   'infogram': '',
   'ndn': '',
   'ndnid': '',
   'other': '',
   'photo': None,
   'resource_type': {'id': 2,
    'name': 'YouTube',
    'resource_uri': '/api/v/2/mediatype/2/'},
   'resource_uri': '/api/v/2/media/257/',
   'title': 'Radnofsky video',
   'wibbitz': '',
   'youtube': '<iframe src="//www.youtube.com/embed/OrKDN_FL4iM?rel=0&wmode=opaque" frameborder="0" allowfullscreen></iframe>',
   'youtubeID': 'OrKDN_FL4iM'}],
 'author': [{'email_address': 'mashford-grooms@statesman.com',
   'first_name': 'Meghan',
   'id': 118,
   'last_name': 'Ashford-Grooms',
   'name_slug': 'meghan-ashford-grooms',
   'on_staff_page': None,
   'phone': None,
   'photo': None,
   'publication': {'id': 3,
    'publication_name': 'Austin American-Statesman',
    'resource_uri': '/api/v/2/publication/3/'},
   'resource_uri': '/api/v/2/staffer/118/',
   'title': 'PolitiFact Texas staff writer',
   'twitter': 

Just to visualize JSON hierarchy, run the following cell:

In [38]:
def go_further(dic, name):
    dict_vis = {"name": name, "children": []}
    for k, v in dic.items():
        if type(v) == str:
            new_el = {"name": k}
        elif type(v) == list:
            if len(v) > 0:
                new_el = go_further(v[0], k)
        elif type(v) == dict:
            new_el = go_further(v, k)
        else:
            new_el = {"name": k}
        dict_vis["children"].append(new_el)
        
    return dict_vis

my_dict = go_further(data, name="statement_info")

with open(f"{directory_visualizations}/data.json", "w") as f:
    json.dump(my_dict, f)

print(f"Checkout visualization by: \n1) cd ../visualizations \n2) python -m http.server \n3) in browser, open: http://localhost:8000/")

Checkout visualization by: 
1) cd ../visualizations 
2) python -m http.server 
3) in browser, open: http://localhost:8000/
