# Analysing Dataset

In [1]:
import pandas as pd
import pandas_profiling
import requests
from functional import pseq
import pathlib
import os
import json
import time
import datetime

In [2]:
directory_liar_dataset = "../liar_dataset"
directory_statements = f"{directory_liar_dataset}/statements"
directory_visualizations = "../visualizations"

In [16]:
df = pd.concat([pd.read_csv(f"{directory_liar_dataset}/{part}.tsv", sep='\t', header=None) for part in ['train', 'valid']])
df.columns = ['statement_id', 'label', 'statement', 'subject', 'speaker', 'speakers_job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']

df.statement_id = df.statement_id.apply(lambda x: x[:-5])  # remove .json and get just ID

In [4]:
df.head(10)

Unnamed: 0,statement_id,label,statement,subject,speaker,speakers_job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,2635,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
5,12465,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece
6,2342,barely-true,Jim Dunnam has not lived in the district he re...,candidates-biography,republican-party-texas,,Texas,republican,3.0,1.0,1.0,3.0,1.0,a press release.
7,153,half-true,I'm the only person on this stage who has work...,ethics,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,"a Democratic debate in Philadelphia, Pa."
8,5602,half-true,"However, it took $19.5 million in Oregon Lotte...",jobs,oregon-lottery,,,organization,0.0,0.0,1.0,0.0,1.0,a website
9,9741,mostly-true,Says GOP primary opponents Glenn Grothman and ...,"energy,message-machine-2014,voting-record",duey-stroebel,State representative,Wisconsin,republican,0.0,0.0,0.0,1.0,0.0,an online video


In [5]:
# form URL from statement ID
def get_URL(statement_id):
    return f"http://www.politifact.com/api/v/2/statement/{statement_id}/?format=json"

In [6]:
def extract_information(res):
    try:
        author = res['author']

        try:
            if len(author) > 0:
                author = author[0]['name_slug']
            else:
                author = None
        except Exception:
            print(author)

        return {'author_name_slug': author,
                'ruling_date':  res['ruling_date'],
                'statement_date' :res['statement_date'],
                'speaker_current_job': res['speaker']['current_job'],
                'speaker_first_name': res['speaker']['first_name'],
                'speaker_last_name': res['speaker']['last_name'],
                'speaker_home_state': res['speaker']['home_state'],
                'statement_id': res['id']
               }
    except KeyError:
        return {}

In [7]:
#with requests.Session() as session:
#    additional_information = statement_ids.map(lambda sid: session.get(get_URL(sid)))\
#                                          .filter(lambda r: r.ok)\
#                                          .map(lambda r: r.json())\
#                                          .map(extract_information)\
#                                          .to_pandas()

In [8]:
def safe_json_read(f):
    try:
        with open(f, 'r') as fc:
            return json.load(fc)
    except json.JSONDecodeError:
        print(f)
        return {}

In [12]:
additional_information = pseq(pathlib.Path('../liar_dataset/statements/').iterdir())\
                               .map(safe_json_read)\
                               .filter(lambda x: len(x) > 0)\
                               .map(extract_information)\
                               .to_pandas()

additional_information['statement_date'] = pd.to_datetime(additional_information['statement_date'])

../liar_dataset/statements/9.json
../liar_dataset/statements/5355.json


In [17]:
df['statement_id'] = pd.to_numeric(df['statement_id'])
lies = df.merge(additional_information, on='statement_id', how='left')

In [18]:
lies.loc[lies['speaker'] == 'barack-obama', ]['pants_on_fire_counts'].value_counts()

9.0    549
Name: pants_on_fire_counts, dtype: int64

In [19]:
lies[lies['speakers_job_title'].str.contains('County') == True].shape

(265, 21)

In [20]:
lies['statement_date'].describe()

count                   11521
unique                   2832
top       2011-01-25 00:00:00
freq                       22
first     1995-04-01 00:00:00
last      2016-12-11 00:00:00
Name: statement_date, dtype: object

In [21]:
pandas_profiling.ProfileReport(lies)

0,1
Number of variables,21
Number of observations,11524
Total Missing (%),2.4%
Total size in memory,1.9 MiB
Average record size in memory,176.0 B

0,1
Numeric,3
Categorical,14
Boolean,0
Date,1
Text (Unique),0
Rejected,3
Unsupported,0

0,1
Distinct count,293
Unique (%),2.5%
Missing (%),0.2%
Missing (n),26

0,1
louis-jacobson,1204
w-gardner-selby,684
angie-drobnic-holan,539
Other values (289),9071

Value,Count,Frequency (%),Unnamed: 3
louis-jacobson,1204,10.4%,
w-gardner-selby,684,5.9%,
angie-drobnic-holan,539,4.7%,
tom-kertscher,478,4.1%,
jon-greenberg,468,4.1%,
amy-sherman,432,3.7%,
robert-farley,399,3.5%,
c-eugene-emery,357,3.1%,
dave-umhoefer,310,2.7%,
katie-sanders,290,2.5%,

0,1
Distinct count,31
Unique (%),0.3%
Missing (%),0.0%
Missing (n),2
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,11.563
Minimum,0
Maximum,70
Zeros (%),29.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,2
Q3,12
95-th percentile,63
Maximum,70
Range,70
Interquartile range,12

0,1
Standard deviation,18.978
Coef of variation,1.6412
Kurtosis,3.0345
Mean,11.563
MAD,13.728
Skewness,2.0028
Sum,133230
Variance,360.18
Memory size,180.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,3373,29.3%,
1.0,1694,14.7%,
2.0,916,7.9%,
3.0,557,4.8%,
70.0,549,4.8%,
5.0,365,3.2%,
11.0,325,2.8%,
63.0,310,2.7%,
9.0,286,2.5%,
7.0,271,2.4%,

Value,Count,Frequency (%),Unnamed: 3
0.0,3373,29.3%,
1.0,1694,14.7%,
2.0,916,7.9%,
3.0,557,4.8%,
4.0,269,2.3%,

Value,Count,Frequency (%),Unnamed: 3
34.0,195,1.7%,
36.0,107,0.9%,
40.0,266,2.3%,
63.0,310,2.7%,
70.0,549,4.8%,

0,1
Distinct count,4765
Unique (%),41.3%
Missing (%),1.0%
Missing (n),114

0,1
a news release,276
an interview,254
a press release,251
Other values (4761),10629

Value,Count,Frequency (%),Unnamed: 3
a news release,276,2.4%,
an interview,254,2.2%,
a press release,251,2.2%,
a speech,236,2.0%,
a TV ad,200,1.7%,
a tweet,171,1.5%,
a campaign ad,151,1.3%,
a television ad,138,1.2%,
a radio interview,118,1.0%,
a debate,102,0.9%,

0,1
Correlation,0.92129

0,1
Correlation,0.91206

0,1
Distinct count,6
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
half-true,2362
false,2258
mostly-true,2213
Other values (3),4691

Value,Count,Frequency (%),Unnamed: 3
half-true,2362,20.5%,
false,2258,19.6%,
mostly-true,2213,19.2%,
barely-true,1891,16.4%,
true,1845,16.0%,
pants-fire,955,8.3%,

0,1
Correlation,0.98749

0,1
Distinct count,21
Unique (%),0.2%
Missing (%),0.0%
Missing (n),2
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,6.2772
Minimum,0
Maximum,105
Zeros (%),46.2%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,1
Q3,5
95-th percentile,36
Maximum,105
Range,105
Interquartile range,5

0,1
Standard deviation,16.3
Coef of variation,2.5967
Kurtosis,19.986
Mean,6.2772
MAD,8.2352
Skewness,4.3062
Sum,72326
Variance,265.68
Memory size,180.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,5327,46.2%,
1.0,1670,14.5%,
3.0,593,5.1%,
9.0,578,5.0%,
2.0,529,4.6%,
8.0,416,3.6%,
7.0,416,3.6%,
61.0,310,2.7%,
5.0,290,2.5%,
4.0,257,2.2%,

Value,Count,Frequency (%),Unnamed: 3
0.0,5327,46.2%,
1.0,1670,14.5%,
2.0,529,4.6%,
3.0,593,5.1%,
4.0,257,2.2%,

Value,Count,Frequency (%),Unnamed: 3
19.0,195,1.7%,
36.0,88,0.8%,
44.0,71,0.6%,
61.0,310,2.7%,
105.0,165,1.4%,

0,1
Distinct count,25
Unique (%),0.2%
Missing (%),0.0%
Missing (n),2

0,1
republican,5094
democrat,3731
none,1967
Other values (21),730

Value,Count,Frequency (%),Unnamed: 3
republican,5094,44.2%,
democrat,3731,32.4%,
none,1967,17.1%,
organization,237,2.1%,
independent,166,1.4%,
newsmaker,63,0.5%,
libertarian,46,0.4%,
activist,43,0.4%,
journalist,40,0.3%,
columnist,37,0.3%,

0,1
Distinct count,10524
Unique (%),91.3%
Missing (%),0.0%
Missing (n),3

0,1
2008-04-16T00:00:00,7
2008-09-04T00:00:00,7
2008-09-26T00:00:00,7
Other values (10520),11500

Value,Count,Frequency (%),Unnamed: 3
2008-04-16T00:00:00,7,0.1%,
2008-09-04T00:00:00,7,0.1%,
2008-09-26T00:00:00,7,0.1%,
2008-12-03T00:00:00,7,0.1%,
2008-04-17T00:00:00,6,0.1%,
2008-09-03T00:00:00,6,0.1%,
2008-01-11T00:00:00,6,0.1%,
2007-09-10T00:00:00,6,0.1%,
2008-01-16T00:00:00,6,0.1%,
2007-11-26T00:00:00,6,0.1%,

0,1
Distinct count,3127
Unique (%),27.1%
Missing (%),0.0%
Missing (n),2

0,1
barack-obama,549
donald-trump,310
hillary-clinton,266
Other values (3123),10397

Value,Count,Frequency (%),Unnamed: 3
barack-obama,549,4.8%,
donald-trump,310,2.7%,
hillary-clinton,266,2.3%,
mitt-romney,195,1.7%,
john-mccain,168,1.5%,
scott-walker,166,1.4%,
chain-email,165,1.4%,
rick-perry,156,1.4%,
marco-rubio,139,1.2%,
rick-scott,129,1.1%,

0,1
Distinct count,1264
Unique (%),11.0%
Missing (%),0.0%
Missing (n),3

0,1
,3344
President,865
U.S. Senator,547
Other values (1260),6765

Value,Count,Frequency (%),Unnamed: 3
,3344,29.0%,
President,865,7.5%,
U.S. Senator,547,4.7%,
U.S. senator,291,2.5%,
Presidential candidate,282,2.4%,
Governor,276,2.4%,
Former governor,196,1.7%,
U.S. Representative,179,1.6%,
Senator,177,1.5%,
Milwaukee County Executive,166,1.4%,

0,1
Distinct count,970
Unique (%),8.4%
Missing (%),0.0%
Missing (n),3

0,1
,1779
Barack,551
John,450
Other values (966),8741

Value,Count,Frequency (%),Unnamed: 3
,1779,15.4%,
Barack,551,4.8%,
John,450,3.9%,
Rick,347,3.0%,
Donald,316,2.7%,
Hillary,267,2.3%,
Scott,204,1.8%,
Mitt,196,1.7%,
Bill,194,1.7%,
Chris,184,1.6%,

0,1
Distinct count,88
Unique (%),0.8%
Missing (%),0.0%
Missing (n),3

0,1
,2505
Texas,1135
Florida,1112
Other values (84),6769

Value,Count,Frequency (%),Unnamed: 3
,2505,21.7%,
Texas,1135,9.8%,
Florida,1112,9.6%,
Wisconsin,782,6.8%,
New York,740,6.4%,
Illinois,627,5.4%,
Ohio,518,4.5%,
Georgia,490,4.3%,
Virginia,461,4.0%,
Rhode Island,410,3.6%,

0,1
Distinct count,2665
Unique (%),23.1%
Missing (%),0.0%
Missing (n),3

0,1
Obama,560
Trump,310
Clinton,307
Other values (2661),10344

Value,Count,Frequency (%),Unnamed: 3
Obama,560,4.9%,
Trump,310,2.7%,
Clinton,307,2.7%,
Romney,196,1.7%,
McCain,167,1.4%,
Walker,166,1.4%,
Chain email,165,1.4%,
Perry,161,1.4%,
Scott,148,1.3%,
Rubio,139,1.2%,

0,1
Distinct count,1271
Unique (%),11.0%
Missing (%),28.1%
Missing (n),3242

0,1
President,553
U.S. Senator,540
Governor,432
Other values (1267),6757
(Missing),3242

Value,Count,Frequency (%),Unnamed: 3
President,553,4.8%,
U.S. Senator,540,4.7%,
Governor,432,3.7%,
President-Elect,310,2.7%,
U.S. senator,292,2.5%,
Presidential candidate,282,2.4%,
Former governor,195,1.7%,
U.S. Representative,190,1.6%,
Senator,176,1.5%,
Milwaukee County Executive,166,1.4%,

0,1
Distinct count,86
Unique (%),0.7%
Missing (%),21.6%
Missing (n),2487

0,1
Texas,1136
Florida,1114
Wisconsin,799
Other values (82),5988
(Missing),2487

Value,Count,Frequency (%),Unnamed: 3
Texas,1136,9.9%,
Florida,1114,9.7%,
Wisconsin,799,6.9%,
New York,744,6.5%,
Illinois,624,5.4%,
Ohio,518,4.5%,
Georgia,489,4.2%,
Virginia,461,4.0%,
Rhode Island,410,3.6%,
New Jersey,275,2.4%,

0,1
Distinct count,11502
Unique (%),99.8%
Missing (%),0.0%
Missing (n),0

0,1
On changing the rules for filibusters on presidential nominees,3
On abortion,2
Social Security is a Ponzi scheme.,2
Other values (11499),11517

Value,Count,Frequency (%),Unnamed: 3
On changing the rules for filibusters on presidential nominees,3,0.0%,
On abortion,2,0.0%,
Social Security is a Ponzi scheme.,2,0.0%,
On the Trans-Pacific Partnership.,2,0.0%,
On support for the Export-Import Bank,2,0.0%,
On high-speed rail.,2,0.0%,
"Obama says Iran is a 'tiny' country, 'doesn't pose a serious threat.'",2,0.0%,
On support for gay marriage.,2,0.0%,
Says Mitt Romney flip-flopped on abortion.,2,0.0%,
On repealing the 17th Amendment,2,0.0%,

0,1
Distinct count,2833
Unique (%),24.6%
Missing (%),0.0%
Missing (n),3
Infinite (%),0.0%
Infinite (n),0

0,1
Minimum,1995-04-01 00:00:00
Maximum,2016-12-11 00:00:00

0,1
Distinct count,11524
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,6776.6
Minimum,1
Maximum,13531
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,668.15
Q1,3386.8
Median,6819.5
Q3,10148.0
95-th percentile,12864.0
Maximum,13531.0
Range,13530.0
Interquartile range,6761.5

0,1
Standard deviation,3900.1
Coef of variation,0.57553
Kurtosis,-1.2022
Mean,6776.6
MAD,3381.3
Skewness,-0.0025091
Sum,78093439
Variance,15211000
Memory size,500.1 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
9542,1,0.0%,
1346,1,0.0%,
7489,1,0.0%,
5440,1,0.0%,
11583,1,0.0%,
9534,1,0.0%,
3387,1,0.0%,
1338,1,0.0%,
7481,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,1,0.0%,
2,1,0.0%,
4,1,0.0%,
5,1,0.0%,
7,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
13525,1,0.0%,
13528,1,0.0%,
13529,1,0.0%,
13530,1,0.0%,
13531,1,0.0%,

0,1
Distinct count,4191
Unique (%),36.4%
Missing (%),0.0%
Missing (n),2

0,1
health-care,434
taxes,337
immigration,285
Other values (4187),10466

Value,Count,Frequency (%),Unnamed: 3
health-care,434,3.8%,
taxes,337,2.9%,
immigration,285,2.5%,
education,279,2.4%,
elections,273,2.4%,
candidates-biography,209,1.8%,
economy,149,1.3%,
"economy,jobs",141,1.2%,
guns,141,1.2%,
federal-budget,131,1.1%,

Unnamed: 0,statement_id,label,statement,subject,speaker,speakers_job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,author_name_slug,ruling_date,speaker_current_job,speaker_first_name,speaker_home_state,speaker_last_name,statement_date
0,2635,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,,,,,,,NaT
1,10540,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,sean-gorman,2015-02-23T00:00:00,State delegate,Scott,Virginia,Surovell,2015-02-04
2,324,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,angie-drobnic-holan,2008-02-01T00:00:00,President,Barack,Illinois,Obama,2008-01-30
3,1123,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,louis-jacobson,2009-08-07T12:13:20,,,,Bloggers,2009-08-04
4,9028,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,amy-sherman,2014-03-11T15:57:54,,Charlie,Florida,Crist,2014-03-09


# federal election results

In [22]:
pd.options.display.max_colwidth = 300
pd.options.display.max_columns = 300

In [136]:
from itertools import product
from functools import reduce

In [131]:
def add_ending(f):
    if '2016' in f:
        return f"{f}x"
    else:
        return f
    
# TODO do 2012 it's a special snowflake
election_files = [(add_ending(f'../data/election_results/federalelections{year}.xls'), year) for year in [2014, 2016]]

In [145]:
election_results_cols_of_interest = ['CANDIDATE NAME', 'PRIMARY VOTES', 'PRIMARY %']

def fix_columns_election_results(df, year, type_):
    df = df.loc[:, election_results_cols_of_interest]
    df[f'primary_votes_{type_.lower()}_{year}'] = df['PRIMARY VOTES']
    df[f'primary_votes_{type_.lower()}_{year}_pct'] = df['PRIMARY %']
    return df.drop(columns=['PRIMARY VOTES', 'PRIMARY %'])


def get_only_voting_results(df):
    return df.loc[df['CANDIDATE NAME'].notna() & df['PRIMARY VOTES'].notna() & df['CANDIDATE NAME'].ne('Scattered') & df['CANDIDATE NAME'].ne('All Others'), :]


def prep_election_results(df, year, type_):
    return fix_columns_election_results(get_only_voting_results(df), year, type_)

In [146]:
election_results = [prep_election_results(pd.read_excel(f, sheet_name=f'{year} US {type_} Results by State'), year, type_) for (f, year), type_ in product(election_files, ['Senate', 'House'])]

election_results = reduce(lambda acc, el: pd.merge(acc, el, on='CANDIDATE NAME', how='outer'), election_results)

In [148]:
# yeah ... let's see how many we can join. the one letter endings might be a problem
election_results['CANDIDATE NAME'].value_counts()

Collins, Chris            12
Reed, Thomas W., II       12
Tonko, Paul D.            12
Maloney, Sean Patrick     12
Katko, John M.            12
Assini, Mark W.            8
Stefanik, Elise M.         8
Nadler, Jerrold L.         6
Crowley, Joseph            6
Slaughter, Louise M.       6
Engel, Eliot L.            6
Higgins, Brian             6
Lowey, Nita M.             6
Zeldin, Lee M.             6
Clarke, Yvette D.          4
Martin, Andy               4
Gregory, Du Wayne          4
Kuster, Ann McLane         4
Donovan, Dan               4
Jeffries, Hakeem S.        4
Schumer, Charles E.        4
Rice, Kathleen M.          4
Faso, John J.              4
Rubens, Jim                4
Maloney, Carolyn B.        4
Kelly, Walter W.           4
King, Peter T.             3
Long, Wendy                3
Oliva, Phil                3
Gurfein, David H.          3
                          ..
Hardin, Chris              1
Grier, Michael, Jr.        1
Hinz, Lynette "Moreno"     1
Macko, David  

In [29]:
# we are only interest in people and they have a first name
lies = lies.loc[lies['speaker_first_name'].notnull(), :]

In [30]:
# to aggregate the statements
lies['statement_year'] = lies['statement_date'].dt.year

# for the merging
lies['speaker_full_name'] = lies['speaker_last_name'] + ', ' + lies['speaker_first_name']

In [162]:
# todo expand this
# is it really houseman? probably not...
_job_titles_of_interest = [('senat', 'senator'), ('governor', None), ('congress', 'congressman'), ('mayor', None), ('president', None), ('house', 'houseman'), ('rep', 'houseman')]
job_titles_of_interest = [out if out is not None else j for j, out in _job_titles_of_interest]

def cleaned_job_title(jt):
    jt = str(jt).lower()
    
    for j, out in _job_titles_of_interest:
        if j in jt:
            return out if out is not None else j
    else:
        return jt

lies['speakers_job_title_cleaned'] = lies['speakers_job_title'].apply(cleaned_job_title)

In [163]:
_t = lies.merge(election_results, left_on='speaker_full_name', right_on='CANDIDATE NAME', how='outer')

In [164]:
print(f"found election results for {_t['CANDIDATE NAME'].notnull().sum()} ({_t['CANDIDATE NAME'].notnull().mean()}%) people")

found election results for 4952 (0.328403740301081%) people


In [159]:
useful_idx = reduce(lambda acc, el: acc | el, [_t[c].notnull() for c in _t.columns if 'votes' in c]) & _t['speaker'].notnull()

print(f"found useful results for {useful_idx.sum()} people")

columns_of_interest = ['label', 'subject', 'speaker', 'speakers_job_title_cleaned', 'state_info', 'party_affiliation', 'context', 'statement_date'] + [c for c in _t.columns if 'votes' in c]
_t.loc[useful_idx, columns_of_interest]

found useful results for 1461 people


Unnamed: 0,label,subject,speaker,speakers_job_title_cleaned,state_info,party_affiliation,context,statement_date,primary_votes_senate_2014,primary_votes_senate_2014_pct,primary_votes_house_2014,primary_votes_house_2014_pct,primary_votes_senate_2016,primary_votes_senate_2016_pct,primary_votes_house_2016,primary_votes_house_2016_pct
1115,half-true,"financial-regulation,foreign-policy,water",gwen-moore,houseman,Wisconsin,democrat,a congressional hearing,2016-04-27,,,52413,0.709127,,,,
1116,mostly-true,"crime,criminal-justice,women",gwen-moore,houseman,Wisconsin,democrat,a letter to congressional leadership,2012-12-11,,,52413,0.709127,,,,
1117,barely-true,"city-government,civil-rights,criminal-justice,public-safety,state-budget",gwen-moore,houseman,Wisconsin,democrat,comments to reporters,2015-08-26,,,52413,0.709127,,,,
1118,true,"city-government,infrastructure,transportation",gwen-moore,houseman,Wisconsin,democrat,a newspaper opinion piece,2015-01-14,,,52413,0.709127,,,,
1119,false,"children,crime,criminal-justice,sexuality",gwen-moore,houseman,Wisconsin,democrat,remarks on the House floor,2014-05-29,,,52413,0.709127,,,,
1120,pants-fire,"corrections-and-updates,health-care,women",gwen-moore,houseman,Wisconsin,democrat,tweets,2011-12-27,,,52413,0.709127,,,,
1121,true,"civil-rights,crime,criminal-justice,legal-issues",gwen-moore,houseman,Wisconsin,democrat,an interview,2016-11-18,,,52413,0.709127,,,,
1122,mostly-true,"crime,guns,terrorism,transportation",gwen-moore,houseman,Wisconsin,democrat,a tweet,2016-06-23,,,52413,0.709127,,,,
1123,barely-true,"economy,health-care,medicaid,medicare",gwen-moore,houseman,Wisconsin,democrat,a radio interview,2014-11-11,,,52413,0.709127,,,,
1124,false,"labor,state-budget,state-finances",gwen-moore,houseman,Wisconsin,democrat,a speech at a state Democratic Party dinner,2011-04-30,,,52413,0.709127,,,,


In [160]:
_t.loc[useful_idx, 'speakers_job_title_cleaned'].value_counts()

senator                                                                                                 580
milwaukee county executive                                                                              166
u.s. representative                                                                                     156
houseman                                                                                                149
nan                                                                                                     101
congressman                                                                                              69
governor                                                                                                 41
u.s. representative                                                                                      23
representative from ohio's ninth congressional district                                                  14
small business owner        

In [92]:
_t.loc[_t['speakers_job_title_cleaned'].isin(job_titles_of_interest), columns_of_interest]

Unnamed: 0,label,subject,speaker,speakers_job_title_cleaned,state_info,party_affiliation,context,statement_date,PRIMARY VOTES
1,mostly-true,foreign-policy,barack-obama,president,Illinois,democrat,Denver,2008-01-30,
2,half-true,ethics,barack-obama,president,Illinois,democrat,"a Democratic debate in Philadelphia, Pa.",2007-10-30,
3,true,federal-budget,barack-obama,president,Illinois,democrat,a radio ad,2008-08-12,
4,mostly-true,"deficit,federal-budget,health-care",barack-obama,president,Illinois,democrat,a speech at Northwestern University,2014-10-02,
5,true,abortion,barack-obama,president,Illinois,democrat,an e-mail message sent to voters before Super Tuesday.,2008-02-04,
6,true,"economy,jobs",barack-obama,president,Illinois,democrat,the State of the Union address,2015-01-20,
7,true,ethics,barack-obama,president,Illinois,democrat,an online ad.,2008-04-17,
8,barely-true,"job-accomplishments,taxes",barack-obama,president,Illinois,democrat,a speech at a campaign fund-raiser for Tom Barrett's gubernatorial campaign,2010-08-16,
9,mostly-true,"economy,federal-budget",barack-obama,president,Illinois,democrat,his State of the Union speech,2010-01-27,
10,half-true,foreign-policy,barack-obama,president,Illinois,democrat,a mailer to Pennsylvania voters,2008-04-17,


In [161]:
def label_to_nb(l): 
    return ['true', 'mostly-true', 'half-true', 'barely-true', 'false', 'pants-fire'].index(l)

df['label_as_nb'] = df['label'].apply(label_to_nb)

In [168]:
median_speaker_value = df.groupby(['statement_year', 'speaker'])['label_as_nb'].median().reset_index()

In [169]:
median_speaker_value[median_speaker_value['statement_year'] == 2016]

Unnamed: 0,statement_year,speaker,label_as_nb
4334,2016.0,18-percent-american-public,0.0
4335,2016.0,Jack_Graham,4.0
4336,2016.0,aclu-north-carolina,0.0
4337,2016.0,actionaid-k,1.5
4338,2016.0,afscme,2.0
4339,2016.0,afscme-people,1.0
4340,2016.0,al-gore,2.0
4341,2016.0,alan-grayson,1.5
4342,2016.0,alcee-hastings,5.0
4343,2016.0,alex-castellanos,3.0


## One row analysis

Let's analyse first row, statement with id `1`. What is the information we get there?

In [17]:
sid = '1'

In [18]:
df[df.statement_id == sid]

Unnamed: 0,statement_id,label,statement,subject,speaker,speakers_job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
9403,1,pants-fire,The attorney general requires that rape victim...,"crime,women",barbara-ann-radnofsky,,,democrat,0.0,0.0,0.0,0.0,1.0,in a Web site video


In [19]:
with open(f"{directory_statements}/{sid}.json", "r") as f:
    data = json.load(f)
data

{'art': [{'brightcove': '',
   'caption': "We're Going to Need a Bigger List",
   'id': 257,
   'infogram': '',
   'ndn': '',
   'ndnid': '',
   'other': '',
   'photo': None,
   'resource_type': {'id': 2,
    'name': 'YouTube',
    'resource_uri': '/api/v/2/mediatype/2/'},
   'resource_uri': '/api/v/2/media/257/',
   'title': 'Radnofsky video',
   'wibbitz': '',
   'youtube': '<iframe src="//www.youtube.com/embed/OrKDN_FL4iM?rel=0&wmode=opaque" frameborder="0" allowfullscreen></iframe>',
   'youtubeID': 'OrKDN_FL4iM'}],
 'author': [{'email_address': 'mashford-grooms@statesman.com',
   'first_name': 'Meghan',
   'id': 118,
   'last_name': 'Ashford-Grooms',
   'name_slug': 'meghan-ashford-grooms',
   'on_staff_page': None,
   'phone': None,
   'photo': None,
   'publication': {'id': 3,
    'publication_name': 'Austin American-Statesman',
    'resource_uri': '/api/v/2/publication/3/'},
   'resource_uri': '/api/v/2/staffer/118/',
   'title': 'PolitiFact Texas staff writer',
   'twitter': 

Just to visualize JSON hierarchy, run the following cell:

In [38]:
def go_further(dic, name):
    dict_vis = {"name": name, "children": []}
    for k, v in dic.items():
        if type(v) == str:
            new_el = {"name": k}
        elif type(v) == list:
            if len(v) > 0:
                new_el = go_further(v[0], k)
        elif type(v) == dict:
            new_el = go_further(v, k)
        else:
            new_el = {"name": k}
        dict_vis["children"].append(new_el)
        
    return dict_vis

my_dict = go_further(data, name="statement_info")

with open(f"{directory_visualizations}/data.json", "w") as f:
    json.dump(my_dict, f)

print(f"Checkout visualization by: \n1) cd ../visualizations \n2) python -m http.server \n3) in browser, open: http://localhost:8000/")

Checkout visualization by: 
1) cd ../visualizations 
2) python -m http.server 
3) in browser, open: http://localhost:8000/
