In [1]:
#basic libraries
import os
import pandas as pd 
import numpy as np
import requests
import json
import glob
import pickle

#Visiualization and ML libraries
import sklearn
import matplotlib.pyplot as plt

from scipy.stats import norm
import statistics
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from faKy import values_by_label, compute_statistics

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sandrobarreshamers/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
df = pd.read_table('https://raw.githubusercontent.com/tfs4/liar_dataset/master/train.tsv')
df.columns =['json_id', 'claim', 'object', 'topic', 'speaker', 'job_title (string)','state_info (string)','party_affiliation (string)','barely_true_counts (float32)','false_counts (float32)','half_true_counts (float32)','	mostly_true_counts (float32)','	pants_on_fire_counts (float32)','context (string)']
df2 = df[['json_id', 'claim','object']].copy()

In [3]:
df_test = df2.head(10)
print(df_test)

      json_id        claim                                             object
0  10540.json    half-true  When did the decline of coal start? It started...
1    324.json  mostly-true  Hillary Clinton agrees with John McCain "by vo...
2   1123.json        false  Health care reform legislation is likely to ma...
3   9028.json    half-true  The economic turnaround started at the end of ...
4  12465.json         true  The Chicago Bears have had more starting quart...
5   2342.json  barely-true  Jim Dunnam has not lived in the district he re...
6    153.json    half-true  I'm the only person on this stage who has work...
7   5602.json    half-true  However, it took $19.5 million in Oregon Lotte...
8   9741.json  mostly-true  Says GOP primary opponents Glenn Grothman and ...
9   7115.json  mostly-true  For the first time in history, the share of th...


In [4]:
from faKy import process_text_complexity


df_test['compressed_size'] = df_test['object'].apply(process_text_complexity)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['compressed_size'] = df_test['object'].apply(process_text_complexity)


In [5]:
from faKy import process_text_readability
df_test['readability'] = df_test['object'].apply(process_text_readability)
df_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['readability'] = df_test['object'].apply(process_text_readability)


Unnamed: 0,json_id,claim,object,compressed_size,readability
0,10540.json,half-true,When did the decline of coal start? It started...,11444,71.815
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",9090,71.781579
2,1123.json,false,Health care reform legislation is likely to ma...,5512,53.655
3,9028.json,half-true,The economic turnaround started at the end of ...,4709,61.325
4,12465.json,true,The Chicago Bears have had more starting quart...,12204,54.096667


In [6]:
from faKy import process_text_vader
df_test[['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']] = df_test['object'].apply(lambda x: pd.Series(process_text_vader(x)))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']] = df_test['object'].apply(lambda x: pd.Series(process_text_vader(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']] = df_test['object'].apply(lambda x: pd.Series(process_text_vader(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-d

In [7]:
df_test.head()

Unnamed: 0,json_id,claim,object,compressed_size,readability,vader_neg,vader_neu,vader_pos,vader_compound
0,10540.json,half-true,When did the decline of coal start? It started...,11444,71.815,0.0,0.902,0.098,0.3612
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",9090,71.781579,0.107,0.687,0.206,0.3182
2,1123.json,false,Health care reform legislation is likely to ma...,5512,53.655,0.0,0.606,0.394,0.7579
3,9028.json,half-true,The economic turnaround started at the end of ...,4709,61.325,0.0,1.0,0.0,0.0
4,12465.json,true,The Chicago Bears have had more starting quart...,12204,54.096667,0.119,0.828,0.053,-0.4601


In [8]:
from faKy import count_named_entities,count_ner_labels, create_input_vector_NER, ner_labels

df_test['tot_ner_count'] = df_test['object'].apply(count_named_entities)
df_test['ner_counts'] = df_test['object'].apply(count_ner_labels)
df_test['input_vector_ner'] = df_test['ner_counts'].apply(create_input_vector_NER)

for tag in ner_labels:
    col_name = f'NER_{tag}'
    df_test[col_name] = df_test['input_vector_ner'].apply(lambda x: x[ner_labels.index(tag)] if tag in ner_labels else 0)
df_test.columns



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['tot_ner_count'] = df_test['object'].apply(count_named_entities)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['ner_counts'] = df_test['object'].apply(count_ner_labels)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['input_vector_ner'] = df_test['ner_counts'].apply(create_i

Index(['json_id', 'claim', 'object', 'compressed_size', 'readability',
       'vader_neg', 'vader_neu', 'vader_pos', 'vader_compound',
       'tot_ner_count', 'ner_counts', 'input_vector_ner', 'NER_PERSON',
       'NER_ORG', 'NER_GPE', 'NER_DATE', 'NER_NORP', 'NER_CARDINAL'],
      dtype='object')

In [9]:
df_test.head()

Unnamed: 0,json_id,claim,object,compressed_size,readability,vader_neg,vader_neu,vader_pos,vader_compound,tot_ner_count,ner_counts,input_vector_ner,NER_PERSON,NER_ORG,NER_GPE,NER_DATE,NER_NORP,NER_CARDINAL
0,10540.json,half-true,When did the decline of coal start? It started...,11444,71.815,0.0,0.902,0.098,0.3612,2,{'PERSON': 2},"[2.0, 0.0, 0.0, 0.0, 0.0, 0.0]",2.0,0.0,0.0,0.0,0.0,0.0
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",9090,71.781579,0.107,0.687,0.206,0.3182,4,"{'PERSON': 3, 'GPE': 1}","[3.0, 0.0, 1.0, 0.0, 0.0, 0.0]",3.0,0.0,1.0,0.0,0.0,0.0
2,1123.json,false,Health care reform legislation is likely to ma...,5512,53.655,0.0,0.606,0.394,0.7579,0,{},"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0,0.0,0.0,0.0,0.0
3,9028.json,half-true,The economic turnaround started at the end of ...,4709,61.325,0.0,1.0,0.0,0.0,0,{},"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0,0.0,0.0,0.0,0.0
4,12465.json,true,The Chicago Bears have had more starting quart...,12204,54.096667,0.119,0.828,0.053,-0.4601,3,"{'ORG': 1, 'DATE': 2}","[0.0, 1.0, 0.0, 2.0, 0.0, 0.0]",0.0,1.0,0.0,2.0,0.0,0.0


In [14]:
from faKy import count_pos, create_input_vector_pos, pos_tags
df_test['pos counts'] = df_test['object'].apply(count_pos)
df_test['input_vector_pos'] = df_test['pos counts'].apply(create_input_vector_pos)
for tag in pos_tags:
    col_name = f'pos_{tag}'
    df_test[col_name] = df_test['input_vector_pos'].apply(lambda x: x[pos_tags.index(tag)] if tag in pos_tags else 0)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['pos counts'] = df_test['object'].apply(count_pos)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['input_vector_pos'] = df_test['pos counts'].apply(create_input_vector_pos)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col_name] = df_test['input_vector_pos'].apply(lambda x:

In [15]:
df_test.columns

Index(['json_id', 'claim', 'object', 'compressed_size', 'readability',
       'vader_neg', 'vader_neu', 'vader_pos', 'vader_compound',
       'tot_ner_count', 'ner_counts', 'input_vector_ner', 'NER_PERSON',
       'NER_ORG', 'NER_GPE', 'NER_DATE', 'NER_NORP', 'NER_CARDINAL',
       'pos counts', 'input_vector_pos', 'pos_ADJ', 'pos_ADP', 'pos_ADV',
       'pos_AUX', 'pos_CCONJ', 'pos_DET', 'pos_INTJ', 'pos_NOUN', 'pos_NUM',
       'pos_PART', 'pos_PRON', 'pos_PROPN', 'pos_PUNCT', 'pos_SCONJ',
       'pos_SYM', 'pos_VERB', 'pos_X'],
      dtype='object')

In [16]:
df_test.head()

Unnamed: 0,json_id,claim,object,compressed_size,readability,vader_neg,vader_neu,vader_pos,vader_compound,tot_ner_count,...,pos_NOUN,pos_NUM,pos_PART,pos_PRON,pos_PROPN,pos_PUNCT,pos_SCONJ,pos_SYM,pos_VERB,pos_X
0,10540.json,half-true,When did the decline of coal start? It started...,11444,71.815,0.0,0.902,0.098,0.3612,2,...,5.0,0.0,1.0,1.0,4.0,4.0,0.0,0.0,4.0,0.0
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",9090,71.781579,0.107,0.687,0.206,0.3182,4,...,2.0,0.0,1.0,0.0,7.0,3.0,0.0,0.0,3.0,0.0
2,1123.json,false,Health care reform legislation is likely to ma...,5512,53.655,0.0,0.606,0.394,0.7579,0,...,7.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,9028.json,half-true,The economic turnaround started at the end of ...,4709,61.325,0.0,1.0,0.0,0.0,0,...,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,12465.json,true,The Chicago Bears have had more starting quart...,12204,54.096667,0.119,0.828,0.053,-0.4601,3,...,5.0,2.0,0.0,0.0,4.0,3.0,1.0,0.0,3.0,0.0
