# Numerical Fact Checking System. University of Sheffield

Pre-requisites:
 * Gradle
 * Java JDK 8
 * Python 3
  * numpy
  * jnius
  * fuzzywuzzy
  * sklearn
  * urllib3
 

## Configuration
This defines the collection of tables that is used to populate the knowledge base.

In [1]:
# Name of the table collection; passed to both fg.generate_training() and load_collection() below.
world = "herox"

## Common setup

Import required dependencies and download/install Stanford CoreNLP

In [2]:
import sys
import os
import re

# Make the project sources importable by adding their location to the module
# search path (only when it is not already present).
if sys.path.count('src/') == 0:
    sys.path.append('src/')

If the following step fails, run `gradlew writeClasspath` in a terminal in this folder, then try again.

In [3]:
# Load the Java classpath for Stanford CoreNLP using gradle. This will also install it if missing.
from subprocess import run, PIPE

if 'CLASSPATH' not in os.environ:
    # The classpath file is only generated when it does not exist yet; an
    # existing file is reused as-is.
    if not os.path.exists('build/classpath.txt'):
        print("Generating classpath")
        r = run(["./gradlew", "writeClasspath"], stdout=PIPE, stderr=PIPE, universal_newlines=True)
        # Echo gradle's output so a failed build is visible in the notebook.
        print(r.stdout)
        print(r.stderr)

    print("Loading classpath")
    # Read through a context manager so the file handle is closed straight away
    # instead of being left open for the lifetime of the kernel.
    with open('build/classpath.txt', 'r') as classpath_file:
        os.environ['CLASSPATH'] = classpath_file.read()
    print("Done")

Loading classpath
Done


## Fact Checking

### Training
Load the modules for fact checking, generate the features, and train our classifier on our training data.

In [4]:
from classifier.Classifier import Classifier
from classifier.LogisticRegressionClassifier import LogisticRegressionClassifier
from classifier.features.generate_features import FeatureGenerator, num, is_num
from distant_supervision.utterance_detection import f_threshold_match
from factchecking.question import Question
from tabular.filtering import load_collection



In [5]:
# Build the feature generator and produce the training data for the configured collection.
# NOTE(review): Xs / ys are presumably the feature vectors and their labels - confirm
# against FeatureGenerator.generate_training. `fg` is reused later by fact_check().
fg = FeatureGenerator()
Xs,ys = fg.generate_training(world)

Done: 0.0
Search for ”Exxon Mobil" Market Value
Query already executed
Done: 6.25
Search for "Unaccompanied children" claimed asylum
Query already executed
Done: 12.5
Search for "Hamas" Founded
Query already executed
Done: 18.75
Search for "United States" Average Temperature
Query already executed
Done: 25.0
Search for "United States" Life expectancy
Query already executed
Done: 31.25
Search for "United States" Number of abortions
Query already executed
Done: 37.5
Search for "United States" Abortion Rate per 1,000 births
Query already executed
Done: 43.75
Search for "United States Teenagers" Percentage Enrolled in education
Query already executed
Done: 50.0
Search for "United States Teenagers" Enrolled in education
Query already executed
Done: 56.25
Search for "America" bee colonies 2011
Query already executed
Done: 62.5
Search for "United States" Financial Intermediary Funds 2016
Query already executed
Done: 68.75
Search for "United States" Homocides by firearm
Query already executed


In [6]:
# Create the logistic-regression classifier and train it on the generated training data.
# `classifier` is a notebook global that fact_check() uses for prediction.
classifier = LogisticRegressionClassifier()
classifier.train(Xs,ys)

Training classifier 3
Trained


### Runtime

Load the source data

In [7]:
# Load the table collection that fact_check() searches, then list the files it contains.
tables = load_collection(world)
print(tables.files)

LOADED:
[{'answer': '0', 'utterance': 'No Utterance', 'table': 'herox/1.tsv', 'id': 'hx-0'}, {'answer': '0', 'utterance': 'No Utterance', 'table': 'herox/2.tsv', 'id': 'hx-1'}, {'answer': '0', 'utterance': 'No Utterance', 'table': 'herox/3.tsv', 'id': 'hx-2'}, {'answer': '0', 'utterance': 'No Utterance', 'table': 'herox/4.tsv', 'id': 'hx-3'}, {'answer': '0', 'utterance': 'No Utterance', 'table': 'herox/5.tsv', 'id': 'hx-4'}, {'answer': '0', 'utterance': 'No Utterance', 'table': 'herox/8.tsv', 'id': 'hx-5'}, {'answer': '0', 'utterance': 'No Utterance', 'table': 'herox/9.tsv', 'id': 'hx-6'}, {'answer': '0', 'utterance': 'No Utterance', 'table': 'herox/10.tsv', 'id': 'hx-7'}, {'answer': '0', 'utterance': 'N', 'table': 'herox/11.tsv', 'id': 'hx-8'}, {'answer': '0', 'utterance': 'N', 'table': 'herox/12.tsv', 'id': 'hx-9'}, {'answer': '0', 'utterance': 'N', 'table': 'herox/13.tsv', 'id': 'hx-10'}, {'answer': '0', 'utterance': 'N', 'table': 'herox/14.tsv', 'id': 'hx-11'}]
register table herox

Define the fact checking function

In [8]:
def fact_check(q):
    """Check a numerical claim against the loaded tables and print the outcome.

    The claim is parsed into a ``Question``; candidate entries are produced by
    ``fg.generate_test`` and scored by ``classifier.predict``. A candidate is
    reported when it passes the date filter, has a numeric value and is
    predicted as class 1. Its value is then compared with the numbers found in
    the claim (``f_threshold_match`` with a 0.05 threshold) and with the dates
    found in the claim (exact equality).

    Relies on the notebook globals ``fg``, ``tables`` and ``classifier``.

    Args:
        q: Text of the claim to check.

    Returns:
        None. Everything is printed: each reported candidate, the claim text and
        a final ``True``/``False`` verdict, or a notice when the knowledge base
        yields no candidates at all.
    """
    question = Question(text=q, type="NUM")
    question.parse()
    candidates, q_features = fg.generate_test(tables, question)
    q_match = False

    if len(candidates) > 0:
        q_predicted = classifier.predict(q_features)

        # String forms of the dates mentioned in the claim. They are the same
        # for every candidate, so they are built once rather than per candidate.
        question_dates = {str(d) for d in question.dates}

        for i, candidate in enumerate(candidates):
            fact = candidate[1]

            # Skip dated candidates that share no date with the claim. Claims
            # without dates and candidates without a 'date' key are not filtered.
            if 'date' in fact and question_dates:
                if not set(fact['date']).intersection(question_dates):
                    continue

            if not is_num(fact['value']):
                continue

            # Only candidates the classifier labels as 1 are reported.
            if q_predicted[0][i] != 1:
                continue

            print(str(candidate) + "\t\t" + "Possible Match")

            # Parsed once per candidate instead of once per number/date.
            # NOTE(review): assumes num() has no side effects - confirm.
            value = num(fact['value'])

            if value is not None:
                for number in question.numbers:
                    if f_threshold_match(number, value, 0.05):
                        print(str(candidate) + "\t\t" + "Threshold Match to 5%")
                        q_match = True

            for date in question.dates:
                if date == value:
                    print(str(candidate) + "\t\t" + "Exact Match")
                    q_match = True

        print(question.text)
        print(q_match)

    else:
        print(question.text)
        print("No supporting information can be found in the knowledge base")
    print("\n\n")

# Fact checking

In [9]:
# Example claims, checked one after another in the order listed.
claims = [
    "Around 22250 unaccompanied children claimed asylum in Germany in 2015",
    "Around 4500 unaccompanied children claimed asylum in Germany in 2014",
    "There were 4500 immigrants in Germany in 2014",
    "97% of children in America were vaccinated against measles in 2014.",
    "12.9% of the total population of the USA were daily smokers in 2014",
    "In the USA in 2010, the number of homicides by firearm was almost 10,000.",
    "In the USA in 2010, the number of homicides by firearm was almost 11,000.",
    "In 2012 there were 3,282,570 bee colonies in America. ",
]

for claim in claims:
    fact_check(claim)

('herox/2.tsv', {'relation': 'Asylum applicants considered to be unaccompanied minors', 'value': '22255', 'entity': 'Germany', 'date': ['2015']})		Possible Match
('herox/2.tsv', {'relation': 'Asylum applicants considered to be unaccompanied minors', 'value': '22255', 'entity': 'Germany', 'date': ['2015']})		Threshold Match to 5%
('herox/2.tsv', {'relation': 'Asylum applicants considered to be unaccompanied minors', 'value': '22255', 'entity': 'Germany', 'date': ['2015']})		Possible Match
('herox/2.tsv', {'relation': 'Asylum applicants considered to be unaccompanied minors', 'value': '22255', 'entity': 'Germany', 'date': ['2015']})		Threshold Match to 5%
('herox/2.tsv', {'relation': 'Asylum applicants considered to be unaccompanied minors', 'value': '22255', 'entity': 'Germany', 'date': ['2015']})		Possible Match
('herox/2.tsv', {'relation': 'Asylum applicants considered to be unaccompanied minors', 'value': '22255', 'entity': 'Germany', 'date': ['2015']})		Threshold Match to 5%
('herox