## **Benford's Law Exploration by Calvin Wong**


In [None]:
#@title Please press the play button to start the program. Upload PDF only.
%pip install PyPDF2
from scipy import stats
from PyPDF2 import PdfReader
import matplotlib.pyplot as plt
from collections import defaultdict
import re
from scipy.stats import chisquare
import numpy as np
from math import log10
from google.colab import files

def extract_numbers_from_pdf(file_path):
    """Return the leading significant digit of each numeric token in a PDF.

    The text of every page is extracted with ``PdfReader`` and split on
    whitespace.  For each token, the first numeric literal it contains is
    located and its first non-zero digit is recorded, so ``0.05`` and ``.05``
    both contribute 5 and ``007`` contributes 7.  Tokens whose number
    consists only of zeros (``0``, ``0.00``) contribute nothing.

    Args:
        file_path: Whatever ``PdfReader`` accepts as its source argument.

    Returns:
        A list of ints, each in the range 1-9, one per numeric token, in the
        order the tokens appear in the extracted text.
    """
    pdf_reader = PdfReader(file_path)

    # Pages are joined with a newline so a number that ends one page cannot
    # fuse with a number that starts the next.  ``or ""`` guards against a
    # page for which no text is returned.
    text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

    # A numeric literal: digit groups separated by '.' or ',' (e.g. 1,234.56),
    # or a bare fractional part such as ".05".  re.ASCII keeps the match to
    # 0-9 only; str.isdigit() also accepts characters like superscript two,
    # which int() cannot convert.
    number_pattern = re.compile(r"\d+(?:[.,]\d+)*|\.\d+", re.ASCII)

    numbers = []
    for word in text.split():
        match = number_pattern.search(word)
        if match is None:
            continue
        # Leading zeros (integer padding or "0.0..." prefixes) are not
        # significant, so take the first digit from 1-9 instead.
        leading_digit = next(
            (char for char in match.group() if char in "123456789"), None
        )
        if leading_digit is not None:
            numbers.append(int(leading_digit))
    return numbers

def benford_expected():
    """Return Benford's-law proportions for leading digits 1 through 9.

    Returns:
        A list of nine floats; the entry at index ``d - 1`` is
        ``log10(1 + 1/d)`` for digit ``d``.
    """
    proportions = []
    for d in range(1, 10):
        proportions.append(log10(1 + 1/d))
    return proportions

# User uploads file
uploaded = files.upload()
# Take the first uploaded filename; None when nothing was uploaded, instead
# of failing with an IndexError on an empty upload result.
file_path = next(iter(uploaded), None)

numbers = extract_numbers_from_pdf(file_path) if file_path is not None else []
# Zero is never a leading significant digit, so discard any that slipped in.
numbers = [num for num in numbers if num != 0]

if file_path is None:
    print("No file was uploaded. Please run the cell again and choose a PDF.")
elif not numbers:
    print("No numbers were found in the uploaded PDF, so there is nothing to analyse.")
else:
    total = len(numbers)
    # Count every digit 1-9 explicitly so digits that never occur still
    # appear (with a zero count) in both the test and the chart.
    digit_count = {digit: numbers.count(digit) for digit in range(1, 10)}

    observed_counts = [digit_count[i] for i in range(1, 10)]
    expected_proportions = benford_expected()
    expected_counts = [e * total for e in expected_proportions]
    # Goodness-of-fit of the observed leading digits against Benford's law.
    chi_square_stat, p_val = chisquare(observed_counts, f_exp=expected_counts)

    print(f"Chi-Square Statistic: {chi_square_stat:.4f}")
    print(f"P-Value: {p_val:.4f}")
    print("Lower Chi-Square statistics means the document is more likely to be authentic.")

    # Bar chart of the observed share (in percent) of each leading digit.
    plt.figure(figsize=(6, 6))
    sorted_keys = sorted(digit_count.keys())
    bars = plt.bar(
        sorted_keys,
        [digit_count[i] / total * 100 for i in sorted_keys],
        tick_label=sorted_keys,
    )

    # Label each bar with its percentage, centred just above the bar top.
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval, f'{yval:.1f}%',
                 ha='center', va='bottom')

    plt.xlabel('Digits')
    plt.ylabel('Frequency (%)')
    plt.title('Frequency of Leading Digits')
    plt.show()
