# Benfords Law Analysis
## Overview 
This notebook contains code for fitting Benford's Law to data.

In [13]:
import matplotlib.pyplot as plt
import numpy as np
from qiime2 import Artifact
from os import listdir
from scipy.stats import chisquare
import pandas as pd

def get_first_digits(data):
    """return first digits from the data to test Benford's law
    data -- a list of positive integers(zeros and negative numbers will be skipped)
    
    """
    
    data = [x for x in data if pd.notna(x) and x > 0]
    first_digits = [int(str(x)[0]) for x in data]
    return first_digits


def frequency_fdigits (first_digits):
    """return frequencies of the first digits
    first_digits -- list of first digits extracted from data
    
    """
    
    frequency_results = {}
    for first_digit in first_digits:
        count_fdigit = first_digits.count(first_digit)
        length_of_list = len(first_digits)
        frequency = count_fdigit/length_of_list
        frequency_results[first_digit] = frequency
        return frequency_results

def benfords_law():
    """ expected probabilities for each digits based on Benford's law
        digits -- from 1-9
    """
    
    return {digit: np.log10(1 + 1/digit) for digit in range(1, 10)}

def benfords_law_test(data):
    """ return chi square and p value for the fit of the data to Benford's law
        data-- list of numbers to test the Benford's law
    """
   
    # Get the first digits
    first_digits = get_first_digits(data)
    
    # Calculate observed frequencies of first digits
    observed_freqs = np.bincount(first_digits, minlength=10)[1:10] # Skip index 0
    
    # Calculate expected frequencies according to Benford's Law
    total_count = len(first_digits)
    expected_freqs = benfords_law()
    expected_freqs = [expected_freqs[n] for n in range(1,10)]
    
    # Normalize observed frequencies for comparison
    observed_freqs_normalized = observed_freqs / total_count
    
    # Perform chi-square test
    chi2_stat, p_value = chisquare(observed_freqs_normalized, expected_freqs)
    
    return chi2_stat, p_value

def benfords_law_plot(observed_freqs, expected_freqs, output_filepath):
    """ return plots of observed and expected frequencies
        observed_freqs -- a list of observed frequencies (0.0 - 1.0) of digits 1-9
        expected_freqs -- a list of expected frequencies (0.0 - 1.0) of digits 1-9
        output_filepath -- the location you want to save your graph image
    """

# Plot the observed vs expected distribution
    plt.bar(range(1, 10), observed_freq_normalized, alpha=0.6, color='blue', label='Observed')
    plt.plot(range(1, 10), [e / total_count for e in expected_freq], 'ro-', label='Expected (Benford)')
    plt.xlabel('First Digit')
    plt.ylabel('Frequency')
    plt.title('Benford\'s Law Test')
    plt.savefig(output_filepath)
    
# Load your data
print("about to load the feature table")
feature_table = Artifact.load("../../Neutral Model Analysis/input/carib_silva_merged_table.qza")
from qiime2.plugins.feature_table.methods import filter_features_conditionally
filtered_feature_table_results = filter_features_conditionally(table = feature_table, abundance = 0.01, prevalence = 1/50)
filtered_feature_table = filtered_feature_table_results.filtered_table
df = filtered_feature_table.view(pd.DataFrame)

#print("about to transpose the feature table")
#df = df.T  # Transpose if required by your analysis

# Loop through each column and apply Benford's Law test
results = {}
for column in df.columns:
    print(f"Processing column: {column}")
    column_data = df[column].dropna().tolist()
    chi2_stat, p_value = benfords_law_test(column_data)
    print("p value:", p_value)
    if p_value is not None:
        results[column] = {"chi2_stat": chi2_stat, "p_value": p_value}

# Display summary of results
summary_df = pd.DataFrame(results).T
print(summary_df)
summary_df.to_csv("../output/benfords_law_results.csv")

about to load the feature table
Processing column: 04f5517358e3f1bb2ee4c31a2d45e25f
p value: 0.9999998948912167
Processing column: 09d21bfe1332afd48aea1cf7bbcc4dee
p value: 0.9999964546942847
Processing column: 0c2c897e4ce2af1fd6246a8a593786e1
p value: 0.9999803736312717
Processing column: 0dea46ce5d22dfefc8434eb650916a76
p value: 0.9999987566714421
Processing column: 0ee6dc6715e3c01f0069f37cc08da68e
p value: 0.9999960263204508
Processing column: 10b086b0d825c9822544baf83bae8040
p value: 0.9999975608206465
Processing column: 11b0b5582c062db30ab8cec43bf54504
p value: 0.9999946918361037
Processing column: 135a4a18ee628f44016c98b06a03a044
p value: 0.9999985677063057
Processing column: 14523ac3c021df0a29d7b3c8b07f5be1
p value: 0.9999777401538159
Processing column: 150821ae14d96d99391fff76daca07d6
p value: 0.9998459973725309
Processing column: 1601d4b40fdc9a2425400dfbc18cc50e
p value: 0.9999221547443504
Processing column: 176295281d7619725d2b16ed19319bb1
p value: 0.9999937206885536
Processi