In [None]:
#This code is used to empirically verify the analytic number theory conjecture
#It relies on statistical properties of the zeros and thereby achieves high accuracy. Accuracy here: 1.0

In [None]:
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers  import Dense, BatchNormalization, Activation
from sklearn.metrics  import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.callbacks  import EarlyStopping
from tensorflow.keras.models   import Sequential

In [None]:
# Locations of the training and test files (edit these to point at your own copies).
train_file = '/content/drive/MyDrive/Dirichlet Mod Conh/zeros_train.csv'
test_file = '/content/drive/MyDrive/Dirichlet Mod Conh/zeros_test.csv'

# Number of zeros stored on each line of the data files.
nb_of_zeros = 25

# DataFrame header: the label column 'y' followed by zero_1 ... zero_<nb_of_zeros>.
cols = ['y', *(f'zero_{k}' for k in range(1, nb_of_zeros + 1))]

In [None]:
def load_data(t_path, columns=None):
    """Read a ``<label>:<zero>,<zero>,...`` text file into a DataFrame.

    Every non-blank line must contain an integer label, a colon, and a
    comma-separated list of floats. Empty entries in the list (for example
    after a trailing comma) are ignored; blank lines are skipped.

    Parameters
    ----------
    t_path : str or path-like
        File to read.
    columns : list of str, optional
        Column names for the resulting frame (label first, then one name per
        zero). Defaults to the module-level ``cols``.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line: the label followed by its zeros.

    Raises
    ------
    ValueError
        If a line has no ``:`` separator, or its label / zeros cannot be
        parsed as ``int`` / ``float``.
    """
    if columns is None:
        columns = cols

    t_data = []
    with open(t_path) as f:
        for line_no, line in enumerate(f, start=1):
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of crashing on the label/zeros unpacking.
            if not line.strip():
                continue
            label, separator, zeros = line.partition(':')
            if not separator:
                raise ValueError(
                    f"{t_path}:{line_no}: expected '<label>:<zero>,<zero>,...'"
                )
            y = int(label)
            zeros = [float(zero) for zero in zeros.split(',') if zero.strip()]
            t_data.append([y] + zeros)
    return pd.DataFrame(t_data, columns=columns)

# Parse the training and test files into DataFrames (one row per line of the file).
train = load_data(t_path=train_file)
test = load_data(t_path=test_file)

In [None]:
def construct_zero_features(df):
    """Append summary statistics of the zeros and their spacings to each row.

    Parameters
    ----------
    df : pandas.DataFrame
        The first column is the label ``y``; every remaining column holds one
        zero. The zero columns are used in the order in which they appear.

    Returns
    -------
    pandas.DataFrame
        The original zero columns, followed by the engineered columns

        * ``mean_zero`` / ``var_zero``: mean and (population) variance of the zeros,
        * ``mean_diff`` / ``var_diff``: mean and variance of consecutive spacings,
        * ``mean_second_diff``: mean of the second differences,
        * ``mean_sq_diff``: mean of the squared spacings,
        * ``mean_pairwise_diff``: mean of ``|z_i - z_j|`` over all ordered pairs
          ``(i, j)``, including ``i == j``,
        * ``mean_moving_avg``: mean of the length-3 moving average of the zeros,

        and finally the label column ``y``.
    """
    zero_columns = list(df.columns[1:])
    # Each name describes exactly the statistic stored in that column, and each
    # statistic is emitted once.
    stat_columns = [
        'mean_zero', 'var_zero', 'mean_diff', 'var_diff',
        'mean_second_diff', 'mean_sq_diff',
        'mean_pairwise_diff', 'mean_moving_avg',
    ]

    # Work on a plain float array: cheaper than DataFrame.iterrows and avoids
    # building a Series for every row.
    zeros_matrix = df.iloc[:, 1:].to_numpy(dtype=float)
    window = np.ones(3) / 3  # 1-D averaging kernel of length 3

    rows = []
    for zeros in zeros_matrix:
        spacings = np.diff(zeros)

        # Mean absolute difference over the full |z_i - z_j| matrix
        # (both orderings and the zero diagonal are included in the mean).
        mean_pairwise_diff = np.mean(np.abs(np.subtract.outer(zeros, zeros)))

        # 'valid' keeps only the positions where the window fully overlaps the data.
        mean_moving_avg = np.mean(np.convolve(zeros, window, mode='valid'))

        rows.append([
            *zeros,
            np.mean(zeros),
            np.var(zeros),
            np.mean(spacings),
            np.var(spacings),
            np.mean(np.diff(spacings)),
            np.mean(spacings ** 2),
            mean_pairwise_diff,
            mean_moving_avg,
        ])

    output = pd.DataFrame(rows, columns=zero_columns + stat_columns)
    output['y'] = df['y'].values
    return output


In [None]:
# Build the feature tables and shuffle their rows (fixed seed) so that the
# result does not depend on the order in which labels appear in the files.
df_train = (
    construct_zero_features(train)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df_test = (
    construct_zero_features(test)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# pop() removes 'y' from the frame, so what is left are the feature columns only.
y_train = df_train.pop('y').to_numpy()
X_train = df_train.to_numpy()

y_test = df_test.pop('y').to_numpy()
X_test = df_test.to_numpy()

In [None]:
# Encode the labels as consecutive integers; the mapping is learned from the
# training labels and then reused for the test labels.
label_ecode = LabelEncoder()
y_train_encode = label_ecode.fit_transform(y_train)
y_test_encode = label_ecode.transform(y_test)

# Standardise the features; the scaling statistics are fitted on the training
# split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaler = scaler.fit_transform(X_train)
X_test_scaler = scaler.transform(X_test)

In [None]:
# Random forest of 200 trees with a fixed seed; class_weight='balanced'
# weights the classes inversely to their frequency in the training labels.
random_forest_model = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
)
random_forest_model.fit(X_train_scaler, y_train_encode)


In [None]:
# Per-class probabilities for every test sample: one row per sample, one
# column per entry of random_forest_model.classes_.
random_forest_test_pred = random_forest_model.predict_proba(X_test_scaler)

# inverse_transform expects a 1-D array of encoded labels, not the 2-D
# probability matrix, so first pick the most probable class of each sample.
random_forest_test_pred_encoded = random_forest_model.classes_[
    np.argmax(random_forest_test_pred, axis=1)
]
random_forest_test_pred_labels = label_ecode.inverse_transform(random_forest_test_pred_encoded)
accuracy = accuracy_score(y_test, random_forest_test_pred_labels)

In [None]:
# Report the test accuracy, then the true and the predicted labels.
print(f"Test accuracy: {accuracy:.4f}")
print("y_test", y_test, sep="\n")
print("y_pred", random_forest_test_pred_labels, sep="\n")