In [3]:
from lib import utils, models, executor
import torch.nn as nn
import torch.optim as optim
from pathlib import Path
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

## Create the balanced dataset

In [4]:
# Load your data here. Building the path from separate components (rather than
# a hard-coded separator) ensures this works on Windows as well as Unix.
path = Path("..", "data_files", "sha", "ecq_sha_B_100_conds_1_500000_reg.parquet")
df = utils.load_data(path)

Loaded the dataset with 120 features and 3064705 curves..


In [5]:
# First experiment (sanity check): build a balanced dataset of curves with
# sha == 4 and sha == 9, to be used with the BSD features only (i.e. no ap vals).

# Every curve with sha == 9 is kept; this class sets the per-class size.
sha_9_rows = df[df['sha'] == 9]
len_9 = len(sha_9_rows)

# Down-sample the sha == 4 class to the same size. The seed is fixed so that the
# balanced dataset -- and therefore every metric computed from it further down --
# is identical on each Restart & Run All.
sha_4_rows = df[df['sha'] == 4].sample(n=len_9, random_state=42)

df_balanced = pd.concat([sha_4_rows, sha_9_rows])

# Display the class counts to confirm the two classes are the same size.
df_balanced['sha'].value_counts()

sha
4    50428
9    50428
Name: count, dtype: int64

In [6]:
# The five BSD feature columns, followed by the 'sha' column that the models
# below use as their target.
bsd_features = [
    'special_value',
    'torsion',
    'real_period',
    'regulator',
    'tamagawa_product',
    'sha',
]

# Independent copy restricted to those columns, so later transformations do not
# touch df_balanced.
df_balanced_bsd = df_balanced.loc[:, bsd_features].copy()

In [7]:
# Preview the first five rows (head() shows 5 rows by default).
df_balanced_bsd.head()

Unnamed: 0,special_value,torsion,real_period,regulator,tamagawa_product,sha
1648936,4.77592,2,0.14925,1.0,32,4
262328,6.08918,1,0.16914,1.0,9,4
1667600,2.41156,2,0.30145,1.0,8,4
2134251,3.07913,2,0.38489,1.0,8,4
1054165,1.35681,2,0.1696,1.0,8,4


### The following cell runs logistic regression on the data as-is, and yields about 64% accuracy.

In [9]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Inputs are the five BSD feature columns; the label is the 'sha' column.
feature_columns = ['special_value', 'torsion', 'real_period', 'regulator', 'tamagawa_product']
X = df_balanced_bsd[feature_columns]
y = df_balanced_bsd['sha']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out predictions and report the accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6395994447749356


### Repeat the logistic regression above, but this time on the logarithm of the features

In [8]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Take the element-wise natural log of every column, then put the original
# (untransformed) 'sha' values back so the target stays a class label.
df_log_transformed = np.log(df_balanced_bsd).assign(sha=df_balanced_bsd['sha'])

# Inputs are the logged BSD feature columns; the label is the raw 'sha' column.
feature_columns = ['special_value', 'torsion', 'real_period', 'regulator', 'tamagawa_product']
X = df_log_transformed[feature_columns]
y = df_log_transformed['sha']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out predictions and report the accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


### Fit a linear regression model on the log-transformed data and print the relationship it finds between the BSD features. It recovers the BSD formula. As written in the paper, this is not evidence for BSD.

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Log transform the whole dataframe. Unlike the classification cells, the
# target 'sha' is deliberately left log-transformed here: the regression is
# fitted entirely in log space.
df_log_transformed = df_balanced_bsd.apply(np.log)

# Splitting features and target (both are logarithms of the original columns)
feature_columns = ['special_value', 'torsion', 'real_period', 'regulator', 'tamagawa_product']
X = df_log_transformed[feature_columns]
y = df_log_transformed['sha']

# 80/20 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate mean squared error (measured on log(sha), not on sha itself)
mse = mean_squared_error(y_test, y_pred)

# Display the mean squared error
print("Mean Squared Error:", mse)

# Display the linear regression equation coefficients
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

# Display the equation. Both sides of the fit are logarithms, so every variable
# is labelled log(...); printing the bare column names would state a linear
# relation between the raw quantities, which is not what was fitted. The sign is
# rendered separately from the magnitude so negative coefficients read
# "- 1.0000 * ..." instead of "+ -1.0000 * ...".
terms = " ".join(
    "{} {:.4f} * log({})".format("-" if coef < 0 else "+", abs(coef), feature)
    for coef, feature in zip(model.coef_, X.columns)
)
equation = "log(sha) = {:.4f} {}".format(model.intercept_, terms)
print("Linear Regression Equation:", equation)

Mean Squared Error: 9.191485945949486e-09
Intercept: 5.026547875264242e-06
Coefficients: [ 0.99999986  2.00000005 -0.99999814 -0.99999949 -1.00000108]
Linear Regression Equation: y = 0.0000 + 1.0000 * special_value + 2.0000 * torsion + -1.0000 * real_period + -1.0000 * regulator + -1.0000 * tamagawa_product


### Run a histogram-based gradient boosting machine on the data

In [15]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Inputs are the five BSD feature columns; the label is the 'sha' column.
feature_columns = ['special_value', 'torsion', 'real_period', 'regulator', 'tamagawa_product']
X = df_balanced_bsd[feature_columns]
y = df_balanced_bsd['sha']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Seeded histogram-based gradient boosting classifier; fit() returns the fitted
# estimator, so construction and training are chained.
model = HistGradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows, then report overall accuracy and per-class metrics.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9828970850684117
              precision    recall  f1-score   support

           4       0.98      0.99      0.98     10173
           9       0.99      0.98      0.98      9999

    accuracy                           0.98     20172
   macro avg       0.98      0.98      0.98     20172
weighted avg       0.98      0.98      0.98     20172



### As mentioned in the paper, taking the log of the data makes no difference to the gradient boosting model: the accuracy below is identical to the run on the raw data

In [16]:
# Take the element-wise natural log of every column, then put the original
# (untransformed) 'sha' values back so the target stays a class label.
df_log_transformed = np.log(df_balanced_bsd).assign(sha=df_balanced_bsd['sha'])

# Inputs are the logged BSD feature columns; the label is the raw 'sha' column.
feature_columns = ['special_value', 'torsion', 'real_period', 'regulator', 'tamagawa_product']
X = df_log_transformed[feature_columns]
y = df_log_transformed['sha']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Seeded histogram-based gradient boosting classifier; fit() returns the fitted
# estimator, so construction and training are chained.
model = HistGradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows, then report overall accuracy and per-class metrics.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9828970850684117
              precision    recall  f1-score   support

           4       0.98      0.99      0.98     10173
           9       0.99      0.98      0.98      9999

    accuracy                           0.98     20172
   macro avg       0.98      0.98      0.98     20172
weighted avg       0.98      0.98      0.98     20172

