This notebook trains histogram-based gradient boosting models (HGBM) on elliptic curves of positive rank whose Tate-Shafarevich group (Sha) has size 1 or 4, removing one BSD feature at a time. The experiments are run on both the original and the log-transformed data.

In [1]:
from lib import utils, models, executor
import torch.nn as nn
import torch.optim as optim
from pathlib import Path
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fix the random seed for reproducibility. This single value is passed as
# `random_state` to the class-balancing sample and to the train/test splits
# below, so that re-runs use the same random draws.
seed = 42

# 1. Create a balanced dataset of elliptic curves with positive rank and size of the Tate-Shafarevich group equal to 1 or 4, containing all BSD features

In [2]:
# Load your data here. Building the location with pathlib ensures this works
# on Windows as well as on Unix.
path = Path("..", "data_files", "sha", "ecq_sha_B_100_conds_1_500000_reg.parquet")
df = utils.load_data(path)

# Keep only the curves of positive rank.
has_positive_rank = df['rank'] > 0
df = df.loc[has_positive_rank]

Loaded the dataset with 120 features and 3064705 curves..


In [3]:
# Balance the two classes: keep every curve with sha == 4 and draw an equally
# sized random sample (fixed seed) from the curves with sha == 1.
curves_sha_4 = df[df['sha'] == 4]
len_4 = len(curves_sha_4)
curves_sha_1 = df[df['sha'] == 1].sample(n=len_4, random_state=seed)

df_balanced = pd.concat([curves_sha_1, curves_sha_4])

# Sanity check: both classes should now have the same number of curves.
df_balanced['sha'].value_counts()

sha
1    18710
4    18710
Name: count, dtype: int64

In [4]:
# Columns holding all the BSD features; one of them will be removed at a time
# in the experiments below. The target 'sha' is kept as the last entry.
bsd_features = [
    'special_value',
    'torsion',
    'real_period',
    'regulator',
    'tamagawa_product',
    'sha',
]

# Work on an independent copy restricted to the BSD columns.
df_balanced_bsd = df_balanced.loc[:, bsd_features].copy()

In [5]:
# Preview the first rows of the balanced BSD table (head() shows 5 rows by default).
df_balanced_bsd.head()

Unnamed: 0,special_value,torsion,real_period,regulator,tamagawa_product,sha
393967,7.0873,4,0.87493,8.10043,16,1
811492,5.28675,2,0.31006,1.70505,40,1
872987,9.43579,1,1.05987,2.22569,4,1
761927,2.52192,2,0.54553,0.57786,32,1
2647019,5.41466,1,0.3187,4.2474,4,1


# 2. Delete one feature at a time on original data
The best accuracy is about 92% when removing the special value.

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor

In [7]:
# Feature-ablation experiment on the original (untransformed) data: for each
# BSD feature, train an HGBM classifier on the remaining features and record
# the test accuracy.
#
# The per-feature results are collected as plain records and converted into a
# DataFrame once at the end, instead of growing a DataFrame with pd.concat
# inside the loop.
ablation_records = []

# The last entry of bsd_features is the target 'sha', so it is never removed.
for feature in bsd_features[:-1]:
    print(f'Running model without {feature}..')
    df_sub = df_balanced_bsd.drop(columns=[feature])
    X = df_sub.drop(columns=['sha'])
    y = df_sub['sha']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    # Use the notebook-wide seed rather than a hard-coded value, so that
    # changing `seed` affects the split and the model consistently.
    model = HistGradientBoostingClassifier(random_state=seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    ablation_records.append({'Feature Deleted': feature, 'Accuracy': accuracy})

# Explicit columns keep the table well-formed even if no feature was ablated.
results_df_hist_grad = pd.DataFrame(ablation_records, columns=['Feature Deleted', 'Accuracy'])

print(results_df_hist_grad)

Running model without special_value..


Running model without torsion..


Running model without real_period..


Running model without regulator..


Running model without tamagawa_product..


    Feature Deleted  Accuracy
0     special_value  0.920230
1           torsion  0.811865
2       real_period  0.781935
3         regulator  0.831106
4  tamagawa_product  0.766569


# 3. Delete one feature at a time on log-transformed data
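Note that the accuracy scores are again identical to those obtained on the original data. This is expected: tree-based models such as HGBM split on feature thresholds, so a monotone transformation like the logarithm preserves the ordering of the values and leaves the learned splits essentially unchanged.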

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor

In [9]:
# Feature-ablation experiment on log-transformed data: for each BSD feature,
# train an HGBM classifier on the logarithm of the remaining features and
# record the test accuracy.
#
# The per-feature results are collected as plain records and converted into a
# DataFrame once at the end, instead of growing a DataFrame with pd.concat
# inside the loop.
ablation_records_log = []

# The last entry of bsd_features is the target 'sha', so it is never removed.
for feature in bsd_features[:-1]:
    print(f'Running model without {feature}..')
    df_sub_log = df_balanced_bsd.drop(columns=[feature])
    # Only the features are log-transformed; the target 'sha' is left as is.
    # NOTE(review): np.log assumes every feature value is strictly positive -
    # TODO confirm for the full dataset.
    X = df_sub_log.drop(columns=['sha']).apply(np.log)
    y = df_sub_log['sha']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    model = HistGradientBoostingClassifier(random_state=seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    ablation_records_log.append({'Feature Deleted': feature, 'Accuracy': accuracy})

# Explicit columns keep the table well-formed even if no feature was ablated.
results_df_hist_grad_log = pd.DataFrame(ablation_records_log, columns=['Feature Deleted', 'Accuracy'])

print(results_df_hist_grad_log)

Running model without special_value..


Running model without torsion..


Running model without real_period..


Running model without regulator..


Running model without tamagawa_product..


    Feature Deleted  Accuracy
0     special_value  0.920230
1           torsion  0.811865
2       real_period  0.781935
3         regulator  0.831106
4  tamagawa_product  0.766569
