In [19]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_squared_error

from lib import executor
from lib import models
from lib import utils
from lib.utils import nearest_integer_acc

In [None]:
# One-off preprocessing, kept commented out: process the Kodaira symbols if
# that has not been done already, drop the label column, and write the result
# to a new parquet file.
# NOTE(review): the last line uses `Path`, which the visible import cell does
# not import -- add `from pathlib import Path` before uncommenting.
# path = '../data_files/sha/ecq_sha_B_100_conds_1_500000.parquet'
# df = utils.load_data(path)
# df = utils.process_kodaira_symbol(df)
# df.drop('lmfdb_label', axis=1, inplace=True)
# df.to_parquet(Path("..") / "data_files" / "sha"/ "ecq_sha_B_100_conds_1_500000_kodaira_processed.parquet")

In [20]:
# Load your data here.
# NOTE(review): the path is a plain string with forward slashes; any
# cross-platform handling would have to happen inside utils.load_data -- confirm.
# NOTE(review): an earlier comment stated that the Kodaira symbols in this file
# are already preprocessed; the file name ends in `_reg`, so verify that.
path = '../data_files/sha/ecq_sha_B_100_conds_1_500000_reg.parquet'
df = utils.load_data(path)

# Drop columns that are not needed. Reassigning (instead of inplace=True)
# keeps the lineage of `df` explicit within the cell.
df = df.drop(columns=['conductor', 'adelic_level', 'lmfdb_label'])

# Square root of the order of sha. Round before converting to int: a bare
# int() truncates, so a floating-point root that lands just below the true
# integer (e.g. 2.9999999) would silently become one too small.
df['sqrt_sha'] = df['sha'].apply(lambda x: int(round(x ** 0.5)))
df = df.drop(columns='sha')

# Keep only the curves of rank greater than 0.
df = df[df['rank'] > 0]
print('Now, leaving out the rank 0 curves.')

# Count the curves with sqrt_sha > 1 and report their share of the dataset.
n_sha_not_1 = int((df['sqrt_sha'] > 1).sum())
print(f'Now, there are in total {n_sha_not_1} ({n_sha_not_1/len(df)*100 : .2f}%) curves with sqrt_sha > 1. Values counts of the square root of order of sha: ')
print(df.sqrt_sha.value_counts())

Loaded the dataset with 120 features and 3064705 curves..
Now, leaving out the rank 0 curves.
Now, there are in total 20605 ( 1.09%) curves with sqrt_sha > 1. Values counts of the square root of order of sha: 
1    1873224
2      18710
3       1462
4        323
5         96
7          9
8          3
6          2
Name: sqrt_sha, dtype: int64


In [21]:
# Build a balanced dataset with equal numbers of sqrt_sha == 1 and
# sqrt_sha == 2 curves: down-sample the sqrt_sha == 1 class to the size of the
# sqrt_sha == 2 class, then stack the two groups.
BALANCE_SEED = 42  # fixed seed so the sampled subset is the same on every run

len_2 = df[df['sqrt_sha'] == 2].shape[0]
df_balanced = df[df['sqrt_sha'] == 1].sample(len_2, random_state=BALANCE_SEED)
df_balanced = pd.concat([df_balanced, df[df['sqrt_sha'] == 2]])

# Display the class counts as a sanity check on the balance.
df_balanced.sqrt_sha.value_counts()

1    18710
2    18710
Name: sqrt_sha, dtype: int64

In [8]:
# Select your features: keep exactly one of the options below active.

# model BSD
# df_balanced=df_balanced[['rank', 'torsion', 'regulator', 'real_period','special_value', 'tamagawa_product','sqrt_sha']]

# model BSD (no regulator) + : every remaining column except the regulator.
# Reassign instead of dropping in place, and ignore a missing column, so that
# re-running this cell is a no-op rather than a KeyError.
df_balanced = df_balanced.drop(columns='regulator', errors='ignore')

# model BSD (no regulator)
# df_balanced=df_balanced[['rank', 'torsion','real_period','special_value', 'tamagawa_product','sqrt_sha']]

# model BSD (no regulator + rank)
# df_balanced=df_balanced[['torsion','real_period','special_value', 'tamagawa_product','sqrt_sha']]

# model BSD (no regulator + rank + real_period)
# df_balanced=df_balanced[['torsion','special_value', 'tamagawa_product','sqrt_sha']]

In [9]:
# Architecture: widths of the hidden layers, in order.
hidden_units = [128, 64, 32]

# Input/output sizes are derived from the dataframe, with 'sqrt_sha' as the
# regression target.
input_dim, output_dim = utils.get_input_output_dim(
    df_balanced,
    'sqrt_sha',
    if_regression=True,
)

# Device selection is delegated to the project helper.
device = utils.get_device()

# Build the network (batch norm on, dropout off) and move it to the device.
model = models.VanillaNN(
    input_dim,
    hidden_units,
    output_dim,
    if_dropout=False,
    dropout_rate=0.5,
    if_batchnorm=True,
).to(device)

# Show the model summary.
utils.model_summary(model)

The input dimension is 116 and the output dimension is 1.
Device: cpu.
The model has 25,793 trainable parameters..
VanillaNN(
  (layers): ModuleList(
    (0): Linear(in_features=116, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
)


In [10]:
# Training configuration.
num_epochs = 50
lr = 0.001

# Objective (mean squared error) and the metric passed to train/test as the
# evaluator.
loss_func = nn.MSELoss()
evaluator = nearest_integer_acc

# Adam over all model parameters, using the learning rate chosen above.
optimizer = optim.Adam(model.parameters(), lr=lr)

In [11]:
# Split the balanced frame into a training loader plus validation and test
# datasets, with 'sqrt_sha' as the target.
train_dataloader, val_dataset, test_dataset = utils.prepare_data(
    df_balanced,
    'sqrt_sha',
    device,
    if_regression=True,
)

# Train; the call returns the model together with the per-epoch evaluation
# and loss histories for the training and validation sets.
(
    model,
    train_eval_hist,
    val_eval_hist,
    train_loss_hist,
    val_loss_hist,
) = executor.train(
    model,
    train_dataloader,
    val_dataset,
    loss_func,
    evaluator,
    optimizer,
    num_epochs,
    if_regression=True,
    verbose=True,
)

# Plot the training vs. validation evaluation history.
utils.plot_train_eval_hist(train_eval_hist, val_eval_hist)

Epoch 1/50. Training MSELoss : 0.0142, Validation MSELoss : 0.0145. Training nearest_integer_acc: 0.9888, Validation nearest_integer_acc: 0.9890
Epoch 2/50. Training MSELoss : 0.0149, Validation MSELoss : 0.0153. Training nearest_integer_acc: 0.9889, Validation nearest_integer_acc: 0.9891


KeyboardInterrupt: 

In [15]:
# Plot the training vs. validation evaluation history (requires the histories
# returned by executor.train to be present in the kernel).
utils.plot_train_eval_hist(
    train_eval_hist,
    val_eval_hist,
)

NameError: name 'train_eval_hist' is not defined

In [14]:
# Plot the training vs. validation loss history (requires the histories
# returned by executor.train to be present in the kernel).
utils.plot_train_loss_hist(
    train_loss_hist,
    val_loss_hist,
)

NameError: name 'train_loss_hist' is not defined

In [18]:
X_test, y_test = test_dataset.tensors

# Run the forward pass once, without autograd tracking, and reuse the result;
# evaluating model(X_test) twice doubles the work and builds unused graphs.
with torch.no_grad():
    test_preds = model(X_test)

# Count how many predictions are bigger than 1.5. Summing the boolean mask
# gives the same number as taking the length of the masked tensor.
n_above_threshold = int((test_preds > 1.5).sum())
print(f'Number of model(X_test) bigger than 1.5: {n_above_threshold}')

Number of model(X_test) bigger than 1.5: 536


In [58]:
# Evaluate the model on the test set with three metrics, in this order:
# the configured evaluator, mean absolute error, and mean squared error.
acc, mae, mse = (
    executor.test(model, test_dataset, metric, if_regression=True)
    for metric in (evaluator, mean_absolute_error, mean_squared_error)
)

# Report the three scores.
print(f"Test accuracy: {acc:0.4f}")
print(f"Test Mean Absolute Error: {mae:0.4f}. Test Mean Squared Error: {mse:0.4f}")

Test accuracy: 0.9838
Test Mean Absolute Error: 0.0555. Test Mean Squared Error: 0.0141


model BSD: 
Test accuracy: 0.9838
Test Mean Absolute Error: 0.0555. Test Mean Squared Error: 0.0141

model BSD (no regulator) +: 
Test accuracy: 0.8373
Test Mean Absolute Error: 0.2570. Test Mean Squared Error: 0.1181

model BSD (no regulator): 
Test accuracy: 0.8370
Test Mean Absolute Error: 0.2680. Test Mean Squared Error: 0.1201

model BSD (no regulator + rank):
Test accuracy: 0.8280
Test Mean Absolute Error: 0.2713. Test Mean Squared Error: 0.1270

model BSD (no regulator + rank + real_period):
Test accuracy: 0.7586
Test Mean Absolute Error: 0.3639. Test Mean Squared Error: 0.1715