# Goal: train a neural network that achieves a good score when predicting the size of Sha (the Tate–Shafarevich group)
$$\text{Recall} = \frac{\text{True Positives (TP)}}{\text{Actual Positives (TP + FN)}}$$

In [1]:
from pathlib import Path

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
# import recall function from sklearn
from sklearn.metrics import recall_score, matthews_corrcoef
from torch.utils.data import TensorDataset

import executor
import models
import utils

In [2]:
# Locate the parquet file via pathlib so the path separators are correct on
# both Windows and Unix, then read it with the project loader.
path = Path("..", "data_files", "ecq_sha_B_50_conds_1_8191.parquet")
df = utils.load_data(path)

Loaded the big dataset with 60 a_p's and 52710 curves..


In [3]:
def normalise_kodaira_symbol(ks_list, limit=5):
    """Clamp every Kodaira-symbol code in ``ks_list`` to ``[-limit, limit]``.

    Codes greater than or equal to ``limit`` collapse to ``limit``, codes less
    than or equal to ``-limit`` collapse to ``-limit``, and everything in
    between is passed through unchanged.

    Args:
        ks_list: Iterable of numeric symbol codes (list, tuple, array, ...).
        limit: Non-negative clamp magnitude. Defaults to 5, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        A new list with the clamped codes, in the same order as the input.
        An empty input yields an empty list.
    """
    return [
        limit if ks >= limit else -limit if ks <= -limit else ks
        for ks in ks_list
    ]

# For every row: clamp the symbol codes, then remove duplicates by passing the
# clamped list through a set and back to a list.
df['kodaira_symbols'] = df['kodaira_symbols'].apply(
    lambda symbols: list(set(normalise_kodaira_symbol(symbols)))
)

In [5]:
# Boolean mask: True for every row whose symbol list contains the value 1
contains_1 = df['kodaira_symbols'].apply(lambda symbols: 1 in symbols)

# Keep only the rows selected by the mask
df_1 = df.loc[contains_1]

In [6]:
# Preview only the first rows instead of dumping the whole filtered frame
# into the notebook output.
df_1.head()

Unnamed: 0,lmfdb_label,2,3,5,7,11,13,17,19,23,...,rank,torsion,adelic_level,adelic_index,adelic_genus,sha,kodaira_symbols,real_period,special_value,tamagawa_product


In [4]:
# Step 1: Expand each list of symbols into one column per list position
# (rows with shorter lists are padded with NaN)
df_split = df['kodaira_symbols'].apply(pd.Series)

# Step 2: Stack the DataFrame to get a Series with a MultiIndex
# (original row label, position inside the list)
df_split = df_split.stack()

# Step 3: Perform one-hot encoding of every individual symbol
df_dummies = pd.get_dummies(df_split, prefix='kodaira')

# Step 4: Sum the indicator rows back to one row per original row.
# `DataFrame.sum(level=0)` was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the first index level is the supported equivalent.
df_dummies = df_dummies.groupby(level=0).sum()

# Step 5: Join the original DataFrame with the one-hot encoded DataFrame
df = df.join(df_dummies)

  df_dummies = df_dummies.sum(level=0)


In [5]:
# The raw symbol lists have been replaced by the one-hot `kodaira_*` columns,
# and the label column is removed alongside them
# (presumably because it is an identifier rather than a feature — confirm).
df.drop(columns=['kodaira_symbols', 'lmfdb_label'], inplace=True)

In [6]:
# Class balance of the target column: number of rows for each distinct 'sha' value
df['sha'].value_counts()

1     50676
4      1407
9       409
16      133
25       56
49       15
64        6
36        5
81        3
Name: sha, dtype: int64

In [7]:
# choose model parameters
# one entry per hidden layer; [20] gives a single hidden layer with 20 units
hidden_units = [20]

# default model parameters
# dimensions are derived from df with 'sha' as the target column
# (presumably feature-column count and number of distinct 'sha' classes — confirm in utils)
input_dim, output_dim = utils.get_input_output_dim(df, 'sha')

# check if we have cuda available
device = utils.get_device()

# create model
# the network is moved to `device` so it lives where the data tensors will be placed
model = models.VanillaNN(input_dim, hidden_units, output_dim).to(device)

# print model summary
utils.model_summary(model)

The input dimension is 68 and the output dimension is 9.
Device: cpu.
The model has 1,569 trainable parameters..
VanillaNN(
  (layers): ModuleList(
    (0): Linear(in_features=68, out_features=20, bias=True)
    (1): Linear(in_features=20, out_features=9, bias=True)
  )
)


In [9]:
# choose training parameters
loss_func = nn.CrossEntropyLoss()
num_epochs = 10
lr = 0.001
# Evaluation metric: Matthews correlation coefficient. The function is already
# imported from sklearn.metrics in the import cell, so it is used directly
# rather than through an extra in-cell `import sklearn`.
evaluator = matthews_corrcoef
optimizer = optim.Adam(model.parameters(), lr=lr)

In [10]:
# Split df into a training loader plus validation and test datasets
train_dataloader, val_dataset, test_dataset = utils.prepare_data(df, device)

# Train the model; the call returns the model together with the per-epoch
# evaluation histories for the training and validation data
model, train_eval_hist, val_eval_hist = executor.train(
    model,
    train_dataloader,
    val_dataset,
    loss_func,
    evaluator,
    optimizer,
    num_epochs,
    verbose=False,
)

# Plot both evaluation histories
utils.plot_train_eval_hist(train_eval_hist, val_eval_hist)

In [10]:
# test the model
# Evaluates the trained model on the held-out test dataset with `evaluator`;
# with verbose=True the score is printed as well as returned in test_res.
# NOTE(review): the saved output below is labelled "recall_score" although
# `evaluator` is configured as matthews_corrcoef — looks like stale output;
# re-run to confirm.
test_res = executor.test(model, test_dataset, evaluator, verbose=True)

Test recall_score: 0.7733068641673666


In [14]:
# load your data here. The following ensure this will work on Windows as well as Unix
path = Path("..") / "data_files" / "CustomDataset" / "custom_dataset.parquet"
# the 'conductor' column is removed right after loading
df_custom_full = utils.load_data(path).drop(columns=['conductor'])

# convert the rank to binary classification
# `threshold` was never defined anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel.
# NOTE(review): 4 is recovered from the saved output of this cell
# ("the rank is greater than 4") — confirm this is the intended cut-off.
threshold = 4
df_custom_full = utils.convert_rank_to_binary(df_custom_full, threshold)

# features are every column except 'rank'; 'rank' is the label
X = df_custom_full.drop(columns=['rank']).values
y = df_custom_full['rank'].values

# move features (float32) and labels (int64) to the same device as the model
X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
y_tensor = torch.tensor(y, dtype=torch.long).to(device)

custom_full = TensorDataset(X_tensor, y_tensor)

Loaded the big dataset with 1000 a_p's and 2076146 curves..
Converted the rank column to binary. The value of 1 means the rank is greater than 4, otherwise 0. Rank counts:
     count
0  1360505
1   715641


In [23]:
# Evaluate the model on the full custom dataset with the same evaluator.
# NOTE(review): `model` above was built with 68 input features and trained on
# the 'sha' target, while this dataset reports 1000 a_p's and carries a binary
# 'rank' label — the shapes/targets do not appear to match; confirm which
# model this cell is meant to test.
test_res = executor.test(model, custom_full, evaluator, verbose=True)

Test recall_score: 0.7813466644918304
