In [16]:
import pandas as pd
import sympy as sp
import re

def load_file(file_path: str) -> pd.DataFrame:
    data = open(file_path, "r").readlines()
    data = [line.strip().split("=") for line in data]

    # Extract function, variable, and derivative from each line
    rows = [(line[0].split('d(')[-1].split(')/d')[0], line[0].split(')/d')[1], line[1]) for line in data]

    # Create a DataFrame
    df = pd.DataFrame(rows, columns=['Function', 'Variable', 'Derivative'])

    return df

def tokenize(expression):
    # Use regular expressions to split the expression into tokens
    tokens = re.findall(r'\d+\.?\d*|\w+|[+\-*/()^]', expression)
    return tokens

def tokenize_dataframe(df):
    # Create a set of all unique tokens in the DataFrame
    all_tokens = set()

    for col in ['Function', 'Variable', 'Derivative']:
        all_tokens.update(df[col].apply(tokenize).explode().unique())

    # Define a dictionary to map tokens to unique numerical values
    token_to_index = {token: i for i, token in enumerate(all_tokens)}
    index_to_token = {i: token for token, i in token_to_index.items()}

    # Convert tokens to numerical values
    for col in ['Function', 'Variable', 'Derivative']:
        df[f'{col}_indices'] = df[col].apply(lambda expr: [token_to_index[token] for token in tokenize(expr)])

    return df, token_to_index, index_to_token

# Example usage
filepath = "train.txt"
df = load_file(filepath)

# Tokenize expressions, variables, and derivatives in the DataFrame
df, token_to_index, index_to_token = tokenize_dataframe(df)

In [17]:
df = df.drop(['Function', 'Variable', 'Derivative'], axis = 1)

# Assuming df is your DataFrame with the original column names
df.rename(columns={'Function_indices': 'Function', 'Variable_indices': 'Variable', 'Derivative_indices': 'Derivative'}, inplace=True)

In [22]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Masking
from sklearn.model_selection import train_test_split

# Assuming num_tokens is the total number of unique tokens
num_tokens = len(index_to_token)
max_sequence_length = 30  # Maximum sequence length

# Assuming df is your DataFrame with columns 'Function', 'Variable', and 'Derivative'
function_indices = np.array(df['Function'].tolist(), dtype=object)
variable_indices = np.array(df['Variable'].tolist(), dtype=object)
derivative_indices = np.array(df['Derivative'].tolist(), dtype=object)

# Ensure sequences are of integer type
function_indices = [np.array(seq, dtype=int) for seq in function_indices]
variable_indices = [np.array(seq, dtype=int) for seq in variable_indices]
derivative_indices = [np.array(seq, dtype=int) for seq in derivative_indices]

# Pad or truncate the sequences to the specified length
function_indices = tf.keras.preprocessing.sequence.pad_sequences(function_indices, maxlen=max_sequence_length, padding='post')
variable_indices = tf.keras.preprocessing.sequence.pad_sequences(variable_indices, maxlen=max_sequence_length, padding='post')
derivative_indices = tf.keras.preprocessing.sequence.pad_sequences(derivative_indices, maxlen=max_sequence_length, padding='post')

# Concatenate the sequences
target_indices = np.concatenate([np.zeros_like(derivative_indices[:, :1]), derivative_indices[:, :-1]], axis=1)

# Split data into train and validation sets
function_indices_train, function_indices_val, variable_indices_train, variable_indices_val, target_indices_train, target_indices_val = train_test_split(
    function_indices, variable_indices, target_indices, test_size=0.2, random_state=42
)

# Define model architecture
embedding_dim = 50
latent_dim = 100

input_function = Input(shape=(max_sequence_length,))
input_variable = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(num_tokens, embedding_dim, input_length=max_sequence_length)

function_embedding = encoder_embedding(input_function)
variable_embedding = encoder_embedding(input_variable)

encoder_lstm = LSTM(latent_dim, return_state=True)
_, function_state_h, function_state_c = encoder_lstm(function_embedding)
_, variable_state_h, variable_state_c = encoder_lstm(variable_embedding)

encoder_states = [function_state_h, function_state_c, variable_state_h, variable_state_c]

input_derivative = Input(shape=(max_sequence_length,))
decoder_embedding = Embedding(num_tokens, embedding_dim, input_length=max_sequence_length)

derivative_embedding = decoder_embedding(input_derivative)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(derivative_embedding, initial_state=encoder_states[:2])

decoder_dense = Dense(num_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([input_function, input_variable, input_derivative], decoder_outputs)

# Compile the model with accuracy as a metric
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model on the training set
model.fit([function_indices_train, variable_indices_train, target_indices_train], derivative_indices_train, epochs=10, batch_size=32)

# Evaluate the model on the validation set
loss, accuracy = model.evaluate([function_indices_val, variable_indices_val, target_indices_val], derivative_indices_val)

print(f'Validation Loss: {loss}, Validation Accuracy: {accuracy}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation Loss: 0.01287948526442051, Validation Accuracy: 0.996431827545166
