In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Load the dataset; on_bad_lines='skip' drops malformed rows instead of failing on them
try:
    data = pd.read_csv('errors.csv', on_bad_lines='skip')
except pd.errors.ParserError as e:
    print(f"ParserError: {e}")
    # `data` is never bound when parsing fails, so carrying on would only
    # surface later as a misleading NameError. Stop here with the real cause.
    raise
    
# Check the shape of the dataset (rows, columns)
print(f"Shape of the dataset: {data.shape}")

# Check the column names
print(f"Column names: {data.columns}")

# Get a summary of the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
data.info()

# View the first few rows of the dataset
print(data.head())

# Check for missing values (null count per column)
print(data.isnull().sum())

# Drop rows where either text column is missing. Rebinding `data` (rather than
# inplace=True) keeps the step a plain expression.
data = data.dropna(subset=['Error Words', 'Words'])

# Sample up to 500,000 data points. DataFrame.sample raises ValueError when n
# exceeds the number of available rows (sampling without replacement), so cap
# n at the size of the cleaned dataset.
sample_size = min(500000, len(data))
data = data.sample(n=sample_size, random_state=42)

# Summary statistics of the sampled dataset
summary_stats = data.describe()
print(summary_stats)

# Count of distinct values per column
unique_counts = data.nunique()
print(unique_counts)

# Both text columns as strings, converted once and reused below
error_texts = data['Error Words'].astype(str)
target_texts = data['Words'].astype(str)

# Character-level tokenizer fitted on the inputs and the targets together,
# so both sides share a single vocabulary
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts([*error_texts.tolist(), *target_texts.tolist()])

# Encode every word as a list of character indices
X_seq = tokenizer.texts_to_sequences(error_texts)
y_seq = tokenizer.texts_to_sequences(target_texts)

# The common sequence length is the longest sequence on either side
longest_input = max(map(len, X_seq))
longest_target = max(map(len, y_seq))
max_seq_length = max(longest_input, longest_target)

# Pad inputs and targets to that length; padding='post' appends the padding
# after the real tokens
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
X_seq_padded, y_seq_padded = (
    pad_sequences(seqs, maxlen=max_seq_length, padding='post')
    for seqs in (X_seq, y_seq)
)

# Hold out 10% of the pairs for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X_seq_padded,
    y_seq_padded,
    test_size=0.1,
    random_state=42,
)


In [None]:
# Define the enhanced LSTM model

# One class / embedding row per tokenizer index, plus one for index 0, which
# the padding above uses.
vocab_size = len(tokenizer.word_index) + 1

model = tf.keras.Sequential([
    # An explicit Input layer replaces Embedding(input_length=...), which is
    # deprecated and ignored by newer Keras releases. Declaring the input
    # shape here keeps the model built, so model.summary() below can report
    # output shapes and parameter counts before training.
    tf.keras.Input(shape=(max_seq_length,), dtype='int32'),
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=256),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512, return_sequences=True)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512, return_sequences=True)),
    tf.keras.layers.LayerNormalization(),
    # Per-timestep softmax over the character vocabulary
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(vocab_size, activation='softmax'))
])

# Sparse loss: targets are integer character indices, not one-hot vectors
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()



In [None]:
# Prepare target data for training. The targets stay as integer token indices
# (they are NOT one-hot encoded); only a trailing axis of size 1 is appended,
# turning each 2-D label array into a 3-D one.
y_train_reshaped, y_test_reshaped = (
    labels[..., tf.newaxis] for labels in (y_train, y_test)
)

# Create data pipeline for efficient training
def data_generator(X, y, batch_size, shuffle=True):
    """Build a batched, prefetching tf.data pipeline over aligned (X, y) pairs.

    Args:
        X: Model inputs; sliced along the first axis into individual examples.
        y: Targets aligned with ``X`` along the first axis.
        batch_size: Number of examples per emitted batch.
        shuffle: When True (the default, matching the previous behaviour),
            examples are shuffled through a 10,000-element buffer before
            batching. Pass False for validation/test data so the batches keep
            the order of ``X``/``y``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(X_batch, y_batch)`` tuples.
    """
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=10000)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset

batch_size = 1000
train_dataset = data_generator(X_train, y_train_reshaped, batch_size)
# Metrics do not depend on example order, so shuffling the held-out set only
# costs time; leaving it unshuffled also keeps predictions aligned with y_test.
test_dataset = data_generator(X_test, y_test_reshaped, batch_size, shuffle=False)

# Train the model, reporting loss/accuracy on the held-out set after each epoch
history = model.fit(train_dataset, epochs=20, validation_data=test_dataset)



In [None]:
# Score the trained model on the held-out batches; with the single compiled
# metric, evaluate() yields the loss followed by the accuracy.
loss, accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")

In [None]:
# Save the model. Note: the .h5 extension selects the legacy HDF5 format, not
# the native ".keras" format; the filename is kept so existing loaders still work.
model.save('word_error_correction_model.h5')

# The network only understands the tokenizer's character indices, so the saved
# model cannot be used in a fresh session without the fitted vocabulary.
# Persist it next to the model; reload it with
# tf.keras.preprocessing.text.tokenizer_from_json().
with open('word_error_correction_tokenizer.json', 'w', encoding='utf-8') as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())