In [2]:
import pyreadr
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.regularizers import l2

# Load the .RData files
result_clean = pyreadr.read_r("/Users/wentaozhang/Documents/GitHub/module-2-group4/data/claims-clean-example.RData")
result_test = pyreadr.read_r("/Users/wentaozhang/Documents/GitHub/module-2-group4/data/claims-test.RData")

# Extract objects as pandas DataFrames
claims_clean = result_clean["claims_clean"]
claims_test = result_test["claims_test"]

# Preprocess text data
def preprocess_text(df, text_column):
    return df[text_column].str.replace(r"<.*?>", " ", regex=True) \
                          .str.replace(r"\s+", " ", regex=True) \
                          .str.strip()

claims_clean["text_clean"] = preprocess_text(claims_clean, "text_tmp")
claims_test["text_clean"] = preprocess_text(claims_test, "text_tmp")

# Binary labels
claims_clean["bclass"] = claims_clean["bclass"].astype("category").cat.codes

# Tokenize text data
tokenizer = Tokenizer(num_words=10000)  # Use top 10,000 words
tokenizer.fit_on_texts(claims_clean["text_clean"])

# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(claims_clean["text_clean"])
test_sequences = tokenizer.texts_to_sequences(claims_test["text_clean"])

# Pad sequences
maxlen = 200
train_padded = pad_sequences(train_sequences, maxlen=maxlen, padding="post")
test_padded = pad_sequences(test_sequences, maxlen=maxlen, padding="post")

# Convert labels to categorical
train_labels = tf.keras.utils.to_categorical(claims_clean["bclass"], num_classes=2)

# Define the binary classification model
model = Sequential([
    Embedding(input_dim=10000, output_dim=100, input_length=maxlen),
    LSTM(units=128, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=l2(0.01)),
    Dense(units=64, activation="relu", kernel_regularizer=l2(0.01)),
    Dense(units=2, activation="softmax")  # Binary classification
])

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model
history = model.fit(
    train_padded,
    train_labels,
    validation_split=0.2,
    epochs=10,
    batch_size=64,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3)]
)

# Predict on the test set
predictions = model.predict(test_padded)

# Convert probabilities to binary predictions
predicted_classes = np.argmax(predictions, axis=1)

# Save predictions to a CSV file
pred_df = pd.DataFrame({
    ".id": claims_test[".id"],
    "bclass.pred": predicted_classes
})
pred_df.to_csv("/Users/wentaozhang/Documents/GitHub/module-2-group4/results/predictions_binary.csv", index=False)

# Save the trained model
model.save("/Users/wentaozhang/Documents/GitHub/module-2-group4/results/model_binary.h5")

print("Model and predictions saved successfully.")


Epoch 1/10




[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 221ms/step - accuracy: 0.5216 - loss: 2.8531 - val_accuracy: 0.6776 - val_loss: 1.8961
Epoch 2/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 243ms/step - accuracy: 0.6630 - loss: 1.6839 - val_accuracy: 0.5841 - val_loss: 1.2340
Epoch 3/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 198ms/step - accuracy: 0.7118 - loss: 1.1077 - val_accuracy: 0.6752 - val_loss: 0.8964
Epoch 4/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 200ms/step - accuracy: 0.7850 - loss: 0.7616 - val_accuracy: 0.7033 - val_loss: 0.7549
Epoch 5/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 221ms/step - accuracy: 0.8106 - loss: 0.6255 - val_accuracy: 0.6986 - val_loss: 0.6881
Epoch 6/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 224ms/step - accuracy: 0.8066 - loss: 0.5343 - val_accuracy: 0.7360 - val_loss: 0.6352
Epoch 7/10
[1m27/27[0m [32m━━━━━━━━━



Model and predictions saved successfully.


- **Accuracy Improvement**:
  - Training accuracy started at approximately 52.16% and improved to 85.49% by the 10th epoch.
  - This indicates the model is learning and capturing patterns effectively from the training data.

- **Loss Reduction**:
  - The training loss began at 2.8531 and decreased significantly to 0.4033 by the final epoch.
  - A decrease in loss suggests the model is minimizing the error in its predictions over the training data.

- **Validation Accuracy and Loss**:
  - Validation accuracy started at 67.76% and ended at 72.90%.
  - Validation loss decreased consistently, ending at 0.6575, which indicates the model generalizes relatively well on unseen data.
  - However, validation accuracy is slightly lower than training accuracy, which might suggest some overfitting.
