In [None]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense, SimpleRNN
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

from sklearn.linear_model import LogisticRegression

from tensorflow.keras.layers import GRU

# Load and Clean

In [None]:
df = pd.read_excel("email_auto_responder_dataset_17000.xlsx")

In [None]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def strip_polite_suffix(text):
    suffixes = ['asap', 'thanks', 'urgently', 'please assist', 'now', 'if possible', 'please']
    words = text.split()
    while words and words[-1] in suffixes:
        words.pop()
    return ' '.join(words)

df['customer_email'] = df['customer_email'].apply(clean_text)
df['support_reply'] = df['support_reply'].apply(lambda x: strip_polite_suffix(clean_text(x)))
df.dropna(subset=['customer_email', 'support_reply'], inplace=True)


# Label Encoding

In [None]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['support_reply'])
num_classes = len(label_encoder.classes_)


# Tokenization

In [None]:
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['customer_email'])
sequences = tokenizer.texts_to_sequences(df['customer_email'])
padded = pad_sequences(sequences, maxlen=50, padding='post')
vocab_size = min(len(tokenizer.word_index) + 1, 5000)


# Train/Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(padded, df['label'], test_size=0.2, random_state=42)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

# SVM

In [None]:
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(df['customer_email'], df['label'], test_size=0.2, random_state=42)

svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('svm', SVC(kernel='linear', probability=True))
])
svm_pipeline.fit(X_train_svm, y_train_svm)
y_pred_svm = svm_pipeline.predict(X_test_svm)
print("✅ SVM Accuracy:", accuracy_score(y_test_svm, y_pred_svm))
print("📊 SVM Classification Report:\n", classification_report(y_test_svm, y_pred_svm, target_names=label_encoder.classes_))


✅ SVM Accuracy: 0.9514705882352941
📊 SVM Classification Report:
                                                                                                    precision    recall  f1-score   support

            please follow the instructions in the email to unlock your account or contact support       0.94      1.00      0.97       303
please follow the instructions in the email to unlock your account or contact support if possible       0.00      0.00      0.00        19
                                           sure please click the forgot password link to reset it       0.96      1.00      0.98       334
                               sure please click the forgot password link to reset it if possible       0.00      0.00      0.00        13
                      we apologize for the inconvenience please return the item for a replacement       0.95      1.00      0.98       335
          we apologize for the inconvenience please return the item for a replacement if possible   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# RNN

In [None]:
rnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64),
    SimpleRNN(64),
    Dense(num_classes, activation='softmax')
])
rnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
rnn_model.fit(X_train, y_train_cat, epochs=5, batch_size=64, validation_split=0.1, verbose=0)
rnn_preds = np.argmax(rnn_model.predict(X_test), axis=1)
print("\n✅ RNN Accuracy:", accuracy_score(y_test, rnn_preds))
print("📊 RNN Classification Report:\n", classification_report(y_test, rnn_preds, target_names=label_encoder.classes_, labels=np.unique(rnn_preds)))


[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step

✅ RNN Accuracy: 0.9514705882352941
📊 RNN Classification Report:
                                                                                                    precision    recall  f1-score   support

            please follow the instructions in the email to unlock your account or contact support       0.94      1.00      0.97       303
please follow the instructions in the email to unlock your account or contact support if possible       0.96      1.00      0.98       334
                                           sure please click the forgot password link to reset it       0.95      1.00      0.98       335
                               sure please click the forgot password link to reset it if possible       0.94      1.00      0.97       342
                      we apologize for the inconvenience please return the item for a replacement       0.94      1.00      0.97       332
          we apologize f



# LSTM

In [None]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.4),
    Dense(num_classes, activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
model.fit(X_train, y_train_cat, epochs=20, batch_size=64, validation_split=0.1, callbacks=[early_stop], verbose=1)

y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
print("\n✅ LSTM Accuracy:", accuracy_score(y_test, y_pred))
print("📊 LSTM Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Epoch 1/20
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 77ms/step - accuracy: 0.5653 - loss: 1.6921 - val_accuracy: 0.9434 - val_loss: 0.2701
Epoch 2/20
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 96ms/step - accuracy: 0.9480 - loss: 0.2690 - val_accuracy: 0.9434 - val_loss: 0.2398
Epoch 3/20
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 76ms/step - accuracy: 0.9476 - loss: 0.2403 - val_accuracy: 0.9434 - val_loss: 0.2266
Epoch 4/20
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 74ms/step - accuracy: 0.9498 - loss: 0.2248 - val_accuracy: 0.9434 - val_loss: 0.2244
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step

✅ LSTM Accuracy: 0.9514705882352941
📊 LSTM Classification Report:
                                                                                                    precision    recall  f1-score   support

            please follow the instructions in the email to u

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Logistic Regression

In [None]:

logreg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('logreg', LogisticRegression(max_iter=1000, random_state=42))
])

logreg_pipeline.fit(X_train_svm, y_train_svm)
y_pred_logreg = logreg_pipeline.predict(X_test_svm)

logreg_acc = accuracy_score(y_test_svm, y_pred_logreg)
print("✅ Logistic Regression Accuracy:", logreg_acc)
print("📊 Logistic Regression Classification Report:\n", classification_report(y_test_svm, y_pred_logreg, target_names=label_encoder.classes_))

✅ Logistic Regression Accuracy: 0.9514705882352941
📊 Logistic Regression Classification Report:
                                                                                                    precision    recall  f1-score   support

            please follow the instructions in the email to unlock your account or contact support       0.94      1.00      0.97       303
please follow the instructions in the email to unlock your account or contact support if possible       0.00      0.00      0.00        19
                                           sure please click the forgot password link to reset it       0.96      1.00      0.98       334
                               sure please click the forgot password link to reset it if possible       0.00      0.00      0.00        13
                      we apologize for the inconvenience please return the item for a replacement       0.95      1.00      0.98       335
          we apologize for the inconvenience please return the item 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# GRU

In [None]:
gru_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=50),
    GRU(64),
    Dense(num_classes, activation='softmax')
])

gru_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

gru_model.fit(X_train, y_train_cat, epochs=5, batch_size=64, validation_split=0.1, verbose=1)

y_pred_gru = gru_model.predict(X_test)
y_pred_gru_labels = np.argmax(y_pred_gru, axis=1)

gru_acc = accuracy_score(y_test, y_pred_gru_labels)
print("✅ GRU Accuracy:", gru_acc)
print("📊 GRU Classification Report:\n", classification_report(y_test, y_pred_gru_labels, target_names=label_encoder.classes_))

Epoch 1/5




[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 46ms/step - accuracy: 0.1096 - loss: 2.6117 - val_accuracy: 0.8338 - val_loss: 1.1439
Epoch 2/5
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 51ms/step - accuracy: 0.9414 - loss: 0.5945 - val_accuracy: 0.9434 - val_loss: 0.2674
Epoch 3/5
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 53ms/step - accuracy: 0.9479 - loss: 0.2411 - val_accuracy: 0.9434 - val_loss: 0.2368
Epoch 4/5
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 53ms/step - accuracy: 0.9442 - loss: 0.2324 - val_accuracy: 0.9434 - val_loss: 0.2294
Epoch 5/5
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 48ms/step - accuracy: 0.9459 - loss: 0.2212 - val_accuracy: 0.9434 - val_loss: 0.2261
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
✅ GRU Accuracy: 0.9514705882352941
📊 GRU Classification Report:
                                                       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
