In [None]:
# Import necessary libraries/packages etc. 
import pandas as pd
import numpy as np
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Input, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords

# Download English stopwords (once only)
nltk.download('stopwords')
# NOTE(review): this NLTK-based set is rebound to spaCy's STOP_WORDS in the
# preprocessing cell before anything reads it, so it looks unused — confirm.
stop_words = set(stopwords.words('english'))

import spacy
# Install the small English spaCy pipeline only when it is not installed yet.
# Calling spacy.cli.download() unconditionally starts a pip install on every
# execution of this cell, which is slow and needs network access.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")  # Load the English pipeline into memory for later use

# Use a pre trained vectorizer 
import fasttext
import fasttext.util

# Download the FastText model for English; if_exists='ignore' is passed so that an
# already present local file is not fetched again.
fasttext.util.download_model('en', if_exists='ignore')  # Downloads 'cc.en.300.bin' to the current directory
# Module-level model handle; fasttext_embedding() reads it to look up word vectors.
fasttext_model = fasttext.load_model("cc.en.300.bin") # loads the local model into memory for usage

In [None]:
# Step 1: Read the semicolon-separated email table (columns used: 'Emailtext' and 'Department')
df = pd.read_csv('email_dataset.csv', sep=';')

# Step 2: Raw email text is the feature, the department is the label
X, y = df['Emailtext'], df['Department']

print(X)
print("\n transformed to: \n")

# Stop-word set consulted by preprocess_text (spaCy's English list).
# This replaces the stop_words value assigned in the setup cell.
stop_words = set(spacy.lang.en.stop_words.STOP_WORDS)

def preprocess_text(text):
    """Normalize one email body before it is vectorized.

    The text is lowercased, every character listed in ``string.punctuation``
    is deleted, and whitespace-separated tokens contained in the module-level
    ``stop_words`` set are dropped.

    Args:
        text: The raw email text. Missing values (``None`` or NaN, e.g. from an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        str: The remaining tokens joined by single spaces ('' if none remain).
    """
    if not isinstance(text, str):
        # pandas delivers empty CSV cells as NaN (a float), which has no .lower()
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # str.translate deletes all punctuation characters in a single pass
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    # Punctuation is removed before the stop-word lookup, so "the," still matches "the"
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every email with preprocess_text; the raw Series X itself is not modified
X_processed = X.map(preprocess_text)

print(X_processed)

In [None]:
# Check distribution of departments: number of emails per department label.
# NOTE(review): the later train/validation/test splits pass stratify=..., which
# presumably needs several samples per class — verify that no department is too rare.
class_distribution = df['Department'].value_counts()
print(class_distribution)

In [None]:
# Labels (departments) are converted to numerical values.
# The unique labels are sorted so that every run assigns the same index to the same
# department. Iterating a plain set of strings can yield a different order in each
# Python process (hash randomization), which would silently mislabel the predictions
# of a model that was saved and later reloaded in a new kernel.
# The labels are read from df['Department'] instead of y because y is overwritten with
# one-hot arrays further down in this cell; key=str keeps the sort well-defined even
# if the labels are not all strings.
department_labels = df['Department']
label_to_index = {
    label: idx
    for idx, label in enumerate(sorted(department_labels.unique(), key=str))
}  # Map label -> number
index_to_label = {idx: label for label, idx in label_to_index.items()}  # Map number -> label
numerical_labels = [label_to_index[label] for label in department_labels]  # Create numerical labels

# ### Basics ###
# FastText generates vectors for individual words, not for entire sentences or texts.
# A text (e.g., a sentence or an email) consists of multiple words, and each word has its own vector in the vector space.
# To use a text as input for a neural network, it needs to be reduced to a fixed dimension
# (e.g., a 300-dimensional vector if FastText vectors have 300 dimensions).
# --> This function therefore calculates a single vector as the representation of an entire text
def fasttext_embedding(texts, preprocess=True):
    """Turn each text into one fixed-size vector by averaging its FastText word vectors.

    Args:
        texts: Iterable of texts (e.g. a list or a pandas Series of emails).
        preprocess: When True (default) every text is first cleaned with
            ``preprocess_text`` (defined in the preprocessing cell). Doing it
            here guarantees that training data and new emails go through the
            same cleaning; previously the cleaned texts were computed but the
            raw texts were embedded. Pass False to embed the texts as given.

    Returns:
        numpy.ndarray: float32 array of shape ``(len(texts), dimension)`` where
        ``dimension`` is ``fasttext_model.get_dimension()``. A text without any
        tokens is represented by an all-zero vector.
    """
    dimension = fasttext_model.get_dimension()
    embeddings = []
    for text in texts:
        if preprocess:
            text = preprocess_text(text)
        # One vector per whitespace-separated token
        word_vecs = [fasttext_model.get_word_vector(word) for word in text.split()]
        if word_vecs:
            embeddings.append(np.mean(word_vecs, axis=0))  # Average the word vectors
        else:
            # np.mean([]) would produce a scalar NaN and break the result matrix
            embeddings.append(np.zeros(dimension, dtype=np.float32))
    # reshape keeps the (n, dimension) layout even when texts is empty
    return np.array(embeddings, dtype=np.float32).reshape(-1, dimension)

# Convert texts to FastText vectors.
# The raw column is read from df rather than from X: X is overwritten here with the
# embedding matrix, so `fasttext_embedding(X)` would receive numeric rows (and fail on
# .split()) as soon as this cell is executed a second time.
X = fasttext_embedding(df['Emailtext'])

# Convert the labels to categorical (one-hot) variables for multi-class classification
y = to_categorical(numerical_labels, num_classes=len(label_to_index))

In [None]:
# Step 3: Split the data into 70% training, 15% validation and 15% test.
# Stage one: hold back 30% of the samples, stratified by label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Stage two: halve the held-back part into a validation and a test set.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

print(f"Training data: {X_train.shape}, Validation data: {X_val.shape}, Test data: {X_test.shape}")


In [None]:
# Step 4: Create the model
# A simple feed-forward network that works on the averaged FastText vectors (dense vectors).
# The Dense layers take the vectors from the Input layer and learn patterns that separate
# the departments. The number of neurons shrinks from 128 to 64: the first layer learns
# many features, the second one condenses the most essential ones.

# Seed Python, NumPy and TensorFlow so that weight initialisation and dropout are
# repeatable from run to run; the data split is already fixed via random_state=42.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    Input(shape=(X_train.shape[1],)),  # Input size = dimension of the FastText vectors
    # Dense layers connect every neuron in their layer to every neuron in the next layer.
    # They are suitable for text classification; for image processing or time series, other layers are better suited!
    Dense(128, activation='relu'),  # First hidden layer
    Dropout(0.5),  # Randomly deactivates 50% of the neurons during training --> reduces overfitting
    # 128 and 64 are a trade-off between capacity and overfitting and a common starting point.
    Dense(64, activation='relu'),  # Second hidden layer
    Dropout(0.5),  # Another Dropout layer
    # One output neuron per class (department). Softmax turns the outputs into
    # probabilities that sum to 1; the class with the highest probability is the prediction.
    Dense(len(label_to_index), activation='softmax')  # Output layer (softmax for multi-class classification)
])

# If more capacity is required to increase accuracy, an additional Dense layer (e.g., with 32 neurons) can be added.

# Optimizer: Adam adjusts the model's weights to improve predictions.
# Loss: Categorical cross-entropy compares predicted probabilities with the one-hot labels.
# Metrics: Accuracy measures how often the model predicts the correct class.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Callbacks: both watch the validation loss.
# EarlyStopping ends training after 5 epochs without improvement and restores the
# best weights; ModelCheckpoint writes the best model seen so far to 'best_model.keras'.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
model_checkpoint = ModelCheckpoint(
    'best_model.keras',
    monitor='val_loss',
    save_best_only=True,
)

# Step 5: Train the model on the training split and validate on the validation split
# NOTE(review): X_test / y_test are not evaluated anywhere in the visible cells.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=20,       # Upper bound; early stopping may end training sooner
    batch_size=32,   # Number of samples per batch
    verbose=2,       # Display training progress
    callbacks=[early_stopping, model_checkpoint],
)

In [None]:
# Step 6: Predicting on new data
# Two example emails that should be routed to a department
new_emails = [
    "Please set up a new account for me.",
    "When is the next sales meeting?",
]

# Vectorize the emails the same way as the training data, then score them
new_X = fasttext_embedding(new_emails)
predictions = model.predict(new_X)

# The most probable class per email is the argmax along the class axis
best_class_per_email = predictions.argmax(axis=1)
for email, class_index in zip(new_emails, best_class_per_email):
    predicted_label = index_to_label[class_index]
    print(f"Email: '{email}' -> Department: {predicted_label}")


In [None]:
# Save the trained model to a .keras file so it can be loaded again later.
# NOTE(review): index_to_label is a plain Python dict and is presumably not part of
# this file — it has to be rebuilt (or stored separately) to decode predictions.
model.save("my_first_neural_network.keras")

In [None]:
# Load the saved model later (e.g. in a new session)
from tensorflow.keras.models import load_model
model = load_model("my_first_neural_network.keras")

# Re-run step 6 above to check that the loaded model works.
# NOTE(review): step 6 also reads fasttext_embedding and index_to_label, so those must
# already be defined in the session before it is executed.