In [1]:
pip install flask

Note: you may need to restart the kernel to use updated packages.


In [6]:
import logging
import re

import joblib
import nest_asyncio
import pandas as pd
import requests
import tensorflow as tf
from flask import Flask, request, jsonify
from tqdm import tqdm
from transformers import RobertaTokenizer, TFRobertaModel
from validate_email import validate_email

# Apply the asyncio patch for Jupyter
nest_asyncio.apply()

# Initialize Flask app
app = Flask(__name__)

# Load models and tokenizer
rf_model = joblib.load('random_forest_model.pkl')
meta_classifier = joblib.load('meta_classifier_model.pkl')
roberta_model = TFRobertaModel.from_pretrained('roberta_model')
tokenizer = RobertaTokenizer.from_pretrained('roberta_tokenizer')

# Load metadata columns
with open('metadata_columns.txt', 'r') as f:
    metadata_columns = f.read().splitlines()

# Function to validate email with timeout handling
def validate_email_with_timeout(email):
    try:
        return validate_email(email_address=email, check_format=True, check_blacklist=True, check_dns=True, check_smtp=False, smtp_debug=False)
    except Exception as e:
        return False

# Function to encode texts using RoBERTa tokenizer
def encode_texts(texts, tokenizer, max_length=128):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='tf')

# Function to generate RoBERTa embeddings
def get_roberta_embeddings(encoded_texts, roberta_model, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(encoded_texts['input_ids']), batch_size), desc="Generating embeddings"):
        batch = {key: val[i:i+batch_size] for key, val in encoded_texts.items()}
        outputs = roberta_model(batch)
        embeddings.append(outputs.last_hidden_state[:, 0, :].numpy())
    return tf.convert_to_tensor(np.concatenate(embeddings, axis=0))

@app.route('/predict', methods=['POST'])
def predict():
    data = request.json
    df = pd.DataFrame(data)
    
    # Data preprocessing
    df['sender_email'] = df['sender'].apply(lambda x: re.findall(r'<(.*?)>', x)[0] if re.findall(r'<(.*?)>', x) else x)
    df['date'] = pd.to_datetime(df['date'], errors='coerce', utc=True)
    df = df.dropna(subset=['date'])
    df['domain'] = df['sender_email'].apply(lambda x: x.split('@')[-1])
    df['day_of_week'] = df['date'].dt.dayofweek
    df['hour'] = df['date'].dt.hour
    
    # Validate emails
    df['email_validity'] = df['sender_email'].apply(validate_email_with_timeout)
    
    # Encode texts
    encoded_subjects = encode_texts(df['subject'].astype(str).tolist(), tokenizer, max_length=64)
    encoded_bodies = encode_texts(df['body'].astype(str).tolist(), tokenizer, max_length=64)
    
    # Generate embeddings
    embeddings_subjects = get_roberta_embeddings(encoded_subjects, roberta_model, batch_size=16)
    embeddings_bodies = get_roberta_embeddings(encoded_bodies, roberta_model, batch_size=16)
    embeddings = tf.concat([embeddings_subjects, embeddings_bodies], axis=1)
    
    # Prepare metadata features
    X_meta = df[['domain', 'day_of_week', 'hour', 'email_validity', 'urls']]
    X_meta = pd.get_dummies(X_meta)
    
    # Ensure the columns match those used during training
    X_meta = X_meta.reindex(columns=metadata_columns, fill_value=0)
    X_meta.columns = X_meta.columns.astype(str)
    
    # Combine metadata features and embeddings
    meta_features = pd.concat([pd.DataFrame(embeddings.numpy()), pd.DataFrame(X_meta.reset_index(drop=True))], axis=1)
    meta_features.columns = meta_features.columns.astype(str)
    
    # Predict using the meta-classifier
    y_pred = meta_classifier.predict(meta_features)
    
    # Send alert to Wazuh
    for index, row in df.iterrows():
        alert_data = {
            "rule": {
                "level": 10,
                "description": "Email classified as spam" if y_pred[index] == 1 else "Email classified as not spam",
            },
            "agent": {
                "id": "001",
                "name": "EmailClassifier",
            },
            "manager": {
                "name": "LocalManager"
            },
            "id": "123456",
            "full_log": row.to_json(),
        }
        headers = {'Content-Type': 'application/json'}
        response = requests.post('http://wazuh-manager:55000/alerts', headers=headers, json=alert_data)
        print(f"Alert sent to Wazuh: {response.status_code}")
    
    return jsonify({'predictions': y_pred.tolist()})

# Run the Flask app on a different port
app.run(port=5003, debug=True)


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at roberta_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5003
[33mPress CTRL+C to quit[0m
 * Restarting with stat
Traceback (most recent call last):
  File "/Users/abd/.pyenv/versions/3.8.10/lib/python3.8/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/abd/.pyenv/versions/3.8.10/lib/python3.8/site-packages/traitlets/config/application.py", line 1074, in launch_instance
    app.initialize(argv)
  File "/Users/abd/.pyenv/versions/3.8.10/lib/python3.8/site-packages/traitlets/config/application.py", line 118, in inner
    return method(app, *args, **kwargs)
  File "/Users/abd/.pyenv/versions/3.8.10/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 692, in initialize
    self.init_sockets()
  File "/Users/abd/.pyenv/versions/3.8.10/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 331, in init_sockets
    self.shell_port = self._bind_socket(self.shell_socket, self.shell_port)
  File "/Users/abd/.pyenv/versions/3.8.10/lib/python3.8/site-package

SystemExit: 1