# In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from spektral.layers import GraphSageConv

def get_bert_embeddings_batch(tweets, tokenizer, model, max_length=128):
    """Encode a batch of tweets and return one embedding vector per tweet.

    Args:
        tweets: Batch of tweet texts, handed to ``tokenizer`` unchanged.
        tokenizer: Callable tokenizer; invoked with padding and truncation
            enabled and ``return_tensors="tf"``.
        model: Callable encoder whose output exposes ``last_hidden_state``.
        max_length: Truncation length forwarded to the tokenizer.

    Returns:
        ``last_hidden_state[:, 0, :]`` -- the hidden state at the first token
        position of every sequence in the batch.
    """
    encoded_inputs = tokenizer(
        tweets,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
    # The vector at position 0 of each sequence serves as its embedding.
    return model(encoded_inputs).last_hidden_state[:, 0, :]

# Load the pretrained tokenizer and TensorFlow encoder from one checkpoint
# name so the two can never drift apart.
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(_BERT_CHECKPOINT)

# Scoring layer shared by every call that does not pass its own
# ``attention_layer``. Created lazily on first use.
_shared_attention_layer = None


def process_user_tweets(user_id, tweets_df, tokenizer, model, pbar, attention_layer=None):
    """Build one attention-pooled vector from all tweets of a single user.

    The user's tweets are embedded in chunks of 100 with
    ``get_bert_embeddings_batch``. Each tweet embedding is scored by a
    one-unit dense layer, the scores are soft-maxed across the user's tweets,
    and the weighted embeddings are summed.

    Previously a brand-new, randomly initialised ``Dense(1)`` layer was built
    on every call, so each user was scored with a different random projection.
    Now a single layer is reused across calls unless the caller supplies one.

    Args:
        user_id: Value matched against the ``user_id`` column of ``tweets_df``.
        tweets_df: DataFrame with at least ``user_id`` and ``text`` columns.
        tokenizer: Tokenizer forwarded to ``get_bert_embeddings_batch``.
        model: Encoder forwarded to ``get_bert_embeddings_batch``.
        pbar: Progress bar advanced by the number of tweets processed; may be
            ``None`` to disable progress reporting.
        attention_layer: Optional callable mapping the embedding matrix to one
            score per row (trailing dimension of size 1). When ``None``, a
            module-wide shared ``Dense(units=1)`` layer is used.

    Returns:
        The user-level representation tensor, or ``None`` when the user has no
        rows in ``tweets_df``.

    Note:
        The default shared layer is not trained anywhere in this function, so
        its weights stay at their initial values unless the caller trains or
        replaces the layer.
    """
    global _shared_attention_layer

    user_tweets = tweets_df.loc[tweets_df['user_id'] == user_id, 'text'].tolist()
    if not user_tweets:
        return None

    batch_size = 100
    embedding_chunks = []
    for start in range(0, len(user_tweets), batch_size):
        batch_tweets = user_tweets[start:start + batch_size]
        embedding_chunks.append(get_bert_embeddings_batch(batch_tweets, tokenizer, model))
        if pbar is not None:
            pbar.update(len(batch_tweets))
    bert_embeddings = tf.concat(embedding_chunks, axis=0)

    if attention_layer is None:
        if _shared_attention_layer is None:
            _shared_attention_layer = tf.keras.layers.Dense(units=1)
        attention_layer = _shared_attention_layer

    # One scalar score per tweet, normalised over this user's tweets (axis 0).
    scores = tf.squeeze(attention_layer(bert_embeddings), axis=-1)
    attention_weights = tf.nn.softmax(scores, axis=0)

    # Weighted sum over tweets gives the user-level representation.
    weighted = tf.expand_dims(attention_weights, axis=-1) * bert_embeddings
    return tf.reduce_sum(weighted, axis=0)

# Assumes df_train is an already-loaded DataFrame with the columns used below
# (type, user_id, text and the four *_count columns).
# Binary label: 0 for rows whose type is 'social spam', 1 for everything else.
df_train['label'] = (df_train['type'] != 'social spam').astype('int64')

df_train_sorted = df_train.sort_values(by='user_id', ascending=False)
unique_user_ids = df_train_sorted['user_id'].unique()
user_representations = []
statuses_count_list = []
followers_count_list = []
friends_count_list = []
favourites_count_list = []
labels_list = []

total_tweets = len(df_train_sorted)
with tqdm(total=total_tweets, desc='Processing all tweets') as pbar:
    # One groupby pass replaces re-masking the whole frame twice per user.
    # sort=False keeps users in order of first appearance, which matches the
    # order of unique_user_ids.
    for user_id, user_data in df_train_sorted.groupby('user_id', sort=False):
        user_repr = process_user_tweets(user_id, user_data, tokenizer, bert_model, pbar)
        if user_repr is None:
            continue
        user_representations.append(user_repr.numpy())
        # Per-user metadata is read from the user's first row.
        statuses_count_list.append(user_data['statuses_count'].iloc[0])
        followers_count_list.append(user_data['followers_count'].iloc[0])
        friends_count_list.append(user_data['friends_count'].iloc[0])
        favourites_count_list.append(user_data['favourites_count'].iloc[0])
        labels_list.append(user_data['label'].iloc[0])

# Stack into arrays; every scalar feature becomes a column vector so it can be
# sliced into batches alongside the embeddings.
user_representations = np.stack(user_representations, axis=0)
statuses_count_array = np.array(statuses_count_list).reshape(-1, 1)
followers_count_array = np.array(followers_count_list).reshape(-1, 1)
friends_count_array = np.array(friends_count_list).reshape(-1, 1)
favourites_count_array = np.array(favourites_count_list).reshape(-1, 1)
labels_array = np.array(labels_list).reshape(-1, 1)

def get_adj(enco, threshold=0.8):
    """Build a thresholded cosine-similarity adjacency matrix over rows.

    Args:
        enco: 2-D array-like of features, one row per node. Anything
            ``np.asarray`` can convert is accepted.
        threshold: Two rows are connected when their cosine similarity is
            strictly greater than this value. Defaults to 0.8.

    Returns:
        A tuple ``(adjacency_matrix, adjacency_matrix_sparse)``: the dense 0/1
        NumPy adjacency matrix and the same matrix as a float32
        ``tf.SparseTensor``.
    """
    features = np.asarray(enco)
    row_norms = np.linalg.norm(features, axis=1, keepdims=True)
    # An all-zero row would divide by zero and push NaNs into the similarity
    # computation. Dividing by 1 leaves such a row as zeros instead.
    row_norms[row_norms == 0] = 1.0
    similarity_matrix = cosine_similarity(features / row_norms)

    adjacency_matrix = np.where(similarity_matrix > threshold, 1, 0)
    adjacency_matrix_sparse = tf.sparse.from_dense(
        tf.convert_to_tensor(adjacency_matrix, dtype=tf.float32)
    )
    return adjacency_matrix, adjacency_matrix_sparse

# Define GraphSAGE model
# Symbolic inputs: a 768-wide embedding, a sparse adjacency matrix with an
# unspecified number of columns, and four single-column count features.
bert_pool = tf.keras.Input(shape=(768,), dtype=tf.float32, name="bert_pool")
adj_sp_tt = tf.keras.Input(shape=(None,), sparse=True, dtype=tf.float32, name="adj_sp_tt")
statuses_count_input = tf.keras.Input(shape=(1,), dtype=tf.float32, name="statuses_count_input")
followers_count_input = tf.keras.Input(shape=(1,), dtype=tf.float32, name="followers_count_input")
friends_count_input = tf.keras.Input(shape=(1,), dtype=tf.float32, name="friends_count_input")
favourites_count_input = tf.keras.Input(shape=(1,), dtype=tf.float32, name="favourites_count_input")

# Concatenate the attention-weighted representation with other features
# The four count columns come first, followed by the 768 embedding columns,
# giving 4 + 768 = 772 features per row.
rest_enc = tf.keras.layers.Concatenate()([statuses_count_input, followers_count_input, friends_count_input, favourites_count_input])
concatenated = tf.keras.layers.Concatenate()([rest_enc, bert_pool])

# Define GraphSAGE convolutional layer
# The layer receives the feature matrix together with the sparse adjacency.
# NOTE(review): channels=782 while the concatenated input is 772 wide --
# confirm 782 is the intended output width and not a typo for 772.
gso = GraphSageConv(channels=782)([concatenated, adj_sp_tt])

# Dense head: ReLU projection, then batch normalisation and 30% dropout.
do = tf.keras.layers.Dense(782, activation="relu", name="Dense_Layer_1")(gso)
do = tf.keras.layers.BatchNormalization()(do)
do = tf.keras.layers.Dropout(0.3)(do)

# Output layer
# Two-way softmax, paired with the one-hot targets and categorical
# cross-entropy loss configured below.
fdo = tf.keras.layers.Dense(2, activation="softmax", name="Dense_Layer_2")(do)

# Define the model
# The input order here is the order in which tensors must be passed when the
# model is called.
model = tf.keras.Model(inputs=[bert_pool, adj_sp_tt, statuses_count_input, followers_count_input, friends_count_input, favourites_count_input], outputs=fdo)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
loss_fn = tf.keras.losses.CategoricalCrossentropy()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

def create_batches(data, batch_size=32):
    """Split ``data`` along axis 0 into equally sized batches.

    Trailing samples that do not fill a complete batch are dropped.

    Args:
        data: Array exposing ``.shape`` and supporting slicing along axis 0.
        batch_size: Number of rows per batch.

    Returns:
        A list of slices of ``data``, each with exactly ``batch_size`` rows.
        The list is empty when ``data`` holds fewer than ``batch_size`` rows;
        the previous ``np.array_split`` call raised ``ValueError`` there.
    """
    num_samples = data.shape[0]
    num_batches = num_samples // batch_size
    usable = num_batches * batch_size
    return [data[start:start + batch_size] for start in range(0, usable, batch_size)]

# Training arrays: user embeddings, labels and the four count features.
X_data = user_representations
X_out = labels_array
statuses_count = statuses_count_array
followers_count = followers_count_array
friends_count = friends_count_array
favourites_count = favourites_count_array

# Every array is cut into aligned batches of 32 rows; create_batches drops
# the incomplete tail, so all lists have the same length.
X_data_batches = create_batches(X_data, batch_size=32)
X_out_batches = create_batches(X_out, batch_size=32)
X_statuses_count_batches = create_batches(statuses_count, batch_size=32)
X_followers_count_batches = create_batches(followers_count, batch_size=32)
X_friends_count_batches = create_batches(friends_count, batch_size=32)
X_favourites_count_batches = create_batches(favourites_count, batch_size=32)

if not X_data_batches:
    raise ValueError("Not enough samples to form a single training batch")

# Train the model
num_epochs = 5
device = '/GPU:0' if tf.config.experimental.list_physical_devices('GPU') else '/CPU:0'

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    total_correct = 0.0
    total_samples = 0
    total_loss = 0.0

    batch_stream = zip(
        X_data_batches,
        X_out_batches,
        X_statuses_count_batches,
        X_followers_count_batches,
        X_friends_count_batches,
        X_favourites_count_batches,
    )
    for data_np, out_np, statuses_np, followers_np, friends_np, favourites_np in tqdm(
        batch_stream, total=len(X_data_batches)
    ):
        batch_data = tf.convert_to_tensor(data_np, dtype=tf.float32)
        batch_out = tf.convert_to_tensor(out_np, dtype=tf.int32)
        batch_statuses = tf.convert_to_tensor(statuses_np, dtype=tf.float32)
        batch_followers = tf.convert_to_tensor(followers_np, dtype=tf.float32)
        batch_friends_count = tf.convert_to_tensor(friends_np, dtype=tf.float32)
        batch_favourites_count = tf.convert_to_tensor(favourites_np, dtype=tf.float32)

        # The graph for this batch is built from the embedding and count
        # features together.
        enc = tf.concat(
            [batch_data, batch_statuses, batch_followers, batch_friends_count, batch_favourites_count],
            axis=-1,
        )
        _, adj_sparse = get_adj(enc)

        # Flatten labels from (batch, 1) to (batch,). Comparing (batch,)
        # predictions against (batch, 1) labels would broadcast to
        # (batch, batch) and inflate the correct count.
        batch_labels = tf.reshape(batch_out, [-1])
        batch_out_one_hot = tf.one_hot(batch_labels, depth=2)

        with tf.device(device):
            # Record the forward pass so the loss can be differentiated;
            # without the tape and optimizer step the weights never change.
            with tf.GradientTape() as tape:
                predictions = model(
                    [batch_data, adj_sparse, batch_statuses, batch_followers, batch_friends_count, batch_favourites_count],
                    training=True,
                )
                loss_value = loss_fn(batch_out_one_hot, predictions)
            gradients = tape.gradient(loss_value, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        batch_count = int(batch_data.shape[0])
        predicted_labels = tf.argmax(predictions, axis=1, output_type=tf.int32)
        batch_correct = tf.reduce_sum(tf.cast(tf.equal(predicted_labels, batch_labels), tf.float32))

        total_loss += float(loss_value) * batch_count
        total_correct += float(batch_correct)
        total_samples += batch_count

    # Average over the samples actually seen; the dropped tail is excluded.
    epoch_accuracy = total_correct / total_samples
    epoch_loss = total_loss / total_samples
    print(f"Training Accuracy: {epoch_accuracy:.4f}")
    print(f"Training Loss: {epoch_loss:.4f}")
