<h3>Importing Necessary Packages</h3>

In [1]:
import re
import os
import csv
import torch
import random
import string
import numpy as np
import unicodedata
import contractions
import pandas as pd
from io import StringIO
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from torch_geometric.transforms import RandomNodeSplit, RandomLinkSplit
from torch_geometric.nn import GCNConv, GATConv,GraphSAGE
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


<h3>Importing raw twitter data</h3>

In [2]:
TWITTER_RAW_DATA = os.path.join(os.getcwd(), "TwitterDataset.csv")
TWITTER_RAW_DATA

'd:\\Projects\\Machine Learning\\Few-Shot-GNN-LLM\\data-related\\TwitterDataset.csv'

<h3>Text regularization</h3>

In [3]:
def text_regularization(text):
    try:
        text = str(text)
        # Convert to lowercase
        text = text.lower()

        # Expand contractions, can't => cannot
        text = contractions.fix(text)

        # Remove punctuations
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Remove special characters
        text = re.sub(r'[^a-z\s]', '', text)

        # Normalize accented characters "café" → "cafe"
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

        # Remove extra white spaces
        text = " ".join(text.split())
        return text
    except Exception as e:
        print('text_regularization', e)
        

<h3>Preprocessing raw twitter data to find features and labels</h3>

In [4]:
def preprocess_data(dataset_path = TWITTER_RAW_DATA):
    try:
        twitter_raw_data = pd.read_csv(dataset_path, encoding="ISO-8859-1")
        twitter_data = twitter_raw_data[twitter_raw_data["gender:confidence"] >= 0.5]
        columns_to_keep = ['gender', 'description']
        columns_to_drop = [col for col in twitter_data if col not in columns_to_keep]
        twitter_data = twitter_data.drop(columns=columns_to_drop)
        twitter_data = twitter_data.dropna()
        twitter_data = twitter_data[twitter_data['gender'].isin(['male', 'female'])]
        feature = twitter_data['description']
        feature = [text_regularization(each) for each in feature]
        label = twitter_data['gender'].to_list()
        return [feature, label]
    except Exception as e:
        print('preprocess_data', e)
data = preprocess_data()

<h3>Path for pre processed twitter data</>

In [5]:
TWITTER_PROCESSED_DATA = os.path.join(os.getcwd(), "TwitterProcessedDataset.csv")

<h3>Storing pre processed twitter data</h3>

In [6]:
df = pd.DataFrame(data[0], columns=['feature'])
df['label'] = data[1]
df.to_csv(TWITTER_PROCESSED_DATA, index=False)

<h3>Reading pre processed twitter data</h3>

In [7]:
twitter_data = pd.read_csv(TWITTER_PROCESSED_DATA)  
twitter_data = twitter_data.dropna()

<h3>Sentence Bert for feature embedding</h3>

In [8]:
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [9]:
feature_embeddings  = sbert_model.encode(twitter_data['feature'].tolist(), convert_to_tensor=True)

In [None]:
feature_embeddings

tensor([[-0.0030, -0.0812,  0.0170,  ...,  0.0391,  0.0038, -0.1770],
        [ 0.0254, -0.0611,  0.0312,  ..., -0.0036,  0.0272, -0.0790],
        [ 0.0242, -0.0003,  0.0522,  ..., -0.0283, -0.0319,  0.0106],
        ...,
        [-0.0991,  0.0148,  0.0329,  ...,  0.0250,  0.0063, -0.0639],
        [-0.0535, -0.0089,  0.0194,  ...,  0.1068, -0.0364,  0.0584],
        [-0.0138,  0.0522,  0.0515,  ...,  0.0227, -0.0691, -0.0231]],
       device='cuda:0')

<h3>Building edges using cosine similarity to generate graph</h3>

In [12]:
def build_edges(features, threshold=0.5):
    similarity_matrix = cosine_similarity(features)
    edges = []

    for i in range(len(features)):
        for j in range(i, len(features)):
            if similarity_matrix[i][j] > threshold:
                edges.append((i, j))
                edges.append((j, i))
    return edges

edges = build_edges(feature_embeddings.to("cpu"), threshold=0.8)
edges

[(0, 0),
 (0, 0),
 (1, 1),
 (1, 1),
 (2, 2),
 (2, 2),
 (3, 3),
 (3, 3),
 (4, 4),
 (4, 4),
 (5, 5),
 (5, 5),
 (6, 6),
 (6, 6),
 (7, 7),
 (7, 7),
 (8, 8),
 (8, 8),
 (9, 9),
 (9, 9),
 (10, 10),
 (10, 10),
 (11, 11),
 (11, 11),
 (12, 12),
 (12, 12),
 (13, 13),
 (13, 13),
 (14, 14),
 (14, 14),
 (15, 15),
 (15, 15),
 (16, 16),
 (16, 16),
 (17, 17),
 (17, 17),
 (18, 18),
 (18, 18),
 (19, 19),
 (19, 19),
 (20, 20),
 (20, 20),
 (21, 21),
 (21, 21),
 (22, 22),
 (22, 22),
 (23, 23),
 (23, 23),
 (24, 24),
 (24, 24),
 (25, 25),
 (25, 25),
 (26, 26),
 (26, 26),
 (27, 27),
 (27, 27),
 (28, 28),
 (28, 28),
 (29, 29),
 (29, 29),
 (30, 30),
 (30, 30),
 (31, 31),
 (31, 31),
 (32, 32),
 (32, 32),
 (33, 33),
 (33, 33),
 (34, 34),
 (34, 34),
 (35, 35),
 (35, 35),
 (36, 36),
 (36, 36),
 (37, 37),
 (37, 37),
 (38, 38),
 (38, 38),
 (39, 39),
 (39, 39),
 (40, 40),
 (40, 40),
 (41, 41),
 (41, 41),
 (42, 42),
 (42, 42),
 (42, 4540),
 (4540, 42),
 (43, 43),
 (43, 43),
 (44, 44),
 (44, 44),
 (45, 45),
 (45, 45),
 (

<h3>Converting labels to numeric value</h3>

In [13]:
label_encoder = LabelEncoder()
label_encoded = label_encoder.fit_transform(twitter_data['label'].tolist())
label_tensor = torch.tensor(label_encoded, dtype=torch.long).unsqueeze(1)
label_tensor

tensor([[1],
        [1],
        [1],
        ...,
        [1],
        [0],
        [0]])

In [14]:
node_features = feature_embeddings
node_features

tensor([[-0.0030, -0.0812,  0.0170,  ...,  0.0391,  0.0038, -0.1770],
        [ 0.0254, -0.0611,  0.0312,  ..., -0.0036,  0.0272, -0.0790],
        [ 0.0242, -0.0003,  0.0522,  ..., -0.0283, -0.0319,  0.0106],
        ...,
        [-0.0991,  0.0148,  0.0329,  ...,  0.0250,  0.0063, -0.0639],
        [-0.0535, -0.0089,  0.0194,  ...,  0.1068, -0.0364,  0.0584],
        [-0.0138,  0.0522,  0.0515,  ...,  0.0227, -0.0691, -0.0231]],
       device='cuda:0')

<h3>Converting edges to edge index to make graph data</h3>

In [15]:
edge_index = torch.tensor(edges, dtype=torch.long)
edge_index

tensor([[    0,     0],
        [    0,     0],
        [    1,     1],
        ...,
        [10748, 10748],
        [10749, 10749],
        [10749, 10749]])

In [16]:
node_label = label_tensor

In [17]:
edge_index = edge_index.t().contiguous()

<h3>Making graph data to train and evaluate</h3>

In [18]:
graph_data = Data(x=node_features, edge_index=edge_index, y=node_label)
graph_data

Data(x=[10750, 384], edge_index=[2, 23808], y=[10750, 1])

<h3>Splitting data in to train, validation and test</h3>

In [19]:
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

splitter = RandomNodeSplit(split="train_rest", num_val=0.2, num_test=0.2)
graph_data = splitter(graph_data)


In [20]:
# Access masks for each set
train_mask = graph_data.train_mask
val_mask = graph_data.val_mask
test_mask = graph_data.test_mask


In [21]:
graph_data.edge_index

tensor([[    0,     0,     1,  ..., 10748, 10749, 10749],
        [    0,     0,     1,  ..., 10748, 10749, 10749]])

In [22]:
train_nodes = train_mask.nonzero().flatten()
val_nodes = val_mask.nonzero().flatten()
test_nodes = test_mask.nonzero().flatten()
print(train_nodes.shape, val_nodes.shape, test_nodes.shape)

torch.Size([6450]) torch.Size([2150]) torch.Size([2150])


<h3>Splitting edges to train, validation and test accourding to train, validation and test nodes</h3>

In [23]:

edge_splitter = RandomLinkSplit(num_val=0.2, num_test=0.2)
train_data, val_data, test_data = edge_splitter(graph_data)

In [24]:
print(train_data)
print(val_data)
print(test_data)

Data(x=[10750, 384], edge_index=[2, 14286], y=[10750, 1], train_mask=[10750], val_mask=[10750], test_mask=[10750], edge_label=[28572], edge_label_index=[2, 28572])
Data(x=[10750, 384], edge_index=[2, 14286], y=[10750, 1], train_mask=[10750], val_mask=[10750], test_mask=[10750], edge_label=[9522], edge_label_index=[2, 9522])
Data(x=[10750, 384], edge_index=[2, 19047], y=[10750, 1], train_mask=[10750], val_mask=[10750], test_mask=[10750], edge_label=[9522], edge_label_index=[2, 9522])


<h3>Parameters for GCNConv model</h3>

In [25]:
input_dim = graph_data.num_node_features
hidden_dim = 16
output_dim = 1


<h3>GCNConv model to evaluate twitter data</h3>

In [None]:
# Define the Graph Neural Network model
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x.view(-1)

In [80]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)
model = GCN(input_dim, hidden_dim, output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
loss_fn = torch.nn.BCEWithLogitsLoss()  # Binary classification loss

AttributeError: 'Series' object has no attribute 'to'

<h3>Training function, calculating loss</h3>

In [28]:
# Training function
def train():
    model.train()
    optimizer.zero_grad()
    out = model(train_data.x, train_data.edge_index)
    loss = loss_fn(out[train_data.train_mask], train_data.y[train_data.train_mask].squeeze().float())
    # loss = F.nll_loss(out[train_data.train_mask], train_data.y[train_data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

<h3>Evaluating model and calculating Accuracy, precision, recall and f1 score</h3>

In [29]:
def evaluate(data):
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():  # Disable gradient computation
        
        # Forward pass
        logits = model(data.x, data.edge_index)
        
        # Convert logits to probabilities using sigmoid
        probs = torch.sigmoid(logits)
        
        # Convert probabilities to binary predictions (0 or 1)
        preds = (probs > 0.5).cpu().numpy()
        
        preds = preds[data.test_mask.cpu().numpy()]

        y_true = data.y[data.test_mask].squeeze().cpu().numpy()        
        # Compute accuracy
        acc = accuracy_score(y_true, preds)
        precision = precision_score(y_true, preds, average="binary")
        recall = recall_score(y_true, preds, average="binary")
        f1 = f1_score(y_true, preds, average="binary")
        return acc, precision, recall, f1

<h3>Training for multiple epochs</h3>

In [30]:
epochs = 200
best_val_acc = 0
patience = 10  # Stop if validation accuracy does not improve for 10 epochs
wait = 0
best_model_state = None
# Training loop
for epoch in range(200):
    loss = train()
    train_acc, _, _, _ = evaluate(train_data)
    val_acc, _, _, _ = evaluate(val_data)
    test_acc, _, _, _ = evaluate(test_data)

    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model_state = model.state_dict()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch}. Best validation accuracy: {best_val_acc:.4f}")
            break

    if epoch % 20 == 0:
        result = evaluate(test_data)
        print(f'Epoch {epoch}, Loss: {loss:.4f}, Test Accuracy: {result[0]:.4f}')

# Final evaluation
acc, precision, recall, f1 = evaluate(test_data)
print(f'Final Test Accuracy: {acc:.4f}')
print(f'Final Test Precision: {precision:.4f}')
print(f'Final Test Recall: {recall:.4f}')
print(f'Final Test F1: {f1:.4f}')

Epoch 0, Loss: 0.6954, Test Accuracy: 0.5758
Epoch 20, Loss: 0.5910, Test Accuracy: 0.6637
Epoch 40, Loss: 0.5683, Test Accuracy: 0.6693
Epoch 60, Loss: 0.5610, Test Accuracy: 0.6716
Early stopping at epoch 71. Best validation accuracy: 0.6726
Final Test Accuracy: 0.6693
Final Test Precision: 0.6757
Final Test Recall: 0.6170
Final Test F1: 0.6450


<h3>Class for GAT model</h3>

In [31]:
# Define GAT Model for Binary Classification
class GAT(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, heads=8):
        super(GAT, self).__init__()
        # First GAT layer (multi-head attention)
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=0.6)
        # Second GAT layer (single-head for binary output)
        self.conv2 = GATConv(hidden_channels * heads, 1, heads=1, concat=False, dropout=0.6)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.elu(x)  # Use ELU activation function
        x = self.conv2(x, edge_index)
        return x.view(-1)  # Output raw logits (no sigmoid here)


<h3>Parameters for GAT model</h3>

In [32]:
input_dim = train_data.num_node_features
hidden_dim = 64

In [33]:
# Initialize GAT model
model = GAT(in_channels=input_dim, hidden_channels=hidden_dim).to(device)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
loss_fn = torch.nn.BCEWithLogitsLoss()

<h3>Training for multiple epochs</h3>

In [34]:
epochs = 200
best_val_acc = 0
patience = 10  # Stop if validation accuracy does not improve for 10 epochs
wait = 0
best_model_state = None
# Training loop
for epoch in range(200):
    loss = train()
    train_acc, _, _, _ = evaluate(train_data)
    val_acc, _, _, _ = evaluate(val_data)
    test_acc, _, _, _ = evaluate(test_data)

    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model_state = model.state_dict()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch}. Best validation accuracy: {best_val_acc:.4f}")
            break

    if epoch % 20 == 0:
        result = evaluate(test_data)
        print(f'Epoch {epoch}, Loss: {loss:.4f}, Test Accuracy: {result[0]:.4f}')

# Final evaluation
acc, precision, recall, f1 = evaluate(test_data)
print(f'Final Test Accuracy: {acc:.4f}')
print(f'Final Test Precision: {precision:.4f}')
print(f'Final Test Recall: {recall:.4f}')
print(f'Final Test F1: {f1:.4f}')

Epoch 0, Loss: 0.6954, Test Accuracy: 0.5767
Epoch 20, Loss: 0.6508, Test Accuracy: 0.6693
Early stopping at epoch 37. Best validation accuracy: 0.6749
Final Test Accuracy: 0.6651
Final Test Precision: 0.7051
Final Test Recall: 0.5368
Final Test F1: 0.6095


In [35]:
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch.nn as nn

In [36]:
# Split the data into train and temp datasets with a ratio of 6:4
train_data, temp_data, train_labels, temp_labels = train_test_split(
    twitter_data['feature'], label_encoded, test_size=0.4, random_state=seed, stratify=label_encoded
)

# Split the temp dataset into validation and test datasets with a ratio of 2:2
val_data, test_data, val_labels, test_labels = train_test_split(
    temp_data, temp_labels, test_size=0.5, random_state=seed, stratify=temp_labels
)

print(f'Train data size: {len(train_data)}')
print(f'Validation data size: {len(val_data)}')
print(f'Test data size: {len(test_data)}')

Train data size: 6450
Validation data size: 2150
Test data size: 2150


In [37]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [38]:
def tokenizer_function(texts):
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length = 128,
        return_tensors='pt'  
    )

In [39]:
train_encodings = tokenizer_function(train_data.tolist())
val_encodings = tokenizer_function(val_data.tolist())
test_encodings = tokenizer_function(test_data.tolist())

In [40]:
class GenderDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

In [41]:
train_dataset = GenderDataset(train_encodings, train_labels.tolist())
val_dataset = GenderDataset(val_encodings, val_labels.tolist())
test_dataset = GenderDataset(test_encodings, test_labels.tolist())


In [65]:
class CustomBertModel(nn.Module):
    def __init__(self, model_name):
        super(CustomBertModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 1)  # Binary classification
        self.loss_fn = nn.BCEWithLogitsLoss()  # Binary Cross Entropy with Logits Loss

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])  # Use [CLS] token representation
        loss = None
        if labels is not None:
            labels = labels.float().unsqueeze(1)  # Ensure labels are of shape (batch_size, 1)
            loss = self.loss_fn(logits, labels)
        return {'loss': loss, 'logits': logits}

model_name = 'bert-base-uncased'
model = CustomBertModel(model_name).to(device)


In [43]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).numpy().astype(int)  # Convert logits to binary predictions
    labels = labels.astype(int)

    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [70]:
training_args = TrainingArguments(
    output_dir="./bertResults",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
)



In [71]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [72]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5929,0.586744,0.685116,0.725395,0.569656,0.638161
2,0.4837,0.648965,0.671628,0.6368,0.759542,0.692776
3,0.3163,0.770591,0.69907,0.716289,0.633588,0.672405
4,0.1872,1.03945,0.694419,0.679522,0.706107,0.69256
5,0.1459,1.242801,0.695814,0.69428,0.671756,0.682832


TrainOutput(global_step=2020, training_loss=0.357210906543354, metrics={'train_runtime': 803.9622, 'train_samples_per_second': 40.114, 'train_steps_per_second': 2.513, 'total_flos': 0.0, 'train_loss': 0.357210906543354, 'epoch': 5.0})

In [73]:
test_results = trainer.evaluate(test_dataset)
print(f'Test Results: {test_results}')

Test Results: {'eval_loss': 0.7702117562294006, 'eval_accuracy': 0.6911627906976744, 'eval_precision': 0.7042553191489361, 'eval_recall': 0.6316793893129771, 'eval_f1': 0.6659959758551308, 'eval_runtime': 17.9661, 'eval_samples_per_second': 119.67, 'eval_steps_per_second': 7.514, 'epoch': 5.0}


In [78]:
def predict(text):
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt").to(device)
    model.eval()
    with torch.no_grad():
        logits = model(**inputs)['logits']
        probs = torch.sigmoid(logits)
        return {"Male": 1 - probs.item(), "Female": probs.item()}

In [77]:
example_text = "I love watching football and playing video games."
prediction = predict(example_text)
print(f"Predicted Gender Probabilities: {prediction}")

tensor([[0.5931]], device='cuda:0')
Predicted Gender Probabilities: {'Male': 0.4068903923034668, 'Female': 0.5931096076965332}
