<h3>Importing Necessary Packages</h3>

In [6]:
import re
import os
import csv
import torch
import random
import string
import numpy as np
import unicodedata
import contractions
import pandas as pd
from io import StringIO
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from torch_geometric.transforms import RandomNodeSplit, RandomLinkSplit
from torch_geometric.nn import GCNConv, GATConv,GraphSAGE
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

<h3>Importing raw twitter data</h3>

In [7]:
TWITTER_RAW_DATA = os.path.join(os.getcwd(), "TwitterDataset.csv")
TWITTER_RAW_DATA

'd:\\Projects\\Machine Learning\\Few-Shot-GNN-LLM\\data-related\\TwitterDataset.csv'

<h3>Text regularization</h3>

In [8]:
def text_regularization(text):
    try:
        text = str(text)
        # Convert to lowercase
        text = text.lower()

        # Expand contractions, can't => cannot
        text = contractions.fix(text)

        # Remove punctuations
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Remove special characters
        text = re.sub(r'[^a-z\s]', '', text)

        # Normalize accented characters "café" → "cafe"
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

        # Remove extra white spaces
        text = " ".join(text.split())
        return text
    except Exception as e:
        print('text_regularization', e)
        

<h3>Preprocessing raw twitter data to find features and labels</h3>

In [9]:
def preprocess_data(dataset_path = TWITTER_RAW_DATA):
    try:
        twitter_raw_data = pd.read_csv(dataset_path, encoding="ISO-8859-1")
        twitter_data = twitter_raw_data[twitter_raw_data["gender:confidence"] >= 0.5]
        columns_to_keep = ['gender', 'description']
        columns_to_drop = [col for col in twitter_data if col not in columns_to_keep]
        twitter_data = twitter_data.drop(columns=columns_to_drop)
        twitter_data = twitter_data.dropna()
        twitter_data = twitter_data[twitter_data['gender'].isin(['male', 'female'])]
        feature = twitter_data['description']
        feature = [text_regularization(each) for each in feature]
        label = twitter_data['gender'].to_list()
        return [feature, label]
    except Exception as e:
        print('preprocess_data', e)
data = preprocess_data()

<h3>Path for pre processed twitter data</>

In [10]:
TWITTER_PROCESSED_DATA = os.path.join(os.getcwd(), "TwitterProcessedDataset.csv")

<h3>Storing pre processed twitter data</h3>

In [11]:
df = pd.DataFrame(data[0], columns=['feature'])
df['label'] = data[1]
df.to_csv(TWITTER_PROCESSED_DATA, index=False)

<h3>Reading pre processed twitter data</h3>

In [12]:
twitter_data = pd.read_csv(TWITTER_PROCESSED_DATA)  
twitter_data = twitter_data.dropna()

<h3>Sentence Bert for feature embedding</h3>

In [13]:
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [14]:
feature_embeddings  = sbert_model.encode(twitter_data['feature'].tolist(), convert_to_tensor=True)

In [15]:
feature_embeddings

tensor([[-0.0030, -0.0812,  0.0170,  ...,  0.0391,  0.0038, -0.1770],
        [ 0.0254, -0.0611,  0.0312,  ..., -0.0036,  0.0272, -0.0790],
        [ 0.0242, -0.0003,  0.0522,  ..., -0.0283, -0.0319,  0.0106],
        ...,
        [-0.0991,  0.0148,  0.0329,  ...,  0.0250,  0.0063, -0.0639],
        [-0.0535, -0.0089,  0.0194,  ...,  0.1068, -0.0364,  0.0584],
        [-0.0138,  0.0522,  0.0515,  ...,  0.0227, -0.0691, -0.0231]])

<h3>Building edges using cosine similarity to generate graph</h3>

In [16]:
def build_edges(features, threshold=0.5):
    similarity_matrix = cosine_similarity(features)
    edges = []

    for i in range(len(features)):
        for j in range(i, len(features)):
            if similarity_matrix[i][j] > threshold:
                edges.append((i, j))
                edges.append((j, i))
    return edges

edges = build_edges(feature_embeddings, threshold=0.8)
edges

[(0, 0),
 (0, 0),
 (1, 1),
 (1, 1),
 (2, 2),
 (2, 2),
 (3, 3),
 (3, 3),
 (4, 4),
 (4, 4),
 (5, 5),
 (5, 5),
 (6, 6),
 (6, 6),
 (7, 7),
 (7, 7),
 (8, 8),
 (8, 8),
 (9, 9),
 (9, 9),
 (10, 10),
 (10, 10),
 (11, 11),
 (11, 11),
 (12, 12),
 (12, 12),
 (13, 13),
 (13, 13),
 (14, 14),
 (14, 14),
 (15, 15),
 (15, 15),
 (16, 16),
 (16, 16),
 (17, 17),
 (17, 17),
 (18, 18),
 (18, 18),
 (19, 19),
 (19, 19),
 (20, 20),
 (20, 20),
 (21, 21),
 (21, 21),
 (22, 22),
 (22, 22),
 (23, 23),
 (23, 23),
 (24, 24),
 (24, 24),
 (25, 25),
 (25, 25),
 (26, 26),
 (26, 26),
 (27, 27),
 (27, 27),
 (28, 28),
 (28, 28),
 (29, 29),
 (29, 29),
 (30, 30),
 (30, 30),
 (31, 31),
 (31, 31),
 (32, 32),
 (32, 32),
 (33, 33),
 (33, 33),
 (34, 34),
 (34, 34),
 (35, 35),
 (35, 35),
 (36, 36),
 (36, 36),
 (37, 37),
 (37, 37),
 (38, 38),
 (38, 38),
 (39, 39),
 (39, 39),
 (40, 40),
 (40, 40),
 (41, 41),
 (41, 41),
 (42, 42),
 (42, 42),
 (42, 4540),
 (4540, 42),
 (43, 43),
 (43, 43),
 (44, 44),
 (44, 44),
 (45, 45),
 (45, 45),
 (

<h3>Converting labels to numeric value</h3>

In [17]:
label_encoder = LabelEncoder()
label_encoded = label_encoder.fit_transform(twitter_data['label'].tolist())
label_tensor = torch.tensor(label_encoded, dtype=torch.long).unsqueeze(1)
label_tensor

tensor([[1],
        [1],
        [1],
        ...,
        [1],
        [0],
        [0]])

In [18]:
node_features = feature_embeddings
node_features

tensor([[-0.0030, -0.0812,  0.0170,  ...,  0.0391,  0.0038, -0.1770],
        [ 0.0254, -0.0611,  0.0312,  ..., -0.0036,  0.0272, -0.0790],
        [ 0.0242, -0.0003,  0.0522,  ..., -0.0283, -0.0319,  0.0106],
        ...,
        [-0.0991,  0.0148,  0.0329,  ...,  0.0250,  0.0063, -0.0639],
        [-0.0535, -0.0089,  0.0194,  ...,  0.1068, -0.0364,  0.0584],
        [-0.0138,  0.0522,  0.0515,  ...,  0.0227, -0.0691, -0.0231]])

<h3>Converting edges to edge index to make graph data</h3>

In [19]:
edge_index = torch.tensor(edges, dtype=torch.long)
edge_index

tensor([[    0,     0],
        [    0,     0],
        [    1,     1],
        ...,
        [10748, 10748],
        [10749, 10749],
        [10749, 10749]])

In [20]:
node_label = label_tensor

In [21]:
edge_index = edge_index.t().contiguous()

<h3>Making graph data to train and evaluate</h3>

In [22]:
graph_data = Data(x=node_features, edge_index=edge_index, y=node_label)
graph_data

Data(x=[10750, 384], edge_index=[2, 23808], y=[10750, 1])

<h3>Splitting data in to train, validation and test</h3>

In [23]:
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

splitter = RandomNodeSplit(split="train_rest", num_val=0.2, num_test=0.2)
graph_data = splitter(graph_data)


In [24]:
# Access masks for each set
train_mask = graph_data.train_mask
val_mask = graph_data.val_mask
test_mask = graph_data.test_mask


In [25]:
graph_data.edge_index

tensor([[    0,     0,     1,  ..., 10748, 10749, 10749],
        [    0,     0,     1,  ..., 10748, 10749, 10749]])

In [26]:
train_nodes = train_mask.nonzero().flatten()
val_nodes = val_mask.nonzero().flatten()
test_nodes = test_mask.nonzero().flatten()
print(train_nodes.shape, val_nodes.shape, test_nodes.shape)

torch.Size([6450]) torch.Size([2150]) torch.Size([2150])


<h3>Splitting edges to train, validation and test accourding to train, validation and test nodes</h3>

In [27]:

edge_splitter = RandomLinkSplit(num_val=0.2, num_test=0.2)
train_data, val_data, test_data = edge_splitter(graph_data)

In [28]:
print(train_data)
print(val_data)
print(test_data)

Data(x=[10750, 384], edge_index=[2, 14286], y=[10750, 1], train_mask=[10750], val_mask=[10750], test_mask=[10750], edge_label=[28572], edge_label_index=[2, 28572])
Data(x=[10750, 384], edge_index=[2, 14286], y=[10750, 1], train_mask=[10750], val_mask=[10750], test_mask=[10750], edge_label=[9522], edge_label_index=[2, 9522])
Data(x=[10750, 384], edge_index=[2, 19047], y=[10750, 1], train_mask=[10750], val_mask=[10750], test_mask=[10750], edge_label=[9522], edge_label_index=[2, 9522])


<h3>Parameters for GCNConv model</h3>

In [29]:
input_dim = graph_data.num_node_features
hidden_dim = 16
output_dim = 1


<h3>GCNConv model to evaluate twitter data</h3>

In [30]:
# Define the Graph Neural Network model
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x.view(-1)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)
model = GCN(input_dim, hidden_dim, output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
loss_fn = torch.nn.BCEWithLogitsLoss()  # Binary classification loss

False


<h3>Training function, calculating loss</h3>

In [32]:
# Training function
def train():
    model.train()
    optimizer.zero_grad()
    out = model(train_data.x, train_data.edge_index)
    loss = loss_fn(out[train_data.train_mask], train_data.y[train_data.train_mask].squeeze().float())
    # loss = F.nll_loss(out[train_data.train_mask], train_data.y[train_data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

<h3>Evaluating model and calculating Accuracy, precision, recall and f1 score</h3>

In [None]:
def evaluate(data):
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():  # Disable gradient computation
        
        # Forward pass
        logits = model(data.x, data.edge_index)
        
        # Convert logits to probabilities using sigmoid
        probs = torch.sigmoid(logits)
        
        # Convert probabilities to binary predictions (0 or 1)
        preds = (probs > 0.5).cpu().numpy()
        
        preds = preds[data.test_mask.cpu().numpy()]

        y_true = data.y[data.test_mask].squeeze().cpu().numpy()        
        # Compute accuracy
        acc = accuracy_score(y_true, preds)
        precision = precision_score(y_true, preds, average="binary")
        recall = recall_score(y_true, preds, average="binary")
        f1 = f1_score(y_true, preds, average="binary")
        return acc, precision, recall, f1

<h3>Training for multiple epochs</h3>

In [None]:
epochs = 200
best_val_acc = 0
patience = 10  # Stop if validation accuracy does not improve for 10 epochs
wait = 0
best_model_state = None
# Training loop
for epoch in range(200):
    loss = train()
    train_acc, _, _, _ = evaluate(train_data)
    val_acc, _, _, _ = evaluate(val_data)
    test_acc, _, _, _ = evaluate(test_data)

    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model_state = model.state_dict()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch}. Best validation accuracy: {best_val_acc:.4f}")
            break

    if epoch % 20 == 0:
        result = evaluate()
        print(f'Epoch {epoch}, Loss: {loss:.4f}, Test Accuracy: {result[0]:.4f}')

# Final evaluation
acc, precision, recall, f1 = evaluate()
print(f'Final Test Accuracy: {acc:.4f}')
print(f'Final Test Precision: {precision:.4f}')
print(f'Final Test Recall: {recall:.4f}')
print(f'Final Test F1: {f1:.4f}')

Epoch 0, Loss: 0.6948, Test Accuracy: 0.5837
Epoch 20, Loss: 0.5922, Test Accuracy: 0.6642
Epoch 40, Loss: 0.5719, Test Accuracy: 0.6684
Epoch 60, Loss: 0.5614, Test Accuracy: 0.6679
Epoch 80, Loss: 0.5516, Test Accuracy: 0.6726
Epoch 100, Loss: 0.5405, Test Accuracy: 0.6772
Epoch 120, Loss: 0.5292, Test Accuracy: 0.6786
Epoch 140, Loss: 0.5243, Test Accuracy: 0.6800
Epoch 160, Loss: 0.5182, Test Accuracy: 0.6800
Epoch 180, Loss: 0.5080, Test Accuracy: 0.6735
Final Test Accuracy: 0.6753
Final Test Precision: 0.6862
Final Test Recall: 0.6141
Final Test F1: 0.6482


<h3>Class for GAT model</h3>

In [35]:
# Define GAT Model for Binary Classification
class GAT(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, heads=8):
        super(GAT, self).__init__()
        # First GAT layer (multi-head attention)
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=0.6)
        # Second GAT layer (single-head for binary output)
        self.conv2 = GATConv(hidden_channels * heads, 1, heads=1, concat=False, dropout=0.6)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.elu(x)  # Use ELU activation function
        x = self.conv2(x, edge_index)
        return x.view(-1)  # Output raw logits (no sigmoid here)


<h3>Parameters for GAT model</h3>

In [36]:
input_dim = train_data.num_node_features
hidden_dim = 64

In [37]:
# Initialize GAT model
model = GAT(in_channels=input_dim, hidden_channels=hidden_dim).to(device)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
loss_fn = torch.nn.BCEWithLogitsLoss()

<h3>Training for multiple epochs</h3>

In [38]:
# Training loop
for epoch in range(200):
    loss = train()
    if epoch % 20 == 0:
        result = evaluate()
        print(f'Epoch {epoch}, Loss: {loss:.4f}, Test Accuracy: {result[0]:.4f}')

# Final evaluation
acc, precision, recall, f1 = evaluate()
print(f'Final Test Accuracy: {acc:.4f}')
print(f'Final Test Precision: {precision:.4f}')
print(f'Final Test Recall: {recall:.4f}')
print(f'Final Test F1: {f1:.4f}')

Epoch 0, Loss: 0.6947, Test Accuracy: 0.5953
Epoch 20, Loss: 0.6500, Test Accuracy: 0.6665
Epoch 40, Loss: 0.6492, Test Accuracy: 0.6619
Epoch 60, Loss: 0.6489, Test Accuracy: 0.6628
Epoch 80, Loss: 0.6482, Test Accuracy: 0.6670
Epoch 100, Loss: 0.6477, Test Accuracy: 0.6647
Epoch 120, Loss: 0.6523, Test Accuracy: 0.6660
Epoch 140, Loss: 0.6485, Test Accuracy: 0.6647
Epoch 160, Loss: 0.6475, Test Accuracy: 0.6674
Epoch 180, Loss: 0.6439, Test Accuracy: 0.6670
Final Test Accuracy: 0.6712
Final Test Precision: 0.7088
Final Test Recall: 0.5511
Final Test F1: 0.6201


In [91]:
import torch.nn as nn
import torch.optim as optim 
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split

In [40]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [41]:
tokens = tokenizer(twitter_data['feature'].tolist(), padding=True, truncation=True, return_tensors='pt')


In [42]:
label_tensor

tensor([[1],
        [1],
        [1],
        ...,
        [1],
        [0],
        [0]])

In [43]:
dataset = TensorDataset(tokens["input_ids"], tokens["attention_mask"], label_tensor)

In [44]:
generator = torch.Generator().manual_seed(seed)


In [45]:
train_dataset, val_dataset, test_dataset = random_split(dataset, [0.6, 0.2, 0.2], generator=generator)

In [46]:
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=True)

In [52]:
class BertGenderClassifier(nn.Module):
    def __init__(self):
        super(BertGenderClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.3)  # Regularization
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)  # Binary classification (male/female)
        self.activation = nn.Sigmoid()  # Sigmoid activation
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # [batch_size, hidden_size]
        x = self.dropout(pooled_output)
        x = self.fc(x)
        x = self.activation(x)
        return x

# Initialize model
model = BertGenderClassifier()

In [65]:
def train_model(model, train_loader, val_loader, epochs=4):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=2e-5)

    for epoch in range(epochs):
        model.train()
        total_loss, total_correct, processed_data = 0, 0, 0
        
        for input_ids, attention_mask, labels in train_loader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_correct += (outputs.argmax(dim=1) == labels).sum().item()
            processed_data += len(input_ids)
            print(f'Data processed: {processed_data} in total data: {len(train_loader)}')

        train_acc = total_correct / len(train_loader.dataset)
        val_acc = evaluate(model, val_loader, device)

        print(f"Epoch {epoch+1}: Loss={total_loss:.4f}, Train Acc={train_acc:.4f}, Val Acc={val_acc:.4f}")

def evaluate(model, loader, device):
    model.eval()
    correct = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask)
            correct += (outputs.argmax(dim=1) == labels).sum().item()
    return correct / len(loader.dataset)

# Train the model
train_model(model, train_dataloader, val_dataloader, epochs=4)


Data processed: 8 in total data: 807
Data processed: 16 in total data: 807
Data processed: 24 in total data: 807
Data processed: 32 in total data: 807
Data processed: 40 in total data: 807
Data processed: 48 in total data: 807
Data processed: 56 in total data: 807
Data processed: 64 in total data: 807
Data processed: 72 in total data: 807
Data processed: 80 in total data: 807
Data processed: 88 in total data: 807
Data processed: 96 in total data: 807
Data processed: 104 in total data: 807
Data processed: 112 in total data: 807
Data processed: 120 in total data: 807
Data processed: 128 in total data: 807
Data processed: 136 in total data: 807
Data processed: 144 in total data: 807
Data processed: 152 in total data: 807
Data processed: 160 in total data: 807
Data processed: 168 in total data: 807
Data processed: 176 in total data: 807
Data processed: 184 in total data: 807
Data processed: 192 in total data: 807
Data processed: 200 in total data: 807
Data processed: 208 in total data: 807

KeyboardInterrupt: 

In [66]:
bert_model = BertModel.from_pretrained('bert-base-uncased', num_labels=2) 

In [93]:
model = model.to(device)

In [95]:
from transformers import TFBertForSequenceClassification, Trainer, TFTrainingArguments


In [96]:
training_args = TFTrainingArguments(
    output_dir='./results',          
    num_train_epochs=7,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=1e-5,               
    logging_dir='./logs',            
    eval_steps=100                   
)

In [97]:
trainer = Trainer(
    model=model,                 
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset,            
)

AttributeError: 'NoneType' object has no attribute 'to_dict'