In [3]:
import os
import re
import torch
import random
import string
import unicodedata
import contractions
import numpy as np
import pandas as pd
import geopandas as gpd
from io import StringIO
import torch.nn.functional as F
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from  sentence_transformers import SentenceTransformer
from torch_geometric.nn import GCNConv, GATConv,GraphSAGE
from torch_geometric.transforms import RandomNodeSplit, RandomLinkSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
US_regions = {
    "Northeast": ["Maine", "New Hampshire", "Vermont", "Massachusetts", "Rhode Island", "Connecticut", "New York", "New Jersey", "Pennsylvania"],
    "Midwest": ["Ohio", "Indiana", "Illinois", "Michigan", "Wisconsin", "Missouri", "North Dakota", "South Dakota", "Nebraska", "Kansas"],
    "South": ["Delaware", "Maryland", "Virginia", "West Virginia", "North Carolina", "South Carolina", "Georgia", "Florida", "Kentucky", "Tennessee", "Alabama", "Mississippi", "Arkansas", "Louisiana", "Oklahoma", "Texas"],
    "West": ["Montana", "Wyoming", "Colorado", "New Mexico", "Arizona", "Utah", "Idaho", "Nevada", "Washington", "Oregon", "California", "Alaska", "Hawaii"]
}

In [5]:
GEOTEXTDATA_PATH = os.path.join(os.getcwd(), 'GeoText.csv')
GEOTEXT_RAW_DATA_PATH = os.path.join(os.getcwd(), 'GeoText_raw.csv')
print(f'Raw data path: {GEOTEXT_RAW_DATA_PATH}')
print(f'Processed data path: {GEOTEXTDATA_PATH}')

Raw data path: d:\Projects\Machine Learning\Few-Shot-GNN-LLM\data-related\GeoText_raw.csv
Processed data path: d:\Projects\Machine Learning\Few-Shot-GNN-LLM\data-related\GeoText.csv


In [6]:
def text_regularization(text):
    try:
        text = str(text)
        # Convert to lowercase
        text = text.lower()

        # Expand contractions, can't => cannot
        text = contractions.fix(text)

        # Remove punctuations
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Remove special characters
        text = re.sub(r'[^a-z\s]', '', text)

        # Normalize accented characters "café" → "cafe"
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

        # Remove extra white spaces
        text = " ".join(text.split())
        return text
    except Exception as e:
        print('text_regularization', e)
        

In [7]:
class GeoTextDataExtractor:
    def __init__(self):
        try:
            print("Instance Created")
            us_states = gpd.read_file(os.path.join('ne_110m_admin_1_states_provinces', 'ne_110m_admin_1_states_provinces.shp'))
            self.us_states = us_states[us_states['admin'] == "United States of America"]
            self.df = ''
            self.region_bounds_range = {
                "Northeast": self.get_region_lat_long_range("Northeast"),
                "Midwest": self.get_region_lat_long_range("Midwest"),
                "South": self.get_region_lat_long_range("South"),
                "West": self.get_region_lat_long_range("West"),
            }
            self.feature = []
            self.label = []
        except Exception as e:
            print("error", e)
    
    def get_region_lat_long_range(self, region_name):
        try:
            if region_name not in US_regions:
                print(f"Region '{region_name}' not found")
                return

            region_states = US_regions[region_name]
            region_bounds = {"lat_min": float('inf'), "lat_max": float('-inf'), "lon_min": float('inf'), "lon_max": float('-inf')}

            for state_name in region_states:
                state = self.us_states[self.us_states['name'] == state_name]

                if state.empty:
                    return

                minx, miny, maxx, maxy = state.geometry.bounds.iloc[0]

                # Update the region bounding box
                region_bounds["lat_min"] = min(region_bounds["lat_min"], miny)
                region_bounds["lat_max"] = max(region_bounds["lat_max"], maxy)
                region_bounds["lon_min"] = min(region_bounds["lon_min"], minx)
                region_bounds["lon_max"] = max(region_bounds["lon_max"], maxx)

            return region_bounds
        except Exception as e:
            print("error", e)

    def compute_region(self, coord):
        try:
            for region_name, bounding_range in self.region_bounds_range.items():
                latitude, longitude = coord
                lat_min = bounding_range['lat_min']
                lat_max = bounding_range['lat_max']
                lon_min = bounding_range['lon_min']
                lon_max = bounding_range['lon_max']

                if lat_min<=latitude<=lat_max and lon_min<=longitude<=lon_max:
                    return region_name
            return "Unknown"
        except Exception as e:
            print("error", e)
    
    def geo_text_data_extractor(self):
        try:
            # with open(os.path.join(os.getcwd(), 'GeoText_raw.csv'), "r", encoding="ISO-8859-1") as data_source:
            #     first_raw_data = data_source.readline()
            #     self.df = pd.read_csv(StringIO(first_raw_data), sep="\t", header=None, names=["User ID", "Timestamp","Location", "Latitude", "Longitude", "Tweet Content"])
            #     self.compute_region((self.df["Latitude"][0], self.df["Longitude"][0]))
            #     raw_data = data_source.read()
            #     self.df = pd.read_csv(StringIO(raw_data), sep="\t", header=None, names=["User ID", "Timestamp","Location", "Latitude", "Longitude", "Tweet Content"])
            #     self.df["Location"] = self.df.apply(lambda row: self.compute_region((row["Latitude"], row["Longitude"])), axis = 1)
            #     self.df.to_csv(GEOTEXTDATA_PATH, index=False)
            geotext_data = pd.read_csv(GEOTEXT_RAW_DATA_PATH)
            geotext_data['location'] = [self.compute_region(each) for each in zip(geotext_data["latitude"], geotext_data["longitude"])]
            geotext_data.to_csv(GEOTEXTDATA_PATH, index=False)
        except Exception as e:
            print("error", e)

    def preprocess_data(self):
        geotext_data = pd.read_csv(GEOTEXTDATA_PATH)
        geotext_data = geotext_data.dropna()
        geotext_data = geotext_data[geotext_data['location'] != "Unknown"]
        self.feature = [text_regularization(each) for each in geotext_data["tweet"]]
        self.label = geotext_data['location']
        return {
            "feature": self.feature,
            "label": self.label.tolist()
        }

In [8]:

GeoTextDataExtractorInstance = GeoTextDataExtractor()
# data = GeoTextDataExtractorInstance.geo_text_data_extractor()
data = GeoTextDataExtractorInstance.preprocess_data()

Instance Created


In [9]:
feature = data['feature']
label = data['label']

In [10]:
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [11]:
feature_embeddings = sbert_model.encode(feature, convert_to_tensor=True)

In [12]:
len(feature_embeddings)

9455

In [13]:
def build_edges(features, threshold=0.5):
    similarity_matrix = cosine_similarity(features)
    edges = []

    for i in range(len(features)):
        for j in range(i, len(features)):
            if similarity_matrix[i][j] > threshold:
                edges.append((i, j))
                edges.append((j, i))
    return edges

edges = build_edges(feature_embeddings.to("cpu"), threshold=0.8)
edges

[(0, 0),
 (0, 0),
 (1, 1),
 (1, 1),
 (2, 2),
 (2, 2),
 (3, 3),
 (3, 3),
 (4, 4),
 (4, 4),
 (5, 5),
 (5, 5),
 (6, 6),
 (6, 6),
 (7, 7),
 (7, 7),
 (8, 8),
 (8, 8),
 (9, 9),
 (9, 9),
 (10, 10),
 (10, 10),
 (11, 11),
 (11, 11),
 (12, 12),
 (12, 12),
 (13, 13),
 (13, 13),
 (14, 14),
 (14, 14),
 (15, 15),
 (15, 15),
 (16, 16),
 (16, 16),
 (17, 17),
 (17, 17),
 (18, 18),
 (18, 18),
 (19, 19),
 (19, 19),
 (20, 20),
 (20, 20),
 (21, 21),
 (21, 21),
 (22, 22),
 (22, 22),
 (23, 23),
 (23, 23),
 (24, 24),
 (24, 24),
 (25, 25),
 (25, 25),
 (26, 26),
 (26, 26),
 (27, 27),
 (27, 27),
 (28, 28),
 (28, 28),
 (29, 29),
 (29, 29),
 (30, 30),
 (30, 30),
 (31, 31),
 (31, 31),
 (32, 32),
 (32, 32),
 (33, 33),
 (33, 33),
 (34, 34),
 (34, 34),
 (35, 35),
 (35, 35),
 (36, 36),
 (36, 36),
 (37, 37),
 (37, 37),
 (38, 38),
 (38, 38),
 (39, 39),
 (39, 39),
 (40, 40),
 (40, 40),
 (41, 41),
 (41, 41),
 (42, 42),
 (42, 42),
 (43, 43),
 (43, 43),
 (44, 44),
 (44, 44),
 (45, 45),
 (45, 45),
 (46, 46),
 (46, 46),
 (47, 

In [14]:
label_encoder = LabelEncoder()
label_encoded = label_encoder.fit_transform(label)
label_tensor = torch.tensor(label_encoded, dtype=torch.long).unsqueeze(1)
label_tensor

tensor([[3],
        [1],
        [1],
        ...,
        [2],
        [1],
        [2]])

In [15]:
for index, label in enumerate(label_encoder.classes_):
    print(f"{index}: {label}")

0: Midwest
1: Northeast
2: South
3: West


In [16]:
node_features = feature_embeddings
node_features

tensor([[-0.0571, -0.0400,  0.0816,  ..., -0.0082,  0.0355, -0.0031],
        [-0.0653, -0.0836, -0.0041,  ..., -0.0310,  0.0244, -0.0153],
        [-0.0383, -0.0308, -0.0211,  ...,  0.0403, -0.0469,  0.0323],
        ...,
        [-0.0338, -0.0603, -0.0540,  ..., -0.0078,  0.0151, -0.0263],
        [-0.0274, -0.0523,  0.0061,  ...,  0.0044,  0.0096, -0.0644],
        [-0.0379, -0.0054, -0.0246,  ...,  0.0471, -0.0181,  0.0179]],
       device='cuda:0')

In [17]:
edge_index = torch.tensor(edges, dtype=torch.long)
edge_index

tensor([[   0,    0],
        [   0,    0],
        [   1,    1],
        ...,
        [9453, 9453],
        [9454, 9454],
        [9454, 9454]])

In [18]:
node_label = label_tensor

In [19]:
edge_index = edge_index.t().contiguous()

In [20]:
graph_data = Data(x=node_features, edge_index=edge_index, y=node_label)
graph_data

Data(x=[9455, 384], edge_index=[2, 18960], y=[9455, 1])

In [21]:
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

splitter = RandomNodeSplit(split="train_rest", num_val=0.2, num_test=0.2)
graph_data = splitter(graph_data)


In [22]:
# Access masks for each set
train_mask = graph_data.train_mask
val_mask = graph_data.val_mask
test_mask = graph_data.test_mask


In [23]:
graph_data.edge_index

tensor([[   0,    0,    1,  ..., 9453, 9454, 9454],
        [   0,    0,    1,  ..., 9453, 9454, 9454]])

In [24]:
train_nodes = train_mask.nonzero().flatten()
val_nodes = val_mask.nonzero().flatten()
test_nodes = test_mask.nonzero().flatten()
print(train_nodes.shape, val_nodes.shape, test_nodes.shape)

torch.Size([5673]) torch.Size([1891]) torch.Size([1891])


In [25]:

edge_splitter = RandomLinkSplit(num_val=0.2, num_test=0.2)
train_data, val_data, test_data = edge_splitter(graph_data)

In [26]:
print(train_data.x)
print(val_data)
print(test_data)

tensor([[-0.0571, -0.0400,  0.0816,  ..., -0.0082,  0.0355, -0.0031],
        [-0.0653, -0.0836, -0.0041,  ..., -0.0310,  0.0244, -0.0153],
        [-0.0383, -0.0308, -0.0211,  ...,  0.0403, -0.0469,  0.0323],
        ...,
        [-0.0338, -0.0603, -0.0540,  ..., -0.0078,  0.0151, -0.0263],
        [-0.0274, -0.0523,  0.0061,  ...,  0.0044,  0.0096, -0.0644],
        [-0.0379, -0.0054, -0.0246,  ...,  0.0471, -0.0181,  0.0179]],
       device='cuda:0')
Data(x=[9455, 384], edge_index=[2, 11376], y=[9455, 1], train_mask=[9455], val_mask=[9455], test_mask=[9455], edge_label=[7584], edge_label_index=[2, 7584])
Data(x=[9455, 384], edge_index=[2, 15168], y=[9455, 1], train_mask=[9455], val_mask=[9455], test_mask=[9455], edge_label=[7584], edge_label_index=[2, 7584])


In [27]:
input_dim = graph_data.num_node_features
hidden_dim = 16
output_dim = len(label_encoder.classes_)


In [28]:
# Define the Graph Neural Network model
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)
model = GCN(input_dim, hidden_dim, output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
loss_fn = torch.nn.CrossEntropyLoss()  # Multi Class classification loss

In [30]:
# Training function
def train(model):
    model.train()
    optimizer.zero_grad()
    out = model(train_data.x, train_data.edge_index)
    loss = loss_fn(out[train_data.train_mask], train_data.y[train_data.train_mask].squeeze())
    # loss = F.nll_loss(out[train_data.train_mask], train_data.y[train_data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

In [31]:
def evaluate(model, data):
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():  # Disable gradient computation
        
        # Forward pass
        logits = model(data.x, data.edge_index)
        
        # Convert logits to probabilities using sigmoid
        probs = torch.softmax(logits, dim=1)
        
        # Convert probabilities to binary predictions (0 or 1)
        preds = probs.argmax(dim=1).cpu().numpy()
        
        # Mask the predictions and true labels to only include test data
        preds = preds[data.test_mask.cpu().numpy()]
        y_true = data.y[data.test_mask].cpu().numpy()
        # Compute accuracy
        acc = accuracy_score(y_true, preds)
        precision = precision_score(y_true, preds, average="weighted", zero_division=1)
        recall = recall_score(y_true, preds, average="weighted")
        f1 = f1_score(y_true, preds, average="macro")
        return acc, precision, recall, f1

In [32]:
epochs = 200
best_val_acc = 0
patience = 10  # Stop if validation accuracy does not improve for 10 epochs
wait = 0
best_model_state = None
# Training loop
for epoch in range(200):
    loss = train(model)
    train_acc, _, _, _ = evaluate(model, train_data)
    val_acc, _, _, _ = evaluate(model, val_data)

    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model_state = model.state_dict()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch}. Best validation accuracy: {best_val_acc:.4f}")
            break

    if epoch % 20 == 0:
        result = evaluate(model, test_data)
        print(f'Epoch {epoch}, Loss: {loss:.4f}, Test Accuracy: {result[0]:.4f}')

# Final evaluation
acc, precision, recall, f1 = evaluate(model, test_data)
print(f'Final Test Accuracy: {acc:.4f}')
print(f'Final Test Precision: {precision:.4f}')
print(f'Final Test Recall: {recall:.4f}')
print(f'Final Test F1: {f1:.4f}')

Epoch 0, Loss: 1.3871, Test Accuracy: 0.4120
Epoch 20, Loss: 1.2616, Test Accuracy: 0.4384
Early stopping at epoch 39. Best validation accuracy: 0.4432
Final Test Accuracy: 0.4384
Final Test Precision: 0.5649
Final Test Recall: 0.4384
Final Test F1: 0.2185


In [33]:
# Define GAT Model for Binary Classification
class GAT(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, heads=8):
        super(GAT, self).__init__()
        # First GAT layer (multi-head attention)
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=0.6)
        # Second GAT layer (single-head for binary output)
        self.conv2 = GATConv(hidden_dim * heads, output_dim, heads=1, concat=False, dropout=0.6)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.elu(x)  # Use ELU activation function
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


In [34]:
input_dim = train_data.num_node_features
hidden_dim = 64
output_dim = len(label_encoder.classes_)


In [35]:
# Initialize the GAT model
gat_model = GAT(input_dim, hidden_dim, output_dim).to(device)

optimizer = torch.optim.Adam(gat_model.parameters(), lr=0.005, weight_decay=5e-4)
loss_fn = torch.nn.CrossEntropyLoss()  # Multi Class classification loss

In [36]:
epochs = 200
best_val_acc = 0
patience = 10  # Stop if validation accuracy does not improve for 10 epochs
wait = 0
best_model_state = None
# Training loop
for epoch in range(200):
    loss = train(gat_model)
    train_acc, _, _, _ = evaluate(gat_model, train_data)
    val_acc, _, _, _ = evaluate(gat_model, val_data)

    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model_state = model.state_dict()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch}. Best validation accuracy: {best_val_acc:.4f}")
            break

    if epoch % 20 == 0:
        result = evaluate(gat_model, test_data)
        print(f'Epoch {epoch}, Loss: {loss:.4f}, Test Accuracy: {result[0]:.4f}')

# Final evaluation
acc, precision, recall, f1 = evaluate(gat_model, test_data)
print(f'Final Test Accuracy: {acc:.4f}')
print(f'Final Test Precision: {precision:.4f}')
print(f'Final Test Recall: {recall:.4f}')
print(f'Final Test F1: {f1:.4f}')

Epoch 0, Loss: 1.4144, Test Accuracy: 0.4252
Early stopping at epoch 20. Best validation accuracy: 0.4400
Final Test Accuracy: 0.4363
Final Test Precision: 0.3824
Final Test Recall: 0.4363
Final Test F1: 0.2146


In [37]:
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer, Trainer, TrainingArguments, DistilBertModel, DistilBertTokenizer, RobertaModel, RobertaTokenizer, EarlyStoppingCallback
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch.nn as nn

In [38]:
# Split the data into train and temp datasets with a ratio of 6:4
train_data, temp_data, train_labels, temp_labels = train_test_split(
    feature, label_encoded, test_size=0.4, random_state=seed, stratify=label_encoded
)

# Split the temp dataset into validation and test datasets with a ratio of 2:2
val_data, test_data, val_labels, test_labels = train_test_split(
    temp_data, temp_labels, test_size=0.5, random_state=seed, stratify=temp_labels
)

print(f'Train data size: {len(train_data)}')
print(f'Validation data size: {len(val_data)}')
print(f'Test data size: {len(test_data)}')

Train data size: 5673
Validation data size: 1891
Test data size: 1891


In [39]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.


In [40]:
def tokenizer_function(texts, tokenizer):
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length = 128,
        return_tensors='pt'  
    )

In [41]:
train_encodings = tokenizer_function(train_data, tokenizer)
val_encodings = tokenizer_function(val_data, tokenizer)
test_encodings = tokenizer_function(test_data, tokenizer)

In [42]:
class GeoTextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

In [43]:
train_dataset = GeoTextDataset(train_encodings, train_labels.tolist())
val_dataset = GeoTextDataset(val_encodings, val_labels.tolist())
test_dataset = GeoTextDataset(test_encodings, test_labels.tolist())


In [44]:
class CustomBertModel(nn.Module):
    def __init__(self, pre_trained_model):
        super(CustomBertModel, self).__init__()
        self.bert = pre_trained_model
        self.classifier = nn.Linear(self.bert.config.hidden_size, len(label_encoder.classes_))  
        self.loss_fn = nn.CrossEntropyLoss()  # Multi Class classification loss
        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])  # Use [CLS] token representation
        loss = None
        if labels is not None:
            labels = labels.long()
            loss = self.loss_fn(logits, labels)
        return {'loss': loss, 'logits': logits}

pre_trained_bert_model = BertModel.from_pretrained('bert-base-uncased')
model = CustomBertModel(pre_trained_bert_model).to(device)


In [45]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, zero_division=1, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


In [46]:
training_args = TrainingArguments(
    output_dir="./geotext-bertResults",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    # warmup_ratio=0.06,  # Prevent aggressive weight updates early
    # gradient_accumulation_steps=2,  # Simulate larger batch without increasing memory
    # fp16=True,  # Use mixed precision training if available
)

In [47]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [48]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2354,1.235615,0.43046,0.706942,0.267365,0.20093
2,1.1991,1.227285,0.443152,0.467011,0.293244,0.249402
3,1.1239,1.269137,0.420412,0.319195,0.312894,0.304615
4,0.9912,1.30867,0.42147,0.31473,0.298766,0.285754
5,0.8364,1.461734,0.424114,0.323451,0.310608,0.302107


TrainOutput(global_step=1775, training_loss=1.0805637284399758, metrics={'train_runtime': 768.7777, 'train_samples_per_second': 73.792, 'train_steps_per_second': 4.618, 'total_flos': 0.0, 'train_loss': 1.0805637284399758, 'epoch': 5.0})

In [49]:

test_results = trainer.evaluate(test_dataset)
print(f'Test Results: {test_results}')

Test Results: {'eval_loss': 1.2189521789550781, 'eval_accuracy': 0.4537281861448969, 'eval_precision': 0.721756313861577, 'eval_recall': 0.30086623937096346, 'eval_f1': 0.2569393835760324, 'eval_runtime': 16.1602, 'eval_samples_per_second': 117.016, 'eval_steps_per_second': 7.364, 'epoch': 5.0}


In [50]:
def predict(text, model):
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt").to(device)
    model.eval()
    with torch.no_grad():
        logits = model(**inputs)['logits']
        probs = torch.softmax(logits, dim=1)
        predicted_class = probs.argmax(dim=1).item()
        predicted_class = label_encoder.inverse_transform([predicted_class])[0]
        return predicted_class


In [51]:
example_text = "I love watching football and playing video games."
prediction = predict(example_text, model)
print(f"Predicted Gender Probabilities: {prediction}")

Predicted Gender Probabilities: Northeast


In [52]:
train_encodings_distilbert = tokenizer_function(train_data, distilbert_tokenizer)
val_encodings_distilbert = tokenizer_function(val_data, distilbert_tokenizer)
test_encodings_distilbert = tokenizer_function(test_data, distilbert_tokenizer)

In [53]:

pre_trained_distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
distilbert_model = CustomBertModel(pre_trained_bert_model).to(device)

In [54]:
distilbert_training_args = TrainingArguments(
    output_dir="./geotext-distilbertResults",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
)

In [55]:
distilbert_trainer = Trainer(
    model=distilbert_model,
    args=distilbert_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]

)

In [56]:
distilbert_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1248,1.255912,0.453199,0.368976,0.315506,0.291192
2,0.7939,1.647996,0.407721,0.339147,0.307153,0.292397
3,0.6805,1.937828,0.39926,0.308587,0.297982,0.277604
4,0.4005,2.310466,0.395029,0.317908,0.311917,0.308353


TrainOutput(global_step=1420, training_loss=0.7571277534458, metrics={'train_runtime': 635.9689, 'train_samples_per_second': 89.202, 'train_steps_per_second': 5.582, 'total_flos': 0.0, 'train_loss': 0.7571277534458, 'epoch': 4.0})

In [57]:
distilbert_test_results = distilbert_trainer.evaluate(test_dataset)
print(f'Test Results: {distilbert_test_results}')

Test Results: {'eval_loss': 1.2512311935424805, 'eval_accuracy': 0.45690111052353255, 'eval_precision': 0.4079763558844771, 'eval_recall': 0.3224358971655647, 'eval_f1': 0.30369412156452913, 'eval_runtime': 16.34, 'eval_samples_per_second': 115.728, 'eval_steps_per_second': 7.283, 'epoch': 4.0}


In [58]:
train_encodings_roberta = tokenizer_function(train_data, roberta_tokenizer)
val_encodings_roberta = tokenizer_function(val_data, roberta_tokenizer)
test_encodings_roberta = tokenizer_function(test_data, roberta_tokenizer)

In [59]:

pre_trained_roberta_model = RobertaModel.from_pretrained('roberta-base')
roberta_model = CustomBertModel(pre_trained_roberta_model).to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
roberta_training_args = TrainingArguments(
    output_dir="./geotext-robertaResults",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
)

In [61]:
roberta_trainer = Trainer(
    model=roberta_model,
    args=roberta_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]

)

In [62]:
roberta_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.245,1.262962,0.414595,0.853649,0.25,0.146542
2,1.262,1.263888,0.414595,0.853649,0.25,0.146542
3,1.2914,1.261969,0.414595,0.853649,0.25,0.146542
4,1.2801,1.261093,0.414595,0.853649,0.25,0.146542


TrainOutput(global_step=1420, training_loss=1.271571818875595, metrics={'train_runtime': 639.6061, 'train_samples_per_second': 88.695, 'train_steps_per_second': 5.55, 'total_flos': 0.0, 'train_loss': 1.271571818875595, 'epoch': 4.0})

In [63]:
roberta_test_results = roberta_trainer.evaluate(test_dataset)
print(f'Test Results: {roberta_test_results}')

Test Results: {'eval_loss': 1.2629201412200928, 'eval_accuracy': 0.41459545214172394, 'eval_precision': 0.853648863035431, 'eval_recall': 0.25, 'eval_f1': 0.14654205607476636, 'eval_runtime': 16.6177, 'eval_samples_per_second': 113.794, 'eval_steps_per_second': 7.161, 'epoch': 4.0}


In [64]:
example_text = "I love watching football and playing video games."
prediction = predict(example_text, model)
print(f"Predicted Gender Probabilities: {prediction}")

Predicted Gender Probabilities: West
