In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, confusion_matrix
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [3]:
# Loading the Excel file
df = pd.read_excel('negative_sentiments.xlsx')


In [4]:
# Separating labeled and unlabeled data
predict_df = df[df['Label'].isna()]
train_df = df.dropna(subset=['Label'])


In [5]:
# Tokenizer and model loading
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'logits_proj.bias', 'sequence_summary.summary.weight', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Tokenizing and preparing DataLoader for training
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }



In [7]:
# Splitting the training data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['Cleaned_Tweet'].values,
    train_df['Label'].values,
    test_size=0.1,
    random_state=42
)


In [8]:
# Creating datasets and DataLoader for training
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


In [9]:
# Training the XLNET model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
epochs = 100

best_val_loss = float('inf')
patience = 5  
counter = 0

for epoch in range(epochs):
    # Training loop
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()

    # Validation loop
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            loss = torch.nn.functional.cross_entropy(logits, labels)

            val_loss += loss.item()

    # Calculating average validation loss
    avg_val_loss = val_loss / len(val_loader)

    # Checking for early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
    else:
        counter += 1

    if counter >= patience:
        print(f'Early stopping after {epoch + 1} epochs without improvement.')
        break

Epoch 0: 100%|█████████████████████████████████████████████████████████████████████████| 17/17 [00:10<00:00,  1.60it/s]
Validation: 100%|████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.25it/s]
Epoch 1: 100%|█████████████████████████████████████████████████████████████████████████| 17/17 [00:07<00:00,  2.39it/s]
Validation: 100%|████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.18it/s]
Epoch 2: 100%|█████████████████████████████████████████████████████████████████████████| 17/17 [00:07<00:00,  2.39it/s]
Validation: 100%|████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.18it/s]
Epoch 3: 100%|█████████████████████████████████████████████████████████████████████████| 17/17 [00:07<00:00,  2.39it/s]
Validation: 100%|████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.19it/s]
Epoch 4: 100%|██████████████████████████

Early stopping after 17 epochs without improvement.





In [11]:
# Validation of the model
model.eval()
val_preds = []
val_true = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Validation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        val_preds.extend(preds.cpu().numpy())
        val_true.extend(labels.cpu().numpy())


print(classification_report(val_true, val_preds))

Validation: 100%|████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.18it/s]

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        15
           1       0.00      0.00      0.00         1

    accuracy                           0.94        16
   macro avg       0.47      0.50      0.48        16
weighted avg       0.88      0.94      0.91        16




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Calculating Mean Squared Error (MSE)
mse = mean_squared_error(val_true, val_preds)
print(f"Mean Squared Error (MSE): {mse:.4f}")

# Calculating Mean Absolute Error (MAE)
mae = mean_absolute_error(val_true, val_preds)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Calculating R-squared (R2) score
r2 = r2_score(val_true, val_preds)
print(f"R-squared (R2) Score: {r2:.4f}")

Mean Squared Error (MSE): 0.0625
Mean Absolute Error (MAE): 0.0625
R-squared (R2) Score: -0.0667


# Testing the model predictions

In [13]:
# loading file with data for prediction
new_df = pd.read_excel('Labeled_Tweets.xlsx')

In [14]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        
        if self.labels is not None:
            label = int(self.labels[idx])
        else:
            label = None

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long) if label is not None else None
        }


In [15]:
# Assuming 'Cleaned_Tweet' is the column containing the text data for prediction
new_texts = new_df['Cleaned_Tweet'].values

# Tokenize and prepare DataLoader for prediction
new_dataset = CustomDataset(new_texts, labels=None, tokenizer=tokenizer, max_len=128)
new_loader = DataLoader(new_dataset, batch_size=8, shuffle=False, collate_fn=lambda x: x)

# Make predictions
model.eval()
predictions = []

with torch.no_grad():
    for batch in tqdm(new_loader, desc="Predicting"):
        input_ids = torch.stack([item['input_ids'] for item in batch]).to(device)
        attention_mask = torch.stack([item['attention_mask'] for item in batch]).to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        predictions.extend(preds.cpu().numpy())

Predicting: 100%|████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.36it/s]


In [16]:
# Adding predictions to the DataFrame
new_df['XLNET_Prediction'] = predictions

In [17]:
new_df

Unnamed: 0,User,Tweet,Cleaned_Tweet,Negative,Neutral,Positive,Sentiment,Label,XLNET_Prediction
0,Bev,@GyAncient @nige_gallop @CCLeeFreeman @Cleeccs...,bagger amazed knew walking football result yes...,0.519025,0.442044,0.038931,negative,0,0
1,Nigel 💙,@GyAncient @BevskiMids @CCLeeFreeman @Cleeccsc...,blimey mick old dog,0.677478,0.285934,0.036588,negative,0,0
2,Nigel 💙,@BevskiMids @CCLeeFreeman @Cleeccsc @Humberbea...,wondered rd right obvs due play rare midweek g...,0.76398,0.222234,0.013785,negative,0,0
3,Iain Joseph Gorry*,@Cleeccsc @JoRobbo68 @Humberbeat @HumbersideFi...,guy tweet date wrong think,0.664363,0.322601,0.013036,negative,0,0
4,North East Lincolnshire Council,What are the biggest crime issues in North Eas...,biggest crime issue north east lincolnshire te...,0.662119,0.320535,0.017346,negative,0,0
5,South Yorkshire Fire,We spent weeks tackling a fire on Hatfield Moo...,spent week tackling fire hatfield moor despera...,0.560314,0.336407,0.103279,negative,1,1
6,Humberside Police - North East Lincolnshire,#Grimsby #Willows Attended an incident tonight...,attended incident tonight binbrook way group y...,0.889538,0.104743,0.005718,negative,1,0
7,Safer Roads Humber,A fire safety message today. With more people ...,fire safety message today people easy overload...,0.514286,0.462393,0.023321,negative,0,0
8,North Lincs Council,"Just because it's warm outside, it doesn't mea...",warm outside doesnt mean warm underwater cold ...,0.698401,0.289254,0.012345,negative,0,0
9,DC_LK1989,Unfortunately I’m going to say no...our street...,unfortunately im going say noour street would ...,0.875694,0.117654,0.006652,negative,0,0


In [18]:
new_df['XLNET_Prediction'].value_counts()

XLNET_Prediction
0    52
1     6
Name: count, dtype: int64

In [19]:
new_df['Label'].value_counts()

Label
0    46
1    12
Name: count, dtype: int64

In [20]:
# Calculating metrics
accuracy = accuracy_score(new_df['Label'], new_df['XLNET_Prediction'])
precision = precision_score(new_df['Label'], new_df['XLNET_Prediction'])
recall = recall_score(new_df['Label'], new_df['XLNET_Prediction'])
f1 = f1_score(new_df['Label'], new_df['XLNET_Prediction'])
roc_auc = roc_auc_score(new_df['Label'], new_df['XLNET_Prediction'])

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {roc_auc}")


Accuracy: 0.8620689655172413
Precision: 0.8333333333333334
Recall: 0.4166666666666667
F1 Score: 0.5555555555555556
AUC-ROC: 0.697463768115942


In [21]:
# Creating a confusion matrix
conf_matrix = confusion_matrix(new_df['Label'], new_df['XLNET_Prediction'])

# Extracting TP, TN, FP, FN from the confusion matrix
TP = conf_matrix[1, 1]
TN = conf_matrix[0, 0]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]

# Calculating the number of wrong predictions
wrong_predictions = FP + FN

print(f"Number of wrong predictions: {wrong_predictions}")

Number of wrong predictions: 8


In [22]:
# Sample list of sentences
sentences = ["Had a wonderful time in hull today","there is a fire in the south street we need the your assistance @HumbersideFire", "The kids are lighting fireworks in pearson park it is really dangerous", "I see smoke coming from the paragon station", "Some teenager are jumping of the bridge into the water","There is no incident in the beverly road"]

# Tokenizing the list of sentences
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to('cuda')

# Forward pass through the model
outputs = model(**inputs)

# Applying softmax to get predictions
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

# Converting predictions to numpy array
predictions = predictions.cpu().detach().numpy()

# Getting the predicted labels
predicted_labels = [np.argmax(pred) for pred in predictions]

# Getting the corresponding probabilities
probs = [pred[label] for pred, label in zip(predictions, predicted_labels)]


for sentence, label, prob in zip(sentences, predicted_labels, probs):
    print(f"Sentence: {sentence}")
    print(f"Predicted Label: {label}")
    print(f"Probability: {prob:.4f}")
    print()


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Sentence: Had a wonderful time in hull today
Predicted Label: 0
Probability: 0.9995

Sentence: there is a fire in the south street we need the your assistance @HumbersideFire
Predicted Label: 0
Probability: 0.9728

Sentence: The kids are lighting fireworks in pearson park it is really dangerous
Predicted Label: 0
Probability: 0.9803

Sentence: I see smoke coming from the paragon station
Predicted Label: 0
Probability: 0.9773

Sentence: Some teenager are jumping of the bridge into the water
Predicted Label: 0
Probability: 0.9963

Sentence: There is no incident in the beverly road
Predicted Label: 0
Probability: 0.9986

