In [1]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [2]:
#!pip install transformers requests beautifulsoup4 pandas numpy

In [3]:
#!pip install sentencepiece

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px
from tqdm.auto import tqdm 


from nltk.corpus import stopwords 
import re 
from collections import Counter
from string import punctuation
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
from sklearn.metrics import confusion_matrix

from gensim.models import Word2Vec
from nltk.tokenize import TweetTokenizer
import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential 
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.optimizers import Adam

lb = LabelEncoder()

In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [6]:
df = pd.read_csv('twitter_training.csv')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [8]:
df['Positive'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [9]:
df = df.drop_duplicates(subset=['im getting on borderlands and i will murder you all ,'])

In [10]:
df = df.sample(frac=0.5, random_state=42)

In [11]:
df= df.rename(columns={"Borderlands":"Feature2","im getting on borderlands and i will murder you all ,":"Feature1","Positive": "labels"})

In [12]:
df["tweets"]= df["Feature1"].astype(str) +" "+ df["Feature2"].astype(str)
df= df.drop(["Feature1","Feature2"],axis=1)


In [13]:
df['tweet_len'] = [len(text.split()) for text in df.tweets]

In [14]:
df = df[~(df['tweet_len'] < 5) & ~(df['tweet_len'] > 60)]

In [15]:
df_labels = {key : value for value , key in enumerate(np.unique(df['labels']))}
df_labels

{'Irrelevant': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3}

In [16]:
# Assuming 'labels' is the column name in your DataFrame
df = df[df['labels'] != 'Irrelevant']

In [17]:
df['labels'].value_counts()

labels
Negative    9805
Positive    8738
Neutral     8080
Name: count, dtype: int64

In [18]:
df

Unnamed: 0,2401,labels,tweets,tweet_len
38713,5442,Positive,Thanks to @ Kain0025 for the raid. Thanks to @...,44
24996,4692,Neutral,How not to get bored about every damn thing in...,12
45892,11877,Neutral,This comes as Facebook faces major criticism f...,19
68307,3697,Neutral,I'and d rather a delayed game than a broken di...,12
19912,12608,Neutral,yeah its alright for fighting some raid here i...,32
...,...,...,...,...
57886,11526,Neutral,You guys are missing both not seeing my squad ...,15
14818,2938,Positive,I've been at a lot of unranked in DotA 2. (SEA...,35
1640,2688,Positive,All 3 of these are me! I wore Tannis and Hands...,14
38544,5411,Neutral,"Ouch, the Pain Zone powered by The Nuclear Arc...",22


In [19]:
# Preprocessing the tweets
def preprocess_tweet(tweet):

    # Removing the mentions
    tweet = re.sub(r'@[\w]+', '', tweet)

    # Removing hashtags
    tweet = re.sub(r'#\w+', '', tweet)
    
    # Removing URLs
    tweet = re.sub(r'https?://\S+', '', tweet)
    
    # Remove non-alphabetic characters
    tweet = re.sub(r'[^a-zA-Z\s]', '', tweet)
    
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(tweet)

    stopwords = set(nltk.corpus.stopwords.words('english'))
    tokens = [token.lower() for token in tokens if token.lower() not in stopwords]

    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

df['Cleaned_Tweet'] = df['tweets'].apply(preprocess_tweet)

In [20]:
# Initialize LabelEncoder
lb = LabelEncoder()

# Fit and transform labels
df['labels'] = lb.fit_transform(df['labels'].values)


In [21]:
df['labels'].unique()

array([2, 1, 0])

In [22]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])  # Ensure labels are already converted to numerical format
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [23]:
fraction = 0.8
torch.cuda.set_per_process_memory_fraction(fraction)

In [24]:
# Tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=3)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Split the training data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Cleaned_Tweet'].values,
    df['labels'].values,
    test_size=0.1,
    random_state=42
)

In [26]:
# Create datasets and DataLoader for training
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

In [27]:
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [28]:
# Move the model to the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
epochs = 25

for epoch in range(epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()

Epoch 0: 100%|█████████████████████████████████████████████████████████████████████| 5990/5990 [24:03<00:00,  4.15it/s]
Epoch 1: 100%|█████████████████████████████████████████████████████████████████████| 5990/5990 [23:58<00:00,  4.17it/s]
Epoch 2: 100%|█████████████████████████████████████████████████████████████████████| 5990/5990 [23:56<00:00,  4.17it/s]
Epoch 3: 100%|█████████████████████████████████████████████████████████████████████| 5990/5990 [23:57<00:00,  4.17it/s]
Epoch 4: 100%|█████████████████████████████████████████████████████████████████████| 5990/5990 [23:56<00:00,  4.17it/s]
Epoch 5: 100%|█████████████████████████████████████████████████████████████████████| 5990/5990 [23:56<00:00,  4.17it/s]
Epoch 6: 100%|█████████████████████████████████████████████████████████████████████| 5990/5990 [23:55<00:00,  4.17it/s]
Epoch 7: 100%|█████████████████████████████████████████████████████████████████████| 5990/5990 [23:56<00:00,  4.17it/s]
Epoch 8: 100%|██████████████████████████

In [29]:
# Validation
model.eval()
val_preds = []
val_true = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Validation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        val_preds.extend(preds.cpu().numpy())
        val_true.extend(labels.cpu().numpy())

# Print classification report
print(classification_report(val_true, val_preds))


Validation: 100%|████████████████████████████████████████████████████████████████████| 666/666 [00:53<00:00, 12.47it/s]

              precision    recall  f1-score   support

           0       0.91      0.90      0.91       987
           1       0.90      0.89      0.90       820
           2       0.88      0.91      0.89       856

    accuracy                           0.90      2663
   macro avg       0.90      0.90      0.90      2663
weighted avg       0.90      0.90      0.90      2663






In [30]:
# Save the model
model.save_pretrained('XLNET_sentiment_model')

# Save the tokenizer
tokenizer.save_pretrained('XLNET_sentiment_tokenizer')

('XLNET_sentiment_tokenizer\\tokenizer_config.json',
 'XLNET_sentiment_tokenizer\\special_tokens_map.json',
 'XLNET_sentiment_tokenizer\\spiece.model',
 'XLNET_sentiment_tokenizer\\added_tokens.json')

# XlNET Model for Risk Detection 

In [5]:
# Loading the Excel file
df_new = pd.read_excel('negative_sentiments.xlsx')

In [6]:
# Separating labeled and unlabeled data
predict_df_new = df_new[df_new['Label'].isna()]
train_df_new = df_new.dropna(subset=['Label'])

In [7]:
# Tokenizer and model loading
tokenizer_new = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model_new = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Tokenizing and preparing DataLoader for training
class CustomDatasetNew(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [9]:
# Splitting the training data into train and validation sets
train_texts_new, val_texts_new, train_labels_new, val_labels_new = train_test_split(
    train_df_new['Cleaned_Tweet'].values,
    train_df_new['Label'].values,
    test_size=0.1,
    random_state=42
)

In [10]:
# Creating datasets and DataLoader for training
train_dataset_new = CustomDatasetNew(train_texts_new, train_labels_new, tokenizer_new)
val_dataset_new = CustomDatasetNew(val_texts_new, val_labels_new, tokenizer_new)

train_loader_new = DataLoader(train_dataset_new, batch_size=8, shuffle=True)
val_loader_new = DataLoader(val_dataset_new, batch_size=8, shuffle=False)

In [11]:
# Training
device_new = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_new.to(device_new)

optimizer_new = torch.optim.AdamW(model_new.parameters(), lr=2e-5)
epochs_new = 100

best_val_loss_new = float('inf')
patience_new = 5  
counter_new = 0

for epoch_new in range(epochs_new):
    # Training loop
    model_new.train()
    for batch_new in tqdm(train_loader_new, desc=f"Epoch {epoch_new}"):
        input_ids_new = batch_new['input_ids'].to(device_new)
        attention_mask_new = batch_new['attention_mask'].to(device_new)
        labels_new = batch_new['label'].to(device_new)

        optimizer_new.zero_grad()

        outputs_new = model_new(input_ids_new, attention_mask=attention_mask_new, labels=labels_new)
        loss_new = outputs_new.loss
        loss_new.backward()

        optimizer_new.step()

    # Validation loop
    model_new.eval()
    val_loss_new = 0.0
    with torch.no_grad():
        for batch_new in tqdm(val_loader_new, desc="Validation"):
            input_ids_new = batch_new['input_ids'].to(device_new)
            attention_mask_new = batch_new['attention_mask'].to(device_new)
            labels_new = batch_new['label'].to(device_new)

            outputs_new = model_new(input_ids_new, attention_mask=attention_mask_new)
            logits_new = outputs_new.logits
            preds_new = torch.argmax(logits_new, dim=1)
            loss_new = torch.nn.functional.cross_entropy(logits_new, labels_new)

            val_loss_new += loss_new.item()

    # Calculating average validation loss
    avg_val_loss_new = val_loss_new / len(val_loader_new)

    # Checking for early stopping
    if avg_val_loss_new < best_val_loss_new:
        best_val_loss_new = avg_val_loss_new
        counter_new = 0
    else:
        counter_new += 1

    if counter_new >= patience_new:
        print(f'Early stopping after {epoch_new + 1} epochs without improvement.')
        break


Epoch 0:   0%|          | 0/17 [00:00<?, ?it/s]

Validation:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/17 [00:00<?, ?it/s]

Validation:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 2:   0%|          | 0/17 [00:00<?, ?it/s]

Validation:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 3:   0%|          | 0/17 [00:00<?, ?it/s]

Validation:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 4:   0%|          | 0/17 [00:00<?, ?it/s]

Validation:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 5:   0%|          | 0/17 [00:00<?, ?it/s]

Validation:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 6:   0%|          | 0/17 [00:00<?, ?it/s]

Validation:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 7:   0%|          | 0/17 [00:00<?, ?it/s]

Validation:   0%|          | 0/2 [00:00<?, ?it/s]

Early stopping after 8 epochs without improvement.


In [12]:
# Validation of the model
model_new.eval()
val_preds_new = []
val_true_new = []

with torch.no_grad():
    for batch_new in tqdm(val_loader_new, desc="Validation"):
        input_ids_new = batch_new['input_ids'].to(device_new)
        attention_mask_new = batch_new['attention_mask'].to(device_new)
        labels_new = batch_new['label'].to(device_new)

        outputs_new = model_new(input_ids_new, attention_mask=attention_mask_new)
        logits_new = outputs_new.logits
        preds_new = torch.argmax(logits_new, dim=1)

        val_preds_new.extend(preds_new.cpu().numpy())
        val_true_new.extend(labels_new.cpu().numpy())

print(classification_report(val_true_new, val_preds_new))

Validation:   0%|          | 0/2 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        15
           1       0.00      0.00      0.00         1

    accuracy                           0.94        16
   macro avg       0.47      0.50      0.48        16
weighted avg       0.88      0.94      0.91        16



In [13]:
# Calculating Mean Squared Error (MSE)
mse_new = mean_squared_error(val_true_new, val_preds_new)
print(f"Mean Squared Error (MSE): {mse_new:.4f}")

# Calculating Mean Absolute Error (MAE)
mae_new = mean_absolute_error(val_true_new, val_preds_new)
print(f"Mean Absolute Error (MAE): {mae_new:.4f}")

# Calculating R-squared (R2) score
r2_new = r2_score(val_true_new, val_preds_new)
print(f"R-squared (R2) Score: {r2_new:.4f}")


Mean Squared Error (MSE): 0.0625
Mean Absolute Error (MAE): 0.0625
R-squared (R2) Score: -0.0667


In [14]:
# Sample list of sentences
sentences_new = ["Had a wonderful time in hull today","there is a fire in the south street we need the your assistance @HumbersideFire", "The kids are lighting fireworks in pearson park it is really dangerous", "I see smoke coming from the paragon station", "Some teenager are jumping of the bridge into the water","There is no incident in the beverly road"]

# Tokenize the list of sentences
inputs_new = tokenizer_new(sentences_new, padding=True, truncation=True, return_tensors='pt').to('cuda')

# Forward pass through the model
outputs_new = model_new(**inputs_new)

# Apply softmax to get predictions
predictions_new = torch.nn.functional.softmax(outputs_new.logits, dim=-1)

# Convert predictions to numpy array
predictions_new = predictions_new.cpu().detach().numpy()

# Get the predicted labels
predicted_labels_new = [np.argmax(pred) for pred in predictions_new]

# Get the corresponding probabilities
probs_new = [pred[label] for pred, label in zip(predictions_new, predicted_labels_new)]

# Print results
for sentence_new, label_new, prob_new in zip(sentences_new, predicted_labels_new, probs_new):
    print(f"Sentence: {sentence_new}")
    print(f"Predicted Label: {label_new}")
    print(f"Probability: {prob_new:.4f}")
    print()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Sentence: Had a wonderful time in hull today
Predicted Label: 0
Probability: 0.9510

Sentence: there is a fire in the south street we need the your assistance @HumbersideFire
Predicted Label: 0
Probability: 0.9837

Sentence: The kids are lighting fireworks in pearson park it is really dangerous
Predicted Label: 0
Probability: 0.8169

Sentence: I see smoke coming from the paragon station
Predicted Label: 0
Probability: 0.9306

Sentence: Some teenager are jumping of the bridge into the water
Predicted Label: 0
Probability: 0.9757

Sentence: There is no incident in the beverly road
Predicted Label: 0
Probability: 0.8482



In [17]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        
        if self.labels is not None:
            label = int(self.labels[idx])
        else:
            label = None

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long) if label is not None else None
        }


In [15]:
# loading file with data for prediction
new_df = pd.read_excel('Labeled_Tweets.xlsx')

In [23]:
# Assuming 'Cleaned_Tweet' is the column containing the text data for prediction
new_texts = new_df['Cleaned_Tweet'].values

# Tokenize and prepare DataLoader for prediction
new_dataset = CustomDataset(new_texts, labels=None, tokenizer=tokenizer_new, max_len=128)
new_loader = DataLoader(new_dataset, batch_size=8, shuffle=False, collate_fn=lambda x: x)

# Make predictions
model_new.eval()
predictions = []

with torch.no_grad():
    for batch in tqdm(new_loader, desc="Predicting"):
        input_ids = torch.stack([item['input_ids'] for item in batch]).to(device_new)
        attention_mask = torch.stack([item['attention_mask'] for item in batch]).to(device_new)

        outputs = model_new(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        predictions.extend(preds.cpu().numpy())

Predicting:   0%|          | 0/8 [00:00<?, ?it/s]

In [24]:
# Add predictions to the DataFrame
new_df['XLNET_Prediction'] = predictions

In [25]:
new_df

Unnamed: 0,User,Tweet,Cleaned_Tweet,Negative,Neutral,Positive,Sentiment,Label,XLNET_Prediction
0,Bev,@GyAncient @nige_gallop @CCLeeFreeman @Cleeccs...,bagger amazed knew walking football result yes...,0.519025,0.442044,0.038931,negative,0,0
1,Nigel 💙,@GyAncient @BevskiMids @CCLeeFreeman @Cleeccsc...,blimey mick old dog,0.677478,0.285934,0.036588,negative,0,0
2,Nigel 💙,@BevskiMids @CCLeeFreeman @Cleeccsc @Humberbea...,wondered rd right obvs due play rare midweek g...,0.76398,0.222234,0.013785,negative,0,0
3,Iain Joseph Gorry*,@Cleeccsc @JoRobbo68 @Humberbeat @HumbersideFi...,guy tweet date wrong think,0.664363,0.322601,0.013036,negative,0,0
4,North East Lincolnshire Council,What are the biggest crime issues in North Eas...,biggest crime issue north east lincolnshire te...,0.662119,0.320535,0.017346,negative,0,0
5,South Yorkshire Fire,We spent weeks tackling a fire on Hatfield Moo...,spent week tackling fire hatfield moor despera...,0.560314,0.336407,0.103279,negative,1,1
6,Humberside Police - North East Lincolnshire,#Grimsby #Willows Attended an incident tonight...,attended incident tonight binbrook way group y...,0.889538,0.104743,0.005718,negative,1,0
7,Safer Roads Humber,A fire safety message today. With more people ...,fire safety message today people easy overload...,0.514286,0.462393,0.023321,negative,0,0
8,North Lincs Council,"Just because it's warm outside, it doesn't mea...",warm outside doesnt mean warm underwater cold ...,0.698401,0.289254,0.012345,negative,0,0
9,DC_LK1989,Unfortunately I’m going to say no...our street...,unfortunately im going say noour street would ...,0.875694,0.117654,0.006652,negative,0,0
