In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader, random_split
import torch
import pandas as pd

In [10]:
path = r'/content/drive/My Drive/tweets_df.csv'
df = pd.read_csv(path)

In [7]:
df

Unnamed: 0.1,Unnamed: 0,tweet,still_exists,in_english,sarcasm,additional_comments,antisemitism_rating,disagree_with,sentiment_rating,calling_out,is_about_the_holocaust,ihra_section
0,0,Palestinian Media Outlet Explains a Way to Bla...,True,True,False,No comments.,1,False,3,True,False,13
1,1,💛 Share This: #lol The Australian dream of kik...,True,True,False,bot and gibberish but the most likely interpre...,4,False,2,False,False,0
2,2,💢 FYI #TIL The limit of kikes arguably should ...,True,True,False,bot and gibberish but the most likely interpre...,4,False,2,False,False,0
3,3,Lithuania Drafts Law That Would Make It Imposs...,False,True,False,No comments.,3,False,3,False,False,13
4,4,@TheDruzeFRA_BEY @MENA_WATCH MENA Watch ist ja...,True,False,False,No comments.,5,False,1,False,False,10
...,...,...,...,...,...,...,...,...,...,...,...,...
1000,1000,So if an antifa member was sent to jail for th...,True,True,False,No comments.,1,False,3,True,False,13
1001,1001,🗾 Trust me on this: #lol The problem with kike...,True,True,False,bot and gibberish but the most likely interpre...,4,False,2,False,False,0
1002,1002,"But Emma, as a Jewish girl, has no issue when ...",True,True,False,not covered by IHRA: Jews and Whites are seen ...,2,True,3,False,False,13
1003,1003,"Kikes taqueria in Salem, Oregon got em",True,False,False,No comments.,3,False,3,False,False,13


In [4]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m113.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.9 MB/s[0m eta [36m0:00:

In [11]:
df['antisemitism_rating'] = df['antisemitism_rating'].apply(lambda x :'anti' if x >2 else 'noanti')

In [12]:
df['combined_'] = df['tweet'] + " " + df['additional_comments']

In [13]:
df_ = df[['combined_','antisemitism_rating']]

df_



Unnamed: 0,combined_,antisemitism_rating
0,Palestinian Media Outlet Explains a Way to Bla...,noanti
1,💛 Share This: #lol The Australian dream of kik...,anti
2,💢 FYI #TIL The limit of kikes arguably should ...,anti
3,Lithuania Drafts Law That Would Make It Imposs...,anti
4,@TheDruzeFRA_BEY @MENA_WATCH MENA Watch ist ja...,anti
...,...,...
1000,So if an antifa member was sent to jail for th...,noanti
1001,🗾 Trust me on this: #lol The problem with kike...,anti
1002,"But Emma, as a Jewish girl, has no issue when ...",noanti
1003,"Kikes taqueria in Salem, Oregon got em No comm...",anti


In [14]:
class HateSpeechDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = {key: torch.tensor(val) for key, val in encodings.items()}
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

def transform_label(label):
    if label == 'hate':
        return 1
    else:
        return 0

# Load the dataset

# Preprocessing
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

df_['antisemitism_rating'] = df_['antisemitism_rating'].apply(transform_label)

# Apply the transformations
encodings = tokenizer(df_['combined_'].tolist(), truncation=True, padding=True)
labels = df_['antisemitism_rating'].tolist()

# Define the Dataset
dataset = HateSpeechDataset(encodings, labels)

# Split the data
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Use the random_split function to split dataset into 2 parts of the desired length
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Use DataLoader to handle batching of the data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)

# Define the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_['antisemitism_rating'] = df_['antisemitism_rating'].apply(transform_label)


Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from sklearn.metrics import accuracy_score

epochs = 1

# Training loop
for epoch in tqdm(range(epochs), desc="Epochs"):
    total_loss = 0
    train_true_labels = []
    train_pred_labels = []

    model.train()  # Ensure the model is in training mode
    for batch in tqdm(train_loader, desc="Training Batches"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].squeeze()
        attention_mask = batch['attention_mask'].squeeze()
        labels = batch['labels']

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Get the predicted labels by taking the argmax of the output logits
        preds = torch.argmax(outputs.logits, dim=1)
        train_true_labels += labels.tolist()
        train_pred_labels += preds.tolist()

        loss.backward()
        optimizer.step()

    train_accuracy = accuracy_score(train_true_labels, train_pred_labels)
    print(f"Epoch: {epoch+1}, Loss: {total_loss/len(train_loader)}, Training Accuracy: {train_accuracy}")

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    val_true_labels = []
    val_pred_labels = []

    with torch.no_grad():  # No need to track gradients in validation mode
        for batch in tqdm(val_loader, desc="Validation Batches"):
            input_ids = batch['input_ids'].squeeze()
            attention_mask = batch['attention_mask'].squeeze()
            labels = batch['labels']

            outputs = model(input_ids, attention_mask=attention_mask)

            # Get the predicted labels by taking the argmax of the output logits
            preds = torch.argmax(outputs.logits, dim=1)
            val_true_labels += labels.tolist()
            val_pred_labels += preds.tolist()

    # Calculate the accuracy
    val_accuracy = accuracy_score(val_true_labels, val_pred_labels)
    print(f"Validation Accuracy: {val_accuracy}")


Epochs:   0%|          | 0/1 [00:00<?, ?it/s]
Training Batches:   0%|          | 0/51 [00:00<?, ?it/s][A
Training Batches:   2%|▏         | 1/51 [00:02<02:09,  2.59s/it][A
Training Batches:   4%|▍         | 2/51 [00:05<02:15,  2.76s/it][A
Training Batches:   6%|▌         | 3/51 [00:07<01:54,  2.39s/it][A
Training Batches:   8%|▊         | 4/51 [00:09<01:39,  2.11s/it][A
Training Batches:  10%|▉         | 5/51 [00:10<01:30,  1.97s/it][A
Training Batches:  12%|█▏        | 6/51 [00:12<01:24,  1.87s/it][A
Training Batches:  14%|█▎        | 7/51 [00:14<01:19,  1.80s/it][A
Training Batches:  16%|█▌        | 8/51 [00:15<01:15,  1.75s/it][A
Training Batches:  18%|█▊        | 9/51 [00:18<01:26,  2.06s/it][A
Training Batches:  20%|█▉        | 10/51 [00:20<01:21,  1.99s/it][A
Training Batches:  22%|██▏       | 11/51 [00:22<01:15,  1.90s/it][A
Training Batches:  24%|██▎       | 12/51 [00:23<01:10,  1.81s/it][A
Training Batches:  25%|██▌       | 13/51 [00:25<01:06,  1.76s/it][A
Traini

Epoch: 1, Loss: 0.012045929765365287, Training Accuracy: 1.0



Validation Batches:   0%|          | 0/13 [00:00<?, ?it/s][A
Validation Batches:   8%|▊         | 1/13 [00:00<00:06,  1.95it/s][A
Validation Batches:  15%|█▌        | 2/13 [00:01<00:05,  1.92it/s][A
Validation Batches:  23%|██▎       | 3/13 [00:01<00:05,  1.90it/s][A
Validation Batches:  31%|███       | 4/13 [00:02<00:04,  1.90it/s][A
Validation Batches:  38%|███▊      | 5/13 [00:02<00:04,  1.88it/s][A
Validation Batches:  46%|████▌     | 6/13 [00:03<00:03,  1.89it/s][A
Validation Batches:  54%|█████▍    | 7/13 [00:03<00:03,  1.87it/s][A
Validation Batches:  62%|██████▏   | 8/13 [00:04<00:02,  1.86it/s][A
Validation Batches:  69%|██████▉   | 9/13 [00:04<00:02,  1.86it/s][A
Validation Batches:  77%|███████▋  | 10/13 [00:05<00:01,  1.88it/s][A
Validation Batches:  85%|████████▍ | 11/13 [00:05<00:01,  1.88it/s][A
Validation Batches:  92%|█████████▏| 12/13 [00:06<00:00,  1.88it/s][A
Validation Batches: 100%|██████████| 13/13 [00:06<00:00,  1.94it/s]
Epochs: 100%|██████████| 1

Validation Accuracy: 1.0



