# 1. Import required libraries

In [None]:
import pandas as pd
import numpy as np
import io 

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 2. Import Dataset

In [None]:
# Load the tweet dataset from the mounted Drive folder; the CSV is decoded as latin-1.
df = pd.read_csv('/content/drive/MyDrive/Datasets/dataset_2_2.csv', encoding='latin-1')

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,Location,Text,Sentiment,Likes,quotes,replies,retweet
0,0,NewYork,Woke criminal New York Times brings back Covid...,Negative,1.0,0.0,1.0,0.0
1,1,NewYork,He proven New York liberal narcissist hired wo...,Negative,1.0,0.0,0.0,0.0
2,2,NewYork,CBS2 News At 11 New York leader say mask vacci...,Neutral,5.0,0.0,0.0,3.0
3,3,NewYork,Pediatric COVID Deaths New York,Neutral,3.0,0.0,0.0,1.0
4,4,NewYork,The 7 day rolling average new Covid hospitaliz...,Negative,2.0,0.0,0.0,0.0


# 3. Dataset Preprocessing

In [None]:
# Remove the leftover 'Unnamed: 0' index column from the frame, in place.
df.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
# Integer class id for each sentiment string (Negative=0, Neutral=1, Positive=2).
sent_dict = dict(Neutral=1, Negative=0, Positive=2)

In [None]:
# Keep only the three columns the rest of the notebook reads.
df = df[['Text','Sentiment', 'Location']]

In [None]:
df['Sentiment'].value_counts()

Neutral     144366
Negative    132071
Positive    113166
Name: Sentiment, dtype: int64

In [None]:
# Replace sentiment strings with their integer ids from sent_dict; any value that is
# not a key of sent_dict becomes NaN (rows with NaN are removed by the later dropna cell).
df['Sentiment'] = df['Sentiment'].map(sent_dict)

In [None]:
df.head()

Unnamed: 0,Text,Sentiment,Location
0,Woke criminal New York Times brings back Covid...,0,NewYork
1,He proven New York liberal narcissist hired wo...,0,NewYork
2,CBS2 News At 11 New York leader say mask vacci...,1,NewYork
3,Pediatric COVID Deaths New York,1,NewYork
4,The 7 day rolling average new Covid hospitaliz...,0,NewYork


In [None]:
# Rename the target column to 'labels', the name the train/test split cell reads.
df = df.rename(columns = {"Sentiment": "labels"})

In [None]:
# Discard rows containing any missing value, then renumber the index 0..n-1.
df = df.dropna().reset_index(drop=True)

# 4. Separate according to region - datasets

In [None]:
# Distinct location names in order of first appearance, and the list that will
# receive one DataFrame per location.
locations = df['Location'].unique().tolist()
location_dataset = []

In [None]:
# Restrict the experiment to the first six locations in the list.
locations = locations[:6]

In [None]:
# Build one frame per location, capped at that location's first 10,000 rows.
for location in locations:
  data = df.loc[df['Location'] == location].head(10000)
  location_dataset.append(data)

In [None]:
for location in location_dataset:
  print(type(location))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


# 5. Train-test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Per-location 80/20 split. Each list receives one entry per location, in the
# same order as location_dataset.
train_texts_list = []
train_labels_list = []
test_texts_list = []
test_labels_list = []

# Fixed seed so that Restart & Run All reproduces the same split (and metrics).
SPLIT_SEED = 42

for location in location_dataset:
  # NOTE: train_texts / test_texts / train_labels / test_labels stay bound at
  # module level after the loop (holding the last location's split);
  # CustomBERTModel reads the global `train_labels` for its class weights.
  train_texts, test_texts, train_labels, test_labels = train_test_split(
      location['Text'], location['labels'], test_size=.2, random_state=SPLIT_SEED)

  train_texts_list.append(train_texts)
  test_texts_list.append(test_texts)
  train_labels_list.append(train_labels)
  test_labels_list.append(test_labels)


In [None]:
def reseting_index(df):
  """Renumber the index of ``df`` to 0..n-1 in place, discarding the old index.

  The object passed in is modified directly. The return value is whatever the
  in-place ``reset_index`` call returns, i.e. ``None``.
  """
  outcome = df.reset_index(drop=True, inplace=True)
  return outcome

In [None]:
# Give every training-text Series a 0..n-1 index (in place).
# The loop variable is intentionally not named `df`, so the main DataFrame `df`
# is not overwritten and earlier cells can still be re-run.
for train_text_series in train_texts_list:
  reseting_index(train_text_series)

In [None]:
# Give every training-label Series a 0..n-1 index (in place) so positional
# lookups by the dataset class line up with the tokenized texts.
# The loop variable is intentionally not named `df`, so the main DataFrame `df`
# is not overwritten.
for train_label_series in train_labels_list:
  reseting_index(train_label_series)

In [None]:
# Pool the held-out texts of all locations into one flat list (location order preserved).
# `extend` appends in place instead of rebuilding the list on every iteration, and the
# loop variable is not named `df` so the main DataFrame is not overwritten.
common_df = []
for test_text_series in test_texts_list:
  common_df.extend(test_text_series)

In [None]:
# Pool the held-out labels of all locations into one flat list, in the same order
# as the pooled texts. `extend` appends in place instead of rebuilding the list on
# every iteration, and the loop variable is not named `df` so the main DataFrame
# is not overwritten.
common_df_labels = []
for test_label_series in test_labels_list:
  common_df_labels.extend(test_label_series)

In [None]:
# Wrap the pooled test texts in a one-column DataFrame. This rebinds `test_texts`,
# which until now held the last location's split from the train/test loop.
test_texts = pd.DataFrame({'Text':common_df})

In [None]:
# Wrap the pooled test labels in a one-column DataFrame. This rebinds `test_labels`,
# which until now held the last location's split from the train/test loop.
test_labels = pd.DataFrame({'labels':common_df_labels})

# 6. Import tokenizer and model

In [None]:
# Installs whatever transformers release is current; the recorded run below resolved to 4.18.0.
# NOTE(review): later cells import `AdamW` from transformers - confirm the release installed
# today still exports it, or pin the version used for the recorded results.
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 65.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.9 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 74.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 59.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: p

In [None]:
# NOTE(review): TFAutoModel is imported here but not referenced by any cell shown in this notebook.
from transformers import AutoModel,AutoTokenizer,TFAutoModel

# Tokenizer shared by every tokenization call in the notebook.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): this module-level `model` is never read by the cells shown here
# (CustomBERTModel loads its own encoder, and later functions bind a local `model`),
# so this load looks redundant - confirm before removing.
model = AutoModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def tokenize_location_datasets(train_texts_list):
  """Tokenize each per-location text collection with the module-level ``tokenizer``.

  Every element of ``train_texts_list`` is turned into a list and encoded with
  truncation, ``max_length=256`` and ``padding=True``.

  Returns:
    A list with one tokenizer output per input collection, in input order.
  """
  return [
      tokenizer(list(texts), truncation=True, max_length=256, padding=True)
      for texts in train_texts_list
  ]

In [None]:
# Encode every location's training texts and the pooled test texts with the same
# tokenizer settings (truncation, max_length=256, padding=True).
train_encodings_list = tokenize_location_datasets(train_texts_list)
test_encodings = tokenizer(test_texts['Text'].tolist(), truncation=True, max_length=256, padding=True)

# 7. Dataset setup for PyTorch

In [None]:
import torch

class TweetsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: mapping from feature name (for example ``input_ids``) to a
            per-sample sequence, as produced by the tokenizer.
        label: labels aligned position-by-position with the encodings. May be a
            list, NumPy array, tensor or pandas Series.
    """

    def __init__(self, encodings, label):
        self.encodings = encodings
        # Store labels as a plain list so ``self.label[idx]`` is always positional.
        # Indexing a pandas Series with an integer is label-based, which raises
        # KeyError (or returns the wrong row) when the Series still carries the
        # shuffled index produced by a train/test split.
        self.label = label.tolist() if hasattr(label, 'tolist') else list(label)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, with the class id under ``'label'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['label'] = torch.tensor(self.label[idx])
        return item

    def __len__(self):
        """Number of samples, defined by the number of labels."""
        return len(self.label)


In [None]:
def gen_train_datasets(train_encodings_list, train_labels_list):
  """Pair each encoding set with the labels at the same position.

  Returns:
    A list of ``TweetsDataset`` objects, one per entry of ``train_encodings_list``,
    in the same order.
  """
  return [
      TweetsDataset(encodings, train_labels_list[position])
      for position, encodings in enumerate(train_encodings_list)
  ]

In [None]:
# One shared test set built from the pooled test split, and one training set per location.
test_dataset = TweetsDataset(test_encodings, test_labels['labels'].tolist())
train_datasets = gen_train_datasets(train_encodings_list, train_labels_list)

# 8. Adapt Pre-trained BERT

In [None]:
from torch import nn
from sklearn.utils.class_weight import compute_class_weight

In [None]:
class CustomBERTModel(nn.Module):
    """``bert-base-uncased`` encoder followed by dropout and a 3-way linear classifier.

    Args:
        class_weights: optional per-class weights for the cross-entropy loss
            (sequence or 1-D tensor with one entry per class). When omitted,
            'balanced' weights are computed once - on the first forward pass that
            receives labels - from the module-level ``train_labels``.
    """

    def __init__(self, class_weights=None):
        super(CustomBERTModel, self).__init__()
        self.num_labels = 3

        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 3)

        # Kept as a plain attribute (not a parameter or buffer) so the set of
        # state_dict / named_parameters keys is unchanged.
        self._class_weights = (
            None if class_weights is None
            else torch.as_tensor(class_weights, dtype=torch.float)
        )

    def _loss_weights(self, device):
        """Return the cached class-weight tensor on ``device``, computing it on first use."""
        if self._class_weights is None:
            # Computed once instead of on every forward pass; reads the global
            # `train_labels` left behind by the train/test split cell.
            class_weights = compute_class_weight('balanced', classes = np.unique(train_labels), y = train_labels)
            self._class_weights = torch.tensor(class_weights, dtype = torch.float)
        # Follow the logits' device instead of assuming CUDA is available.
        self._class_weights = self._class_weights.to(device)
        return self._class_weights

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the encoder and classifier, optionally computing a weighted loss.

        All arguments except ``labels`` are forwarded unchanged to the BERT encoder.

        Args:
            labels: optional tensor of class indices in ``[0, num_labels - 1]``.
                When given, a class-weighted cross-entropy loss is computed.

        Returns:
            ``(loss, logits, *extra)`` when ``labels`` is given, otherwise
            ``(logits, *extra)``, where ``extra`` is ``outputs[2:]`` of the encoder.
        """

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 1 of the encoder output is the pooled representation fed to the classifier.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        output = (logits,) + outputs[2:]

        # Inference without labels: no loss to compute.
        if labels is None:
            return output

        loss_fct = nn.CrossEntropyLoss(weight=self._loss_weights(logits.device))
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return (loss,) + output

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
import transformers
import math
import copy

# 9. Functions for federated averaging and training setup

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Mini-batch size used by the training loaders.
batch_size = 4
# Number of federated cycles assumed when sizing the learning-rate schedule.
cycles = 3

In [None]:
def create_n_models(train_datasets, epochs=1):
  """Create one independent client per training dataset.

  Args:
    train_datasets: sequence of datasets, one per client.
    epochs: number of passes over the data per cycle that the learning-rate
      schedule should cover. The default of 1 sizes the schedule for
      ``cycles`` passes in total; callers that train several epochs per cycle
      should pass that number, otherwise the linear schedule reaches zero early.

  Returns:
    A list of dicts, one per dataset, with keys ``'model'``, ``'loader'``,
    ``'optim'``, ``'scheduler'`` and ``'n_samples'``.
  """
  models = []

  for dataset in train_datasets:
    model = CustomBERTModel()
    # Move to the configured device only, so this also runs without CUDA.
    model.to(device)

    n_samples = len(dataset)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    optim = AdamW(model.parameters(), lr=5e-5,  weight_decay=0.01)

    # Linear warmup over the first 10% of the scheduled steps, linear decay afterwards.
    warm_steps = int(n_samples*0.1/batch_size*cycles*epochs)
    train_steps = int(n_samples/batch_size*cycles*epochs)
    scheduler = transformers.get_linear_schedule_with_warmup(optim, warm_steps, train_steps)

    models.append({'model': model, 'loader': train_loader, 'optim': optim, 'scheduler': scheduler, 'n_samples': n_samples})

  return models

In [None]:
def fedavg(models):
  """Build a new global model whose parameters are the mean of the clients' parameters.

  The mean is uniform: every client counts equally and the ``'n_samples'``
  entry of the client dicts is not used.

  Args:
    models: non-empty list of client dicts, each holding a ``'model'`` with the
      same parameter names.

  Returns:
    A fresh ``CustomBERTModel`` on ``device`` loaded with the averaged parameters.

  Raises:
    ValueError: if ``models`` is empty.
  """
  if not models:
    raise ValueError('fedavg needs at least one client model')

  # Build each client's name -> parameter mapping once, not once per parameter name.
  client_params = [dict(entry['model'].named_parameters()) for entry in models]
  averaged = {}

  # Pure tensor arithmetic: no autograd graph should be recorded for the averaging.
  with torch.no_grad():
    for name, reference in client_params[0].items():
      total = torch.zeros_like(reference)
      for params in client_params:
        total += params[name].detach()
      # Divide by the number of clients actually averaged (not by a global list length).
      averaged[name] = total / len(client_params)

  model = CustomBERTModel()
  # Move to the configured device only, so this also runs without CUDA.
  model.to(device)
  # strict=False: only parameters are supplied, buffers keep their freshly initialised values.
  model.load_state_dict(averaged, strict=False)
  return model

In [None]:
def same2(m1, m2):
  """Return True when every named parameter of ``m1`` equals the same-named parameter of ``m2``.

  The comparison is element-wise and stops at the first parameter that differs.
  """
  other_params = dict(m2.named_parameters())
  return all(
      bool(torch.eq(param, other_params[name]).all())
      for name, param in m1.named_parameters()
  )

In [None]:
def update_models(global_model, models):
  """Overwrite every client's parameters with those of ``global_model``.

  Args:
    global_model: module whose named parameters are broadcast to the clients.
    models: list of client dicts; each ``'model'`` entry is updated in place.

  Returns:
    The same ``models`` list that was passed in.
  """
  # load_state_dict copies values into each client's own tensors, so the global
  # parameters can be shared across clients without deep-copying them first.
  global_params = dict(global_model.named_parameters())

  for client in models:
    # strict=False: only parameters are supplied; each client's buffers are left untouched.
    client['model'].load_state_dict(global_params, strict=False)

  return models

# 10. Evaluation code

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report

In [None]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1, and print the accuracy.

    Args:
        pred: mapping with ``'label_ids'`` (true class ids) and ``'predictions'``
            (per-class scores; the predicted class is the argmax over the last axis).

    Returns:
        Dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    label = pred['label_ids']
    preds = pred['predictions'].argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(label, preds, average='macro')
    acc = accuracy_score(label, preds)
    # No classification_report is built here: its result was never displayed or returned.
    print(acc)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
def evaluate(model):
  """Score ``model`` on the shared ``test_dataset`` and report the metrics.

  The model is switched to eval mode and left there; callers that continue
  training must call ``model.train()`` again.

  Args:
    model: module called as ``model(input_ids, attention_mask=..., labels=...)``
      whose output has the logits at index 1.

  Returns:
    The metrics dict produced by ``compute_metrics`` (accuracy is also printed there).
  """
  # Metrics do not depend on sample order, so the test set is not shuffled.
  test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
  logit_chunks = []
  label_chunks = []
  model.eval()

  with torch.no_grad():
    for s, batch in enumerate(test_loader):

      if s %300 == 0 and not s == 0:
        print('Batch {:>5,} of {:>5,}'.format(s, len(test_loader)))

      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      label = batch['label'].to(device)

      outputs = model(input_ids, attention_mask=attention_mask, labels=label)

      # Collect per-batch results and concatenate once at the end, instead of
      # re-copying the whole accumulated tensor on every batch.
      logit_chunks.append(outputs[1])
      label_chunks.append(label)

  pred = {
      'label_ids': (torch.cat(label_chunks, 0) if label_chunks else torch.empty(0)).cpu(),
      'predictions': (torch.cat(logit_chunks, 0) if logit_chunks else torch.empty(0)).cpu(),
  }

  return compute_metrics(pred)

# 11. Training code

In [None]:
import time
start_time = time.time()

In [None]:
# Module-level log that train() appends its client list to (once at start and once per cycle).
model_list = []

In [None]:
def train(cycles, epochs):
  """Run federated training: local training on every client, then parameter averaging.

  In each cycle every client model is trained for ``epochs`` passes over its own
  loader, every client and the averaged global model are evaluated on the shared
  test set, and the global parameters are copied back into all clients.

  Args:
    cycles: number of federated rounds.
    epochs: number of local passes over each client's data per round.

  Returns:
    The list of client dicts after the final round.
  """
  models = create_n_models(train_datasets)
  model_list.append(models)

  # Size every learning-rate schedule for the optimizer steps this call will
  # actually take (batches * epochs * cycles). A schedule that covers fewer steps
  # decays the learning rate to zero part-way through, after which training no
  # longer changes the weights.
  for model_dict in models:
    total_steps = len(model_dict['loader']) * epochs * cycles
    model_dict['scheduler'] = transformers.get_linear_schedule_with_warmup(
        model_dict['optim'], int(total_steps * 0.1), total_steps)

  for cycle in range(cycles):
    print('---------- Starting cycle {} ----------'.format(cycle+1))

    for j, model_dict in enumerate(models):
      print('---------- Training model No: {} ----------'.format(j+1))

      model = model_dict['model']

      # evaluate() leaves models in eval mode, so re-enable training mode every cycle.
      model.train()

      train_loader = model_dict['loader']
      optim = model_dict['optim']
      scheduler = model_dict['scheduler']

      for epoch in range(epochs):
        print('---------- Epoch No: {} ----------'.format(epoch+1))

        for s, batch in enumerate(train_loader):

          if s %300 == 0 and not s == 0:
            print('Batch {:>5,} of {:>5,}'.format(s, len(train_loader)))

          optim.zero_grad()

          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          label = batch['label'].to(device)

          outputs = model(input_ids, attention_mask=attention_mask, labels=label)
          loss = outputs[0]

          loss.backward()
          optim.step()
          scheduler.step()

    for i,m in enumerate(models):
      print('---------- Evaluating model No: {} ----------'.format(i+1))
      evaluate(m['model'])

    print('---------- Evaluating global model ----------')
    global_model = fedavg(models)
    evaluate(global_model)
    models = update_models(global_model, models)

    model_list.append(models)

    # Drop the reference to the temporary global model before the next cycle.
    del global_model

  return models



In [None]:
# Run 3 federated cycles with 5 local epochs per cycle, then report the seconds
# elapsed since `start_time` was recorded.
final_models = train(3,5)
print("--- %s seconds ---" % (time.time() - start_time))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weig

---------- Starting cycle 1 ----------
---------- Training model No: 1 ----------
---------- Epoch No: 1 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Epoch No: 2 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Epoch No: 3 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Epoch No: 4 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Epoch No: 5 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Training model No: 2 ----------
---------- Epoch No: 1 ----------
Batch   300 of 2,000
Batch   600 of 2,000

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batch   300 of   750
Batch   600 of   750
0.869
---------- Starting cycle 2 ----------
---------- Training model No: 1 ----------
---------- Epoch No: 1 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Epoch No: 2 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Epoch No: 3 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Epoch No: 4 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Epoch No: 5 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Training model No: 2 ----------
---------- Epoch No: 1 ----

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batch   300 of   750
Batch   600 of   750
0.869
---------- Starting cycle 3 ----------
---------- Training model No: 1 ----------
---------- Epoch No: 1 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Epoch No: 2 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Epoch No: 3 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Epoch No: 4 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Epoch No: 5 ----------
Batch   300 of 2,000
Batch   600 of 2,000
Batch   900 of 2,000
Batch 1,200 of 2,000
Batch 1,500 of 2,000
Batch 1,800 of 2,000
---------- Training model No: 2 ----------
---------- Epoch No: 1 ----

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batch   300 of   750
Batch   600 of   750
0.869
--- 23300.809284448624 seconds ---
