
*   AHSAN ALI KHAN REG No 429834
*   M RAMZAN NIAZ REG No 401629


In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the project archive is later read
# from a path under /content/drive.
drive.mount("/content/drive")

In [2]:
import zipfile

# Source archive on the mounted Drive and the local folder it is unpacked into.
zip_file_path = '/content/drive/MyDrive/NLP Project/nlp code.zip'
destination_directory = '/content/extracted_folder'

# Unpack every member of the archive; the context manager closes the file
# handle once extraction is finished.
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=destination_directory)


In [5]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m79.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [3]:
!wget dl.fbaipublicfiles.com/care/care_bert.pth

--2023-05-19 13:07:25--  http://dl.fbaipublicfiles.com/care/care_bert.pth
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.227.219.70, 13.227.219.10, 13.227.219.59, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.227.219.70|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 438011249 (418M) [binary/octet-stream]
Saving to: ‘care_bert.pth’


2023-05-19 13:07:43 (24.7 MB/s) - ‘care_bert.pth’ saved [438011249/438011249]



In [6]:
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.


from transformers import BertModel, BertTokenizer
import torch
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.bert.modeling_bert import BertPreTrainedModel
from transformers.models.bert.configuration_bert import BertConfig

# Affect names in classifier-output order: `predict` maps the logit at
# position i to class_labels[i], so the order here must not change.
class_labels = [
    "adoring", "amused", "angered", "approving",
    "excited", "saddened", "scared",
]


class CAREBERT(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear multi-label head.

    The head produces one logit per label (``config.num_labels``). When labels
    are supplied, the loss is ``BCEWithLogitsLoss``, i.e. every label is scored
    independently rather than through a softmax over classes.
    """

    def __init__(
        self, config: BertConfig, model_load_path: Optional[str] = "./care_bert.pth"
    ):
        """Build the encoder and head, optionally restoring encoder weights.

        Args:
            config: BERT configuration; ``config.num_labels`` sets the width of
                the classification head.
            model_load_path: Path to a checkpoint whose ``"model_state_dict"``
                entry is loaded into the ``BertModel`` encoder. Pass ``None``
                to skip loading.
        """
        super().__init__(config)
        self.config = config
        self.bert = BertModel(config)

        if model_load_path is not None:
            # map_location="cpu" lets a checkpoint that was saved from a GPU be
            # read on a CPU-only runtime; load_state_dict then copies the
            # values into this module's own parameters.
            # SECURITY: torch.load unpickles the file, so only load checkpoints
            # from a trusted source.
            checkpoint = torch.load(model_load_path, map_location="cpu")
            self.bert.load_state_dict(checkpoint["model_state_dict"])
            print(f"Loaded from old {model_load_path}")
            # NOTE(review): when the model is created through
            # `from_pretrained(...)`, the hub checkpoint is applied after this
            # constructor returns, which presumably replaces the encoder
            # weights loaded here - confirm the intended load order.

        classifier_dropout = (
            config.classifier_dropout
            if config.classifier_dropout is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        Run the encoder and the classification head.

        labels (`torch.Tensor` with the same shape as the logits, *optional*):
            Multi-hot targets for the multi-label loss (`BCEWithLogitsLoss`).
            Integer/bool targets are cast to the logits' dtype before the loss
            is computed; floating-point targets are used as given.

        Returns a `SequenceClassifierOutput` when `return_dict` resolves to
        true, otherwise a tuple `(loss?, logits, *extra encoder outputs)` where
        `loss` is present only if `labels` was given.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 1 of the encoder output is the pooled representation.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # BCEWithLogitsLoss requires floating-point targets; accept 0/1
            # integer or bool multi-hot labels by casting them here.
            if not torch.is_floating_point(labels):
                labels = labels.to(logits.dtype)
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)

        if not return_dict:
            # Skip the first two encoder outputs and forward whatever follows.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


# Run predictions for a list of texts, returning a list of the list of affects predicted for each example.
def predict(
    examples: List[str], threshold: float = 0.5, model_load_path="./care_bert.pth"
) -> List[List[str]]:
    """Predict the affects expressed by each text.

    Args:
        examples: Texts to classify. An empty list yields an empty result
            without loading the model.
        threshold: Cut-off compared against the raw classifier logits (no
            sigmoid is applied), so it is not a probability threshold.
        model_load_path: Checkpoint path forwarded to ``CAREBERT``.

    Returns:
        One list per input text holding the ``class_labels`` entries whose
        logit is strictly greater than ``threshold``.
    """
    if not examples:
        return []

    model = CAREBERT.from_pretrained(
        "bert-base-uncased",
        num_labels=len(class_labels),
        model_load_path=model_load_path,
    )
    # Inference mode: disables dropout so repeated calls give the same logits.
    model.eval()

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    encoding = tokenizer(
        examples,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )

    # forward pass; no gradients are needed for prediction
    with torch.no_grad():
        outs = model(**encoding, return_dict=False)
    logits = outs[0]

    predictions = []
    for example_logits in logits:
        above_threshold = (example_logits > threshold).tolist()
        predictions.append(
            [label for label, hit in zip(class_labels, above_threshold) if hit]
        )
    return predictions


if __name__ == "__main__":
    # Smoke run: classify two sample sentences and show the predicted affects.
    examples = [
        "Warriors against the Miami Heat!!!",
        "That was so hilarious",
    ]
    print(predict(examples))


Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loaded from old ./care_bert.pth


Some weights of the model checkpoint at bert-base-uncased were not used when initializing CAREBERT: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing CAREBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CAREBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CAREBERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

[['saddened'], []]


In [12]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import sys
from collections import Counter
from typing import Dict, List

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize

# Make the modules unpacked from the project archive importable. This has to
# happen before the local imports at the bottom of this cell.
path_to_module = '/content/extracted_folder/regex_pipeline.py'
sys.path.append('/content/extracted_folder/')

# Download the punkt tokenizer data used by nltk's word tokenizer.
nltk.download('punkt')

# Project-local modules, resolved through the sys.path entry added above.
import regex_pipeline
import utils

# Labels posts based on if at least t comments are labeled with the same affect.
def label_posts(
    post_id_to_comment_texts: Dict[str, List[str]], t: int = 5
) -> pd.DataFrame:
    """Assign affect labels to posts from regex matches on their comments.

    For every post, the affects returned by
    ``regex_pipeline.get_regex_match_all`` for each comment are tallied, the
    tally is passed through ``utils.cluster_and_filter``, and only entries
    whose resulting count is at least ``t`` are kept.

    Args:
        post_id_to_comment_texts: Mapping from post id to its comment texts.
        t: Minimum count an affect needs in order to be kept.

    Returns:
        DataFrame with columns ``post_id`` and ``affect_map`` holding one row
        per post that kept at least one affect; other posts are omitted.
    """
    rows = []

    for post_id, comment_texts in post_id_to_comment_texts.items():
        # Flatten the matches of all comments of this post into one list.
        matched_affects = [
            affect
            for comment_text in comment_texts
            for affect in regex_pipeline.get_regex_match_all(comment_text)
        ]
        clustered_counts = utils.cluster_and_filter(dict(Counter(matched_affects)))
        kept_counts = {
            affect: count for affect, count in clustered_counts.items() if count >= t
        }
        if kept_counts:
            rows.append([post_id, kept_counts])

    return pd.DataFrame(rows, columns=["post_id", "affect_map"])


if __name__ == "__main__":
    # Smoke run: a single post with three comments, labelled with t=3.
    example_dict = {
        "1": [
            "This is so funny!!",
            "Cannot stop laughing at this.",
            "So hilarious",
        ],
    }
    print(label_posts(example_dict, t=3))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


  post_id     affect_map
0       1  {'amused': 3}
