In [1]:
!pip install transformers
!pip install datasets
!pip install -q transformers accelerate



In [2]:
from datasets import load_dataset # for loading the datasets
from transformers import pipeline # loading the pipeline
import pandas as pd

In [3]:
# Dataset loading helper
def get_dataset(datasetname=None, version=None):
    """Load a dataset through ``load_dataset`` without ever raising.

    Args:
        datasetname (str): Forwarded as the first positional argument of
            ``load_dataset``.
        version (str): Forwarded as the second positional argument of
            ``load_dataset``.

    Returns:
        The object produced by ``load_dataset`` on success. If the call
        raises, the caught exception instance is *returned* (not raised),
        so callers have to inspect the result themselves.
    """
    try:
        loaded = load_dataset(datasetname, version)
    except Exception as error:
        return error
    return loaded


# Dataset split -> pandas conversion helper
def display_dataframe(dataset=None, name=None, nrows=None):
    """Convert one entry of ``dataset`` to pandas, optionally keeping only leading rows.

    Args:
        dataset: Object indexable by ``name``; the selected entry must offer
            ``to_pandas()`` and, when ``nrows`` is given, ``select(...)``.
        name (str): Key used to pick the entry out of ``dataset``.
        nrows (int): When not ``None``, only the rows at positions
            ``range(nrows)`` are kept before conversion.

    Returns:
        The value returned by ``to_pandas()``. If anything along the way
        raises, the caught exception instance is returned instead of raised.
    """
    try:
        selected = dataset[name]
        if nrows is not None:
            # Restrict to the first ``nrows`` positions before converting.
            selected = selected.select(range(nrows))
        return selected.to_pandas()
    except Exception as error:
        return error



In [4]:
class Pipelines:
    """Thin wrapper that builds one ``pipeline`` object and stores it on the instance."""

    def __init__(self, task=None, model=None, min_length=None, max_length=None, truncation=None, device=None):
        """Create the underlying pipeline and keep it as ``self.pipeline_obj``.

        Args:
            task (str): Task name handed to ``pipeline`` (e.g. "summarization",
                "text-classification").
            model (str): Model name or path handed to ``pipeline``.
            min_length (int): Forwarded to ``pipeline`` for every task except
                'text-classification'.
            max_length (int): Forwarded to ``pipeline`` for every task except
                'text-classification'.
            truncation: Forwarded unchanged to ``pipeline``.
            device: Forwarded unchanged to ``pipeline``.
        """
        # Arguments shared by every task.
        pipeline_kwargs = {
            "task": task,
            "model": model,
            "device": device,
            "truncation": truncation,
        }
        # The length bounds are withheld only for text classification.
        if task != 'text-classification':
            pipeline_kwargs["min_length"] = min_length
            pipeline_kwargs["max_length"] = max_length
        self.pipeline_obj = pipeline(**pipeline_kwargs)


In [5]:
# LLMTask: a Pipelines subclass that feeds one DataFrame column through the pipeline
class LLMTask(Pipelines):
    """Run the inherited ``pipeline_obj`` over the values of a DataFrame column.

    Both helper methods report failures by *returning* the caught exception
    object instead of raising it, so callers must check the result type.
    """

    def __init__(self, task=None, model=None, min_length=None, max_length=None, truncation=None, device=None):
        """Forward every argument, unchanged and in order, to ``Pipelines.__init__``."""
        super().__init__(task, model, min_length, max_length, truncation, device)

    def get_text(self, df=None, feature=None):
        """Return the values of ``df[feature]`` as a list.

        Args:
            df: Object indexable by ``feature`` (a pandas DataFrame in this notebook).
            feature: Key/column to extract.

        Returns:
            list: The items obtained by iterating ``df[feature]``.
            Exception: The caught exception instance if indexing or iterating fails.
        """
        try:
            return list(df[feature])
        except Exception as e:
            # Failures are handed back to the caller rather than raised.
            return e

    def operation_on_document(self, df=None, name=None, prompt=None):
        """Apply the pipeline to every value of column ``name`` in ``df``.

        Args:
            df: Source of the texts; passed to ``get_text``.
            name: Column to read the texts from.
            prompt: Accepted for backward compatibility; not used.

        Returns:
            The pipeline output for the extracted texts, or an exception
            instance when extracting the texts or running the pipeline fails.
        """
        try:
            texts = self.get_text(df, name)
            # ``get_text`` signals failure by returning the exception. Hand that
            # error back as-is instead of passing an exception object to the
            # pipeline, which would replace the root cause with an unrelated error.
            if isinstance(texts, Exception):
                return texts
            return self.pipeline_obj(texts)
        except Exception as e:
            # Failures are handed back to the caller rather than raised.
            return e


In [6]:
# Summarization: load part of a dataset once, then summarize one of its columns
class Summarization:
    """Couple a loaded dataset split with an ``LLMTask`` summarization run."""

    def __init__(self, dataset_name=None, version=None, split=None, nrows=None):
        """Load the requested split and keep the result as ``self.df``.

        ``self.df`` holds whatever ``display_dataframe`` returns for the object
        produced by ``get_dataset``.
        """
        loaded = get_dataset(datasetname=dataset_name, version=version)
        self.df = display_dataframe(dataset=loaded, name=split, nrows=nrows)

    def summarize_text(self, task="summarization", model=None, min_length=None, max_length=None, truncation=None, featurename=None, device=None):
        """Build an ``LLMTask`` and run it over column ``featurename`` of ``self.df``.

        Returns:
            The value of ``LLMTask.operation_on_document``; if building or
            running the task raises, the caught exception instance is returned.
        """
        try:
            task_options = dict(
                task=task,
                model=model,
                min_length=min_length,
                max_length=max_length,
                truncation=truncation,
                device=device,
            )
            runner = LLMTask(**task_options)
            return runner.operation_on_document(df=self.df, name=featurename)
        except Exception as error:
            # Errors are returned, not raised.
            return error

# SentimentClassification: load part of a dataset once, then classify one of its columns
class SentimentClassification:
    """Couple a loaded dataset split with an ``LLMTask`` text-classification run."""

    def __init__(self, dataset_name=None, version=None, split=None, nrows=None):
        """Load the requested split and keep the result as ``self.df``.

        ``self.df`` holds whatever ``display_dataframe`` returns for the object
        produced by ``get_dataset``.
        """
        loaded = get_dataset(datasetname=dataset_name, version=version)
        self.df = display_dataframe(dataset=loaded, name=split, nrows=nrows)

    def text_sentiment(self, task="text-classification", model=None, min_length=None, max_length=None, truncation=None, featurename=None, device=None):
        """Build an ``LLMTask`` and run it over column ``featurename`` of ``self.df``.

        Returns:
            The value of ``LLMTask.operation_on_document``; if building or
            running the task raises, the caught exception instance is returned.
        """
        try:
            task_options = dict(
                task=task,
                model=model,
                min_length=min_length,
                max_length=max_length,
                truncation=truncation,
                device=device,
            )
            classifier = LLMTask(**task_options)
            return classifier.operation_on_document(df=self.df, name=featurename)
        except Exception as error:
            # Errors are returned, not raised.
            return error


In [7]:
# Load the first 10 rows of the 'train' split of "xsum" (second load argument '1.2.0')
# and keep them on the Summarization helper.
summarizer = Summarization(dataset_name="xsum", version='1.2.0', split='train', nrows=10)

# Summarize the 'document' column with the "t5-small" model, passing
# min_length=20, max_length=40 and truncation=True through to the pipeline.
# NOTE: errors are returned rather than raised, so `result` may be an
# exception object instead of a list of summaries.
result = summarizer.summarize_text(model="t5-small", min_length=20, max_length=40, truncation=True, featurename='document')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [8]:
# Display the value returned by summarize_text.
result

[{'summary_text': 'the full cost of damage in Newton Stewart is still being assessed . many roads in peeblesshire remain badly affected by standing water . a flood alert remains in place across the'},
 {'summary_text': 'a fire alarm went off at the Holiday Inn in Hope Street on Saturday . guests were asked to leave the hotel . the two buses were parked side-by-side in'},
 {'summary_text': 'Sebastian Vettel will start third ahead of team-mate Kimi Raikkonen . stewards only handed Hamilton a reprimand after governing body said "n'},
 {'summary_text': 'the 67-year-old is accused of committing the offences between March 1972 and October 1989 . he denies all the charges, including two counts of indecency'},
 {'summary_text': 'a man receiving psychiatric treatment at the clinic threatened to shoot himself and others . the incident comes amid tension in Istanbul following several attacks in crowded areas .'},
 {'summary_text': 'Gregor Townsend gave a debut to powerhouse wing Taqele Naiyaravor

In [None]:
# Load the first 10 rows of the 'train' split of "poem_sentiment"
# and keep them on the SentimentClassification helper.
sentiment_classification = SentimentClassification(dataset_name="poem_sentiment", split='train', nrows=10)

# Classify the 'verse_text' column with "nickwong64/bert-base-uncased-poems-sentiment".
# NOTE: this rebinds `result`, replacing the summarization output; errors are
# returned rather than raised, so `result` may be an exception object.
result = sentiment_classification.text_sentiment(model="nickwong64/bert-base-uncased-poems-sentiment", featurename='verse_text')


In [None]:
# Display the value returned by text_sentiment.
result

[{'label': 'positive', 'score': 0.9965937733650208},
 {'label': 'no_impact', 'score': 0.9987409710884094},
 {'label': 'negative', 'score': 0.995965838432312},
 {'label': 'mixed', 'score': 0.9687354564666748},
 {'label': 'mixed', 'score': 0.9759674668312073},
 {'label': 'mixed', 'score': 0.9665797352790833},
 {'label': 'no_impact', 'score': 0.9986388087272644},
 {'label': 'no_impact', 'score': 0.9986108541488647},
 {'label': 'negative', 'score': 0.9965572357177734},
 {'label': 'no_impact', 'score': 0.9985186457633972}]

In [5]:
# Zero-shot classification on top of the shared Pipelines wrapper
class ZeroShotClassification(Pipelines):
    """Label texts against caller-supplied candidate labels via ``pipeline_obj``."""

    def __init__(self, task="zero-shot-classification", model=None, min_length=None, max_length=None, truncation=None):
        """Forward all arguments by keyword to ``Pipelines.__init__``."""
        super().__init__(
            task=task,
            model=model,
            min_length=min_length,
            max_length=max_length,
            truncation=truncation,
        )

    def _classify(self, text, labels):
        """Run the pipeline on one text and drop the "sequence" entry from its result."""
        prediction = self.pipeline_obj(text, labels)
        del prediction["sequence"]
        return prediction

    def categorize_article(self, labels=[], article=None):
        """Classify a single article against ``labels``.

        Args:
            labels: Candidate labels passed to the pipeline.
            article: Text passed to the pipeline.

        Returns:
            pandas.DataFrame: Built from the pipeline result after its
            "sequence" key has been removed. Exceptions are not caught here.
        """
        return pd.DataFrame(self._classify(article, labels))

    def categorize_multiple_inputs(self, labels=[], articles=None):
        """Classify each item of ``articles`` against ``labels``, one pipeline call per item.

        Returns:
            pandas.DataFrame: One frame built from all per-article results
            (each without its "sequence" key), with every column exploded via
            ``pd.Series.explode``. If anything raises, the caught exception
            instance is returned instead.
        """
        try:
            per_article = [self._classify(text, labels) for text in articles]
            frame = pd.DataFrame(per_article)
            # Explode every column so that list-valued cells become separate rows.
            return frame.apply(pd.Series.explode)
        except Exception as error:
            # Errors are returned, not raised.
            return error


In [6]:
class InputText:
    """Pass-through holder for a single text input."""

    @classmethod
    def input_val(cls, user_input):
        """Return ``user_input`` unchanged.

        No validation is performed. A bare ``return`` of an argument cannot
        raise, so there is no error path to report.
        """
        return user_input


class InputList:
    """Pass-through holder for a list of inputs."""

    @classmethod
    def input_val(cls, user_input=[]):
        """Return ``user_input`` unchanged (an empty list when omitted).

        No validation is performed. A bare ``return`` of an argument cannot
        raise, so there is no error path to report.

        NOTE: the default is one list object shared by every call that omits
        the argument; callers should not mutate the returned default.
        """
        return user_input


class Labels:
    """Pass-through holder for candidate labels."""

    @classmethod
    def labels(cls, input_vars=[]):
        """Return ``input_vars`` unchanged (an empty list when omitted).

        No validation is performed. A bare ``return`` of an argument cannot
        raise, so there is no error path to report.

        NOTE: the default is one list object shared by every call that omits
        the argument; callers should not mutate the returned default.
        """
        return input_vars


In [17]:
class SingleInputClassification:
    """Zero-shot classification of one text against a set of labels."""

    def __init__(self, inputs=None, labels=None):
        """Store the text and labels after passing them through ``InputText`` / ``Labels``."""
        self.inputs = InputText.input_val(inputs)
        self.labels = Labels.labels(labels)

    def classify(self, model_name):
        """Classify the stored text with the model ``model_name``.

        A ``ZeroShotClassification`` is created with ``min_length=40`` and its
        ``categorize_article`` result is returned. If anything raises, the
        caught exception instance is returned instead of raised.
        """
        try:
            classifier = ZeroShotClassification(model=model_name, min_length=40)
            outcome = classifier.categorize_article(labels=self.labels, article=self.inputs)
        except Exception as error:
            return error
        return outcome


class MultiInputClassification:
    """Zero-shot classification of several texts against a set of labels."""

    def __init__(self, inputs=None, labels=None):
        """Store the texts and labels after passing them through ``InputList`` / ``Labels``."""
        self.inputs = InputList.input_val(inputs)
        self.labels = Labels.labels(labels)

    def classify(self, model_name):
        """Classify the stored texts with the model ``model_name``.

        A ``ZeroShotClassification`` is created with ``min_length=40`` and its
        ``categorize_multiple_inputs`` result is returned. If anything raises,
        the caught exception instance is returned instead of raised.
        """
        try:
            classifier = ZeroShotClassification(model=model_name, min_length=40)
            outcome = classifier.categorize_multiple_inputs(labels=self.labels, articles=self.inputs)
        except Exception as error:
            return error
        return outcome


In [15]:
# Single article to classify.
user_input = "NC Chief Farooq Abdullah while talking about the Lok Sabha elections said, PM Modi has right to give statements. It is up to you to decide what is right and what is wrong. The price of edible oil has skyrocketed, poverty has surged. People are struggling with electricity bills. Youths are not getting jobs. You should ask people who are suffering. Medicine prices have swelled. They talk Ram Mandir. Have they made it? Everybody donated. It is our mistake that we have kept Quran but have not read it."

# Candidate labels the article is scored against.
candidate_labels = ["politics", "finance", "sports", "science and technology", "pop culture", "breaking news"]
# Classify the article with 'cross-encoder/nli-deberta-v3-small'.
# NOTE: errors are returned rather than raised, so `df` may be an exception object.
sv = SingleInputClassification(inputs=user_input,labels=candidate_labels)
df = sv.classify(model_name='cross-encoder/nli-deberta-v3-small')




In [16]:
# Display the value returned by SingleInputClassification.classify.
df

Unnamed: 0,labels,scores
0,politics,0.253064
1,finance,0.215408
2,breaking news,0.194973
3,science and technology,0.175497
4,pop culture,0.106231
5,sports,0.054826


In [12]:
# List of user input articles related to the medical domain
# Articles to classify, one pipeline call per article.
user_inputs = [
    "A new study published in a medical journal suggests that a certain medication may reduce the risk of heart disease. The research involved a large-scale clinical trial conducted over several years. The results show promising outcomes for patients with a history of cardiovascular issues.",
    "Researchers have discovered a potential link between a specific gene mutation and a rare neurological disorder. The findings could lead to new treatment approaches for patients affected by the condition."
]

# Candidate labels for the medical articles (rebinds `candidate_labels`).
candidate_labels = ["cardiology", "oncology", "neurology", "pharmacology", "public health", "clinical trials"]

# Build a MultiInputClassification over the articles and labels, then classify
# with 'facebook/bart-large-mnli'. Errors are returned rather than raised, so
# `df_` may be an exception object.

mv = MultiInputClassification(inputs=user_inputs,labels=candidate_labels)
df_ = mv.classify(model_name='facebook/bart-large-mnli')



In [13]:
# Display the value returned by MultiInputClassification.classify.
df_

Unnamed: 0,labels,scores
0,clinical trials,0.722382
0,cardiology,0.139267
0,pharmacology,0.093857
0,public health,0.035825
0,neurology,0.005666
0,oncology,0.003003
1,neurology,0.85598
1,public health,0.069927
1,pharmacology,0.031495
1,clinical trials,0.025321
