In [None]:
!pip install transformers pydantic
!pip install bitsandbytes
!pip install pydantic


Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [None]:

!pip install openai transformers accelerate pydantic

!pip install openai==0.28


import openai
from transformers import AutoTokenizer, AutoModelForCausalLM
from pydantic import BaseModel, ValidationError
import json
import torch
from getpass import getpass



Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.54.4
    Uninstalling openai-1.54.4:
      Successfully uninstalled openai-1.54.4
Successfully installed openai-0.28.0


In [None]:
#structure of expected JSON output using Pydantic
class ProcessedData(BaseModel):
    title: str
    summary: str
    key_points: list
    tags: list


In [None]:
#api call
openai_api_key = getpass("Enter your OpenAI API key: ")
openai.api_key = openai_api_key

#def openAI
def process_with_openai(raw_text):
    prompt = f"""
    You are a data processing assistant. Given the raw text, convert it into a JSON format with the following fields:
    - "title": A brief title summarizing the text.
    - "summary": A concise summary.
    - "key_points": A list of key points.
    - "tags": Relevant tags.

    Text: "{raw_text}"

    JSON:
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
            temperature=0.7
        )
        output = response['choices'][0]['message']['content'].strip()
        return ProcessedData.parse_raw(output)
    except ValidationError as e:
        print("Validation Error:", e)
        return None
    except openai.error.OpenAIError as e:
        print("API Error:", e)
        return None


In [13]:

raw_text = """
Cloud computing provides flexible resources over the internet, enabling on-demand access to servers, storage, and applications.
Companies adopt it for scalability and cost efficiency.
"""


print("Processing with OpenAI API...")
openai_output = process_with_openai(raw_text)
if openai_output:
    print("\nOpenAI Output:")
    print(json.dumps(openai_output.dict(), indent=4))
else:
    print("Failed to process with OpenAI.")


Processing with OpenAI API...

OpenAI Output:
{
    "title": "Benefits of Cloud Computing",
    "summary": "Cloud computing offers flexible resources over the internet for on-demand access to servers, storage, and applications. Companies use it for scalability and cost efficiency.",
    "key_points": [
        "Flexible resources over the internet",
        "On-demand access to servers, storage, and applications",
        "Scalability",
        "Cost efficiency"
    ],
    "tags": [
        "Cloud Computing",
        "Scalability",
        "Cost Efficiency"
    ]
}


In [None]:
#hf login with token
hf_token = getpass("Enter your Hugging Face token: ")

Enter your Hugging Face token: ··········


In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `Colab Access fine tune` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token 

In [14]:
import json
from transformers import pipeline

#load models
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
keyword_extractor = pipeline("feature-extraction")  #this can be used for extracting features

def local_model(text):

    title = text.split('\n')[0]

    summary = summarizer(text, max_length=100, min_length=30, do_sample=False)[0]['summary_text']

    key_points = summary.split('. ')

    tags = [word for word in summary.split() if len(word) > 3][:3]
    structured_json = {
        "title": title,
        "summary": summary,
        "key_points": key_points,
        "tags": tags
    }

    return structured_json


#input of raw data
raw_text = """Cloud computing provides flexible resources over the internet, enabling on-demand access to servers, storage, and applications.
Companies adopt it for scalability and cost efficiency."""

structured_json = extract_title_and_summary(raw_text)
print(json.dumps(structured_json, indent=4))

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
No model was supplied, defaulted to distilbert/distilbert-base-cased and revision 6ea8117 (https://huggingface.co/distilbert/distilbert-base-cased).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 100, but your input_length is only 35. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)


{
    "title": "Cloud computing provides flexible resources over the internet, enabling on-demand access to servers, storage, and applications.",
    "summary": "Companies adopt it for scalability and cost efficiency. Cloud computing provides flexible resources over the internet. It enables on-demand access to servers, storage, and applications.",
    "key_points": [
        "Companies adopt it for scalability and cost efficiency",
        "Cloud computing provides flexible resources over the internet",
        "It enables on-demand access to servers, storage, and applications."
    ],
    "tags": [
        "Companies",
        "adopt",
        "scalability"
    ]
}


# Comparision

In [24]:
import json
import os

#comparing both results
if openai_output and local_model:

    local_model_output = local_model(raw_text)

    comparison = {
        "OpenAI Output": openai_output.dict(),
        "Local Model Output": local_model_output
    }


    print("\nComparison:")
    print(json.dumps(comparison, indent=4))


    json_file_path = 'comparison_output.json'

    with open(json_file_path, 'w') as json_file:
        json.dump(comparison, json_file, indent=4)

    print(f"\nComparison saved to {json_file_path}. You can download it.")

else:
    print("Comparison not possible. One or both outputs are missing.")

Your max_length is set to 100, but your input_length is only 35. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)



Comparison:
{
    "OpenAI Output": {
        "title": "Benefits of Cloud Computing",
        "summary": "Cloud computing offers flexible resources over the internet for on-demand access to servers, storage, and applications. Companies use it for scalability and cost efficiency.",
        "key_points": [
            "Flexible resources over the internet",
            "On-demand access to servers, storage, and applications",
            "Scalability",
            "Cost efficiency"
        ],
        "tags": [
            "Cloud Computing",
            "Scalability",
            "Cost Efficiency"
        ]
    },
    "Local Model Output": {
        "title": "Cloud computing provides flexible resources over the internet, enabling on-demand access to servers, storage, and applications.",
        "summary": "Companies adopt it for scalability and cost efficiency. Cloud computing provides flexible resources over the internet. It enables on-demand access to servers, storage, and applications.

# Save Result