In [None]:
# Install required libraries
!pip install openai transformers accelerate pydantic
#!pip install --upgrade openai
!pip install openai==0.28

# Import libraries
import openai
from transformers import AutoTokenizer, AutoModelForCausalLM
from pydantic import BaseModel, ValidationError
import json
import torch
from getpass import getpass





In [None]:
# Define the structure of the expected JSON output using Pydantic
class ProcessedData(BaseModel):
    """Schema that a model's JSON reply must satisfy.

    The fields mirror the ones requested in the prompts used by this notebook.
    The list fields are typed as ``list[str]`` so that the item type is validated
    too, matching the later definitions of this model in the notebook.
    """
    title: str  # brief title summarizing the text
    summary: str  # concise summary of the text
    key_points: list[str]  # key points, one string per point
    tags: list[str]  # relevant tags, one string per tag


In [None]:
# Configure OpenAI API: read the key interactively (input is not echoed) and
# hand it to the client library in one step.
openai.api_key = openai_api_key = getpass("Enter your OpenAI API key: ")

# Define OpenAI processing function
def _extract_json_object(text):
    """Return the span from the first '{' to the last '}' in ``text``, or None if absent."""
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        return None
    return text[start:end + 1]


def process_with_openai(raw_text, model="gpt-3.5-turbo", max_tokens=200, temperature=0.7):
    """Ask an OpenAI chat model to convert ``raw_text`` into a ``ProcessedData`` record.

    Chat models frequently wrap the requested JSON in a Markdown code fence or add
    a sentence around it, so the reply is reduced to its ``{...}`` span before it
    is validated instead of being parsed verbatim.

    Args:
        raw_text: The text to summarise.
        model: Chat model name passed to ``openai.ChatCompletion.create``.
        max_tokens: Completion token limit; raise it if replies get cut off
            before the closing brace.
        temperature: Sampling temperature passed to the API.

    Returns:
        A validated ``ProcessedData`` instance, or ``None`` when the reply holds no
        JSON object, fails validation, or the API call raises an OpenAI error
        (the reason is printed in each case).
    """
    prompt = f"""
    You are a data processing assistant. Given the raw text, convert it into a JSON format with the following fields:
    - "title": A brief title summarizing the text.
    - "summary": A concise summary.
    - "key_points": A list of key points.
    - "tags": Relevant tags.

    Text: "{raw_text}"

    JSON:
    """
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature
        )
        output = response['choices'][0]['message']['content'].strip()

        # Drop code fences / surrounding prose; keep only the JSON object itself.
        json_text = _extract_json_object(output)
        if json_text is None:
            print("Validation Error: JSON structure not found in the model output.")
            return None

        return ProcessedData.parse_raw(json_text)
    except ValidationError as e:
        print("Validation Error:", e)
        return None
    except openai.error.OpenAIError as e:
        print("API Error:", e)
        return None


Enter your OpenAI API key: ··········


In [None]:
# Sample passage used as input for the OpenAI run
raw_text = """
Cloud computing provides flexible resources over the internet, enabling on-demand access to servers, storage, and applications.
Companies adopt it for scalability and cost efficiency.
"""

# Send the passage through the OpenAI path and report the outcome
print("Processing with OpenAI API...")
openai_output = process_with_openai(raw_text)
if not openai_output:
    print("Failed to process with OpenAI.")
else:
    print("\nOpenAI Output:")
    print(json.dumps(openai_output.dict(), indent=4))


Processing with OpenAI API...

OpenAI Output:
{
    "title": "Benefits of Cloud Computing",
    "summary": "Cloud computing offers flexible and on-demand access to resources such as servers, storage, and applications, leading to increased scalability and cost efficiency for companies.",
    "key_points": [
        "Flexible resources over the internet",
        "On-demand access to servers, storage, and applications",
        "Scalability and cost efficiency for companies"
    ],
    "tags": [
        "cloud computing",
        "flexible resources",
        "scalability",
        "cost efficiency"
    ]
}


In [None]:
# Configure Hugging Face Local Model
# Prompt for the Hugging Face access token without echoing it. The value is kept
# at module level because load_local_model() reads `hf_token` when it fetches
# the tokenizer and the model.
hf_token = getpass("Enter your Hugging Face token: ")

# Define local model loading function
def load_local_model(model_name="meta-llama/Llama-2-7b-chat-hf"):
    """Load a causal language model and its tokenizer from the Hugging Face Hub.

    Authentication uses the module-level ``hf_token``. It is passed through the
    ``token`` keyword, which replaces the deprecated ``use_auth_token`` argument
    in current Transformers releases.

    Args:
        model_name: Hub identifier of the checkpoint to load.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is loaded in float16 with
        ``device_map="auto"``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=hf_token,
        device_map="auto",
        torch_dtype=torch.float16
    )
    return tokenizer, model

# Load local model
# Binds the module-level names `tokenizer` and `model`; later cells in this
# notebook assign to the same two names again.
print("Loading local model...")
tokenizer, model = load_local_model()


Enter your Hugging Face token: ··········
Loading local model...




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
!pip install transformers pydantic
!pip install bitsandbytes
!pip install pydantic


Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pydantic import BaseModel, ValidationError
import json
import re

# Define the Pydantic model for JSON validation
class ProcessedData(BaseModel):
    """Pydantic schema for the JSON object the language model is asked to produce.

    Note: a class with this name is also defined in other cells of the notebook;
    whichever definition was executed last is the one in effect.
    """
    title: str  # brief title summarizing the text
    summary: str  # concise summary of the text
    key_points: list[str]  # key points extracted from the text
    tags: list[str]  # relevant tags for the text

# Fetch the Flan-T5 checkpoint and its tokenizer
model_name = "google/flan-t5-small"  # small variant: smaller and faster
print("Loading Flan-T5 model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Define processing function with Flan-T5
def process_with_flan_t5(raw_text):
    """Ask the loaded Flan-T5 model to turn ``raw_text`` into a ``ProcessedData`` record.

    Builds an instruction prompt that includes an example JSON layout, samples a
    reply using the module-level ``tokenizer`` and ``model``, takes the span from
    the first ``{`` to the last ``}`` of the reply and validates it.

    Args:
        raw_text: The text to summarise.

    Returns:
        The validated ``ProcessedData`` instance, or ``None`` when no JSON span is
        found, validation fails, or any other exception is raised (the reason is
        printed).
    """
    prompt = f"""
    You are a data assistant. Convert the given text into a JSON object with these fields:
    - "title": A brief title summarizing the text.
    - "summary": A concise summary of the text.
    - "key_points": A list of key points extracted from the text.
    - "tags": Relevant tags for the text.

    Example format:
    {{
        "title": "example title",
        "summary": "example summary",
        "key_points": ["point1", "point2"],
        "tags": ["tag1", "tag2"]
    }}

    Text: "{raw_text}"

    JSON:
    """
    try:
        # Encode the prompt and keep the tensors on the CPU
        encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
        encoded = encoded.to("cpu")
        generated = model.generate(**encoded, max_length=300, temperature=0.7, do_sample=True)

        # Turn the first generated sequence back into text
        reply = tokenizer.decode(generated[0], skip_special_tokens=True)
        print("\nRaw Model Output:\n", reply)  # Debugging

        # Greedy match: first opening brace through the last closing brace
        found = re.search(r"\{.*\}", reply, re.DOTALL)
        if found is None:
            raise ValueError("JSON structure not found in the model output.")

        candidate = found.group()
        print("\nExtracted JSON:\n", candidate)  # Debugging

        # Let the schema decide whether the candidate is acceptable
        return ProcessedData.parse_raw(candidate)
    except ValidationError as err:
        print("Validation Error:", err)
    except Exception as err:
        print("Processing Error:", err)
    return None

# Sample passage used as input for the Flan-T5 run
raw_text = """
Cloud computing provides flexible resources over the internet, enabling on-demand access to servers, storage, and applications.
Companies adopt it for scalability and cost efficiency.
"""

# Send the passage through the Flan-T5 path and report the outcome
print("Processing with Flan-T5...")
flan_t5_output = process_with_flan_t5(raw_text)
if not flan_t5_output:
    print("Failed to process with Flan-T5.")
else:
    print("\nFlan-T5 Output:")
    print(json.dumps(flan_t5_output.dict(), indent=4))


Loading Flan-T5 model and tokenizer...
Processing with Flan-T5...

Raw Model Output:
 "cloud computing provides flexible resources over the internet, enabling on-demand access to servers, storage, and applications. Companies adopt it for scalability and cost efficiency."
Processing Error: JSON structure not found in the model output.
Failed to process with Flan-T5.


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from pydantic import BaseModel, ValidationError
import json

# Define the Pydantic model for JSON validation
class ProcessedData(BaseModel):
    """Pydantic schema for the JSON object the language model is asked to produce.

    Note: a class with this name is also defined in other cells of the notebook;
    whichever definition was executed last is the one in effect.
    """
    title: str  # brief title summarizing the text
    summary: str  # concise summary
    key_points: list[str]  # list of key points
    tags: list[str]  # relevant tags

# Load the LLaMA model and tokenizer
print("Loading LLaMA model and tokenizer...")
# Go through the notebook's loader instead of calling from_pretrained directly:
# it requests the same checkpoint with the same device map and dtype, and also
# passes the Hugging Face token entered earlier, which the direct calls omitted.
tokenizer, model = load_local_model()

# Define local LLaMA processing function
def process_with_llama(raw_text, max_new_tokens=300):
    """Convert ``raw_text`` into a validated ``ProcessedData`` using the local LLaMA model.

    Uses the module-level ``tokenizer`` and ``model``. Only the newly generated
    tokens are decoded: a causal LM returns the prompt followed by its
    continuation, and the prompt itself contains a ``{ ... }`` template that
    would otherwise be picked up as the "JSON" answer.

    Args:
        raw_text: The text to summarise.
        max_new_tokens: Upper bound on generated tokens. This budget excludes the
            prompt, so a long prompt can no longer use up the whole limit.

    Returns:
        A validated ``ProcessedData`` instance, or ``None`` when no JSON object is
        found in the continuation, validation fails, or any other exception is
        raised (the reason is printed).
    """
    prompt = f"""
    You are a data processing assistant. Your task is to transform the given text into JSON format with the following fields:
    - "title": A brief title summarizing the text.
    - "summary": A concise summary.
    - "key_points": A list of key points.
    - "tags": Relevant tags.

    Text: "{raw_text}"

    Please generate JSON in the format:
    {{
        "title": "...",
        "summary": "...",
        "key_points": ["...", "..."],
        "tags": ["...", "..."]
    }}
    """
    try:
        # Tokenize input and move it to whatever device the model was placed on
        # (the model is loaded with device_map="auto", so "cuda" is not guaranteed).
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        # do_sample=True is required for `temperature` to have any effect.
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7
        )

        # Skip the echoed prompt; decode only the continuation
        generated_tokens = outputs[0][prompt_length:]
        output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
        print("\nRaw Model Output:\n", output_text)  # Debugging

        # Extract JSON from the output text: first '{' through the last '}'
        json_start = output_text.find('{')
        json_end = output_text.rfind('}') + 1
        if json_start == -1 or json_end <= json_start:
            print("Error: JSON structure not found in model output.")
            return None

        json_data = output_text[json_start:json_end]
        print("\nExtracted JSON:\n", json_data)  # Debugging

        # Validate the JSON structure
        return ProcessedData.parse_raw(json_data)
    except ValidationError as e:
        print("Validation Error:", e)
        return None
    except Exception as e:
        print("Processing Error:", e)
        return None

# Sample passage used as input for the local LLaMA run
raw_text = """
Cloud computing provides flexible resources over the internet, enabling on-demand access to servers, storage, and applications.
Companies adopt it for scalability and cost efficiency.
"""

# Send the passage through the local LLaMA path and report the outcome
print("Processing with Local LLaMA Model...")
llama_output = process_with_llama(raw_text)
if not llama_output:
    print("Failed to process with Local LLaMA Model.")
else:
    print("\nLocal LLaMA Model Output:")
    print(json.dumps(llama_output.dict(), indent=4))


Loading LLaMA model and tokenizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Processing with Local LLaMA Model...


KeyboardInterrupt: 

In [None]:
# Compare outputs from OpenAI and Local Model
# The local result is `llama_output`, the name assigned by the LLaMA cell; the
# previous `local_model_output` was never assigned anywhere and raised NameError.
if openai_output and llama_output:
    comparison = {
        "OpenAI Output": openai_output.dict(),
        "Local Model Output": llama_output.dict()
    }
    print("\nComparison:")
    print(json.dumps(comparison, indent=4))
else:
    print("Comparison not possible. One or both outputs are missing.")


# Comparison

In [None]:
def compare_models(raw_text):
    """Run ``raw_text`` through the OpenAI API and the local LLaMA model.

    The local path calls ``process_with_llama``, which uses the tokenizer and
    model already loaded at module level. The previous version called
    ``process_with_local_model``, a name not defined in this notebook, and
    reloaded the checkpoint on every call.

    Args:
        raw_text: The text both back ends should convert to structured JSON.

    Returns:
        A dict with the keys ``"OpenAI Output"`` and ``"Local Model Output"``.
        Each value is the model's result as a plain dict, or the string
        ``"Error"`` when that back end produced no valid result.
    """
    # OpenAI API
    print("Processing with OpenAI API...")
    openai_output = process_with_openai(raw_text)

    # Local Model
    print("Processing with Local Model...")
    local_output = process_with_llama(raw_text)

    # Compare Outputs
    comparison = {
        "OpenAI Output": openai_output.dict() if openai_output else "Error",
        "Local Model Output": local_output.dict() if local_output else "Error"
    }

    return comparison


# 9. Run the Pipeline

In [None]:

# Sample passage passed to compare_models().
raw_text = """
Cloud computing provides flexible resources over the internet, enabling on-demand access to servers, storage, and applications.
Companies adopt it for scalability and cost efficiency. """

# `result` stays at module level; the cell under "Save Result" writes it to disk.
result = compare_models(raw_text)
print(json.dumps(result, indent=4))




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Processing with OpenAI API...
API Error: The model `text-davinci-003` has been deprecated, learn more here: https://platform.openai.com/docs/deprecations
Processing with Local Model...


KeyboardInterrupt: 

# Save Result

In [None]:
# Serialize the comparison dictionary as indented JSON and write it to a file
# in the current working directory.
with open("comparison_results.json", "w") as out_file:
    out_file.write(json.dumps(result, indent=4))
print("Results saved to comparison_results.json")
