In [1]:
import urllib.request
import json

def query_model(prompt, model="llama3.2:3b", url="http://localhost:11434/api/chat", role="user"):
    # Create the data payload as a dictionary
    data = {
        "model": model,
        "top_p": 1,
        "stream": False,
        "format": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "text1": {"type": "string", "maxLength": 150},
                    "text2": {"type": "string", "maxLength": 150},
                    "paraphrase": {"type": "boolean"}
                },
                "required": ["text1", "text2", "paraphrase"],
                "additionalProperties": False
            }
        },
        
        "messages": [
            {"role": role, "content": prompt}
        ]
    }

    # Convert the dictionary to a JSON formatted string and encode it to bytes
    payload = json.dumps(data).encode("utf-8")

    # Create a request object, setting the method to POST and adding necessary headers
    request = urllib.request.Request(url, data=payload, method="POST")
    request.add_header("Content-Type", "application/json")

    # Send the request and capture the response
    response_data = ""
    with urllib.request.urlopen(request) as response:
        # Read and decode the response
        while True:
            line = response.readline().decode("utf-8")
            if not line:
                break
            response_json = json.loads(line)
            response_data += response_json["message"]["content"]

    return response_data

In [2]:
query = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
query = "what is the color of the sky at different times of day, respond in json"
result = query_model(query, role="assistant")
print(result)

[]
  		     			  		  		


In [3]:
template = "Your task is to generate dataset for an ai for paraphrase detection system that when given two text1 and text2 tells similarity between the two in english only. so write 10 such examples in json format like {'text1': 'today is monday', 'text2':'it is monday today', 'paraphrase': true} "
result = query_model(template)
print(result)

[{ "text1": "The company will hold an annual meeting.", "text2": "The company will have its yearly gathering.", "paraphrase": true }, { "text1": "I love to play football.", "text2": "I am passionate about soccer.", "paraphrase": false }, { "text1": "He is going to the store.", "text2": "He is off to purchase something.", "paraphrase": true }, { "text1": "The new policy has been implemented.", "text2": "A new law has taken effect today.", "paraphrase": true }, { "text1": "I am a software engineer.", "text2": "I design and develop software.", "paraphrase": true }, { "text1": "The sun is shining brightly.", "text2": "It's a beautiful day outside.", "paraphrase": true }, { "text1": "She has been studying English for three years.", "text2": "She has been learning English for years.", "paraphrase": true }, { "text1": "The project is expected to be completed by the end of next month.", "text2": "We anticipate finishing the project by the last day of next month.", "paraphrase": true }, { "text

In [4]:
import json
from datetime import datetime

def generate_and_save_paraphrase_data(file_path="paraphrase_data.json"):
    template = """Your task is to generate dataset for an AI paraphrase detection system. 
    Create 10 examples in JSON format like {'text1': '...', 'text2': '...', 'paraphrase': true/false}.
    Include both positive and negative examples (some non-paraphrases)."""
    
    # Get raw response from LLM
    result = query_model(template)
    
    try:
        # Extract JSON array from markdown response
        json_start = result.find('[')
        json_end = result.rfind(']') + 1
        json_data = json.loads(result[json_start:json_end])
        
        # Create structured format with metadata
        structured_data = {
            "metadata": {
                "created_at": datetime.now().isoformat(),
                "task": "paraphrase_detection",
                "version": "1.0"
            },
            "dataset": json_data
        }
        
        # Save to file
        with open(file_path, 'w') as f:
            json.dump(structured_data, f, indent=4)
            
        print(f"Data successfully saved to {file_path}")
        
    except (json.JSONDecodeError, ValueError) as e:
        print(f"Error parsing response: {e}")
        print("Raw response:")
        print(result)

# Example usage
generate_and_save_paraphrase_data()

Data successfully saved to paraphrase_data.json


In [7]:
import json
import time
import os
import re
from tqdm.notebook import tqdm

class ParaphraseDataGenerator:
    def __init__(self, target_samples=100000, batch_size=200):
        self.target_samples = target_samples
        self.batch_size = batch_size
        self.output_file = "paraphrase_data.jsonl"
        self.checkpoint_file = "checkpoint.txt"
        self.current_count = 0
        self._load_checkpoint()
        
    def _generate_batch(self):
        prompt = f"""Generate {self.batch_size} diverse paraphrase pairs in english only with:
        - Strict JSON format ONLY, no extra text
        - All boolean values as true/false (lowercase)
        - Properly escaped quotes
        - No trailing commas
        Return ONLY this format:
        [{{"text1": "...", "text2": "...", "paraphrase": bool}}]"""
        
        retries = 5  # Increased retries
        for attempt in range(retries):
            try:
                response = query_model(prompt)
                # print("Raw response:", response)  # Debug logging
                
                # Enhanced JSON extraction
                cleaned_response = self._clean_response(response)
                return self._safe_json_parse(cleaned_response)
                
            except (json.JSONDecodeError, ValueError) as e:
                print(f"Attempt {attempt+1}/{retries} failed: {str(e)}")
                time.sleep(2 ** attempt)  # Exponential backoff
        raise Exception(f"Failed after {retries} attempts. Last response: {response[:200]}")

    def _clean_response(self, response):
        # Remove markdown code blocks
        response = re.sub(r'```json\n?|```', '', response)
        
        # Remove non-JSON characters before first [ and after last ]
        json_start = response.find('[')
        json_end = response.rfind(']') + 1
        if json_start == -1 or json_end == 0:
            raise ValueError("No JSON array found in response")
            
        return response[json_start:json_end]

    def _safe_json_parse(self, json_str):
        # First try parsing as normal
        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            print(f"Standard parse failed, trying recovery: {e}")

        # If that fails, try finding individual JSON objects
        objects = []
        decoder = json.JSONDecoder()
        offset = 0
        
        # Remove whitespace and problematic characters
        json_str = json_str.strip().replace('\n', ' ').replace('\t', ' ')
        
        while offset < len(json_str):
            try:
                # Skip commas between objects
                while offset < len(json_str) and json_str[offset] in (' ', ',', '\n', '\r', '\t'):
                    offset += 1
                
                if offset >= len(json_str):
                    break

                obj, offset = decoder.raw_decode(json_str, idx=offset)
                if isinstance(obj, dict):
                    objects.append(obj)
                elif isinstance(obj, list):
                    objects.extend(obj)
                    
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON at position {offset}: {e}")
                # Try to find next valid object
                next_brace = json_str.find('{', offset + 1)
                if next_brace == -1:
                    break
                offset = next_brace
                
        # Validate objects
        valid_objects = []
        for obj in objects:
            if all(key in obj for key in ['text1', 'text2', 'paraphrase']):
                # Fix common boolean issues
                if isinstance(obj['paraphrase'], str):
                    obj['paraphrase'] = obj['paraphrase'].lower() == 'true'
                valid_objects.append(obj)
        
        if not valid_objects:
            raise ValueError("No valid objects found after parsing")
            
        print(f"Recovered {len(valid_objects)} valid objects from batch")
        return valid_objects

    def _load_checkpoint(self):
        if os.path.exists(self.checkpoint_file):
            with open(self.checkpoint_file, 'r') as f:
                self.current_count = int(f.read().strip())
            print(f"Resuming from checkpoint: {self.current_count}/{self.target_samples}")

    def _save_checkpoint(self):
        with open(self.checkpoint_file, 'w') as f:
            f.write(str(self.current_count))

    def _save_batch(self, batch):
        with open(self.output_file, 'a') as f:
            for item in batch:
                f.write(json.dumps(item) + '\n')

    def generate(self):
        pbar = tqdm(total=self.target_samples, initial=self.current_count)
        
        while self.current_count < self.target_samples:
            batch = self._generate_batch()
            valid_batch = [item for item in batch if all(k in item for k in ['text1', 'text2', 'paraphrase'])]
            
            self._save_batch(valid_batch)
            self.current_count += len(valid_batch)
            
            # Update progress and checkpoint
            pbar.update(len(valid_batch))
            self._save_checkpoint()
            
            # Rate limiting
            time.sleep(1.5)  # Adjust based on API limits
            
            # Memory management
            if self.current_count % 10000 == 0:
                print(f"Intermediate checkpoint: {self.current_count} samples")
                
        pbar.close()
        print(f"Completed generating {self.current_count} samples")



In [8]:
# Test with 100 samples first
test_generator = ParaphraseDataGenerator(target_samples=10000, batch_size=10)
test_generator.generate()

Resuming from checkpoint: 1001/10000


 10%|#         | 1001/10000 [00:00<?, ?it/s]

Completed generating 10001 samples


In [9]:
def inspect_data(num_samples=5):
    with open('paraphrase_data.jsonl', 'r') as f:
        for _ in range(num_samples):
            print(json.loads(f.readline()))
inspect_data()

{'text1': 'The company has decided to increase salaries.', 'text2': 'The company will boost its pay.', 'paraphrase': True}
{'text1': 'This is a very long text that needs to be rewritten.', 'text2': 'We need to shorten this lengthy sentence.', 'paraphrase': False}
{'text1': 'The new employee was struggling with the workload.', 'text2': 'The new guy was overwhelmed by his tasks.', 'paraphrase': True}
{'text1': 'It is essential that you complete your homework on time.', 'text2': 'Make sure to finish your assignments ahead of schedule.', 'paraphrase': True}
{'text1': 'The city has implemented a new recycling program.', 'text2': 'We are introducing an eco-friendly waste collection system.', 'paraphrase': True}


In [11]:
!pip install -U -q sentence-transformers


[notice] A new release of pip is available: 24.1.2 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
from sentence_transformers import SentenceTransformer, util
sentences = ["This is an apple", "This is a cat"]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences, convert_to_tensor=True)
sim = util.pytorch_cos_sim(embeddings[0], embeddings[1])
print(sim)


tensor([[0.4559]], device='cuda:0')


In [20]:
import pandas as pd
data = pd.read_json("paraphrase_data.jsonl")
data.head()

ValueError: Trailing data

In [16]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(
        self,
        hidden_dim,
        num_heads
    ):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.head_dim = hidden_dim//num_heads
        self.wq = nn.Linear(hidden_dim, hidden_dim)
        self.wk = nn.Linear(hidden_dim, hidden_dim)
        self.wv = nn.Linear(hidden_dim, hidden_dim)
        self.wo = nn.Linear(hidden_dim, hidden_dim)
        
    def forward(self, x):
        b, s, d = x.shape
        q = self.wq(x)
        k = self.wk(x)
        v = self.wv(x)

        q = q.view(b, s, self.num_heads, self.head_dim)
        k = k.view(b, s, self.num_heads, self.head_dim)
        v = v.view(b, s, self.num_heads, self.head_dim)

        q = q.transpose(1,2)
        k = k.transpose(1,2)
        v = v.transpose(1,2)

        atten_score = torch.softmax((q@k.transpose(2,3))/(self.head_dim**0.5), dim=-1)
        o = atten_score@v
        o = o.transpose(1,2).contiguous().view(b, s, d)
        return self.wo(o)
        
class EncoderBlock(nn.Module):
    def __init__(
        self,
        hidden_dim,
        num_heads,
    ):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.layernorm1 = nn.LayerNorm(hidden_dim)
        self.layernorm2 = nn.LayerNorm(hidden_dim)
        self.mha = MultiHeadAttention(hidden_dim, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, 4*hidden_dim),
            nn.GELU(),
            nn.Linear(4*hidden_dim, hidden_dim)
        )

    def forward(self, x):
        x = self.layernorm1(x)
        x = self.mha(x)
        x = self.ffn(x)
        x = self.layernorm2(x)
        return x
        
class Model(nn.Module):
    def __init__(
        self,
        vocab_size,
        hidden_dim,
        num_heads,
        num_layers,
    ):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size,hidden_dim)
        self.layers = nn.ModuleList([EncoderBlock(self.hidden_dim, self.num_heads) for _ in range(num_layers)])
        self.out = nn.Linear(self.hidden_dim, self.vocab_size)
        
    def forward(self, x):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x)
        x = self.out(x)
        return x
        

In [17]:
x = torch.randint(0, 80, (16, 16))
model = Model(100, 64, 4, 4)
x = model(x)
x.shape

torch.Size([16, 16, 100])

In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# hyper params
epochs = 100
batch_size = 64
lr = 0.001
vocab_size = 1000
hidden_dim = 64
num_heads = 4
num_layers = 4
model = Model(vocab_size, hidden_dim, num_heads, num_layers)

optimizer = optim.Adam(model.parameters(), lr=lr)
loss = nn.CrossEntropy()
dataloader = DataLoader()

for epoch in range(epochs):
    running_loss = 0
    n = len(dataloader)
    for i, (x, y) in enumerate(dataloader):
        x = x.to(device)
        y = y.to(device)
        logits = model(x)
        l = loss(logits, y)
        l.backward()
        optimizer.step()
        optimizer.zero_grad()
        running_loss+=l.item()
        
    print(f"Epoch : {epoch} || Loss : {l.item()}")
        