In [5]:
import os

def split_file(file_path, output_dir, chunk_size):
    """
    Splits a file into sequentially numbered chunk files.

    The source file is read in binary mode, ``chunk_size`` bytes at a time, and
    each piece is written to ``<output_dir>/chunk_<n>.big`` with ``n`` starting
    at 1. Progress is printed for every chunk. Errors are reported with
    ``print`` instead of being raised.

    Args:
        file_path (str): Path to the file to be split.
        output_dir (str): Directory where the chunks will be saved. It is
            created if it does not exist.
        chunk_size (int): Size of each chunk in bytes. Must be greater than 0.

    Returns:
        None
    """
    try:
        # read(0) returns b"" immediately (zero chunks, yet "success") and a
        # negative size reads the whole file into memory as a single chunk,
        # so both are rejected up front.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive number of bytes, got {chunk_size!r}")

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_dir, exist_ok=True)

        chunk_count = 0
        with open(file_path, 'rb') as file:
            # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
            for chunk in iter(lambda: file.read(chunk_size), b""):
                chunk_count += 1
                chunk_file_path = os.path.join(output_dir, f"chunk_{chunk_count}.big")
                with open(chunk_file_path, 'wb') as chunk_file:
                    chunk_file.write(chunk)

                print(f"Created: {chunk_file_path}")

        print(f"File split successfully into {chunk_count} chunks!")
    except Exception as e:
        print(f"An error occurred during file splitting: {e}")

if __name__ == "__main__":
    # Input file to split and the directory that will receive the chunks.
    # NOTE(review): both are absolute, machine-specific paths; adjust them
    # before running this cell on another machine.
    file_to_split = "/Users/roshin/Documents/Slides/templates/lama/model-00001-of-00002.safetensors"
    output_directory = "/Users/roshin/Documents/Slides/templates/lama/output"
    
    # Chunk size in bytes: 400 * 1024 * 1024 bytes = 400 MiB per chunk.
    chunk_size_in_bytes = 400 * 1024 * 1024  # 400 MiB
    
    # Split the file; split_file prints its own progress and error messages.
    split_file(file_to_split, output_directory, chunk_size_in_bytes)


Created: /Users/roshin/Documents/Slides/templates/lama/output/chunk_1.big
Created: /Users/roshin/Documents/Slides/templates/lama/output/chunk_2.big
Created: /Users/roshin/Documents/Slides/templates/lama/output/chunk_3.big
Created: /Users/roshin/Documents/Slides/templates/lama/output/chunk_4.big
Created: /Users/roshin/Documents/Slides/templates/lama/output/chunk_5.big
Created: /Users/roshin/Documents/Slides/templates/lama/output/chunk_6.big
Created: /Users/roshin/Documents/Slides/templates/lama/output/chunk_7.big
Created: /Users/roshin/Documents/Slides/templates/lama/output/chunk_8.big
Created: /Users/roshin/Documents/Slides/templates/lama/output/chunk_9.big
Created: /Users/roshin/Documents/Slides/templates/lama/output/chunk_10.big
Created: /Users/roshin/Documents/Slides/templates/lama/output/chunk_11.big
Created: /Users/roshin/Documents/Slides/templates/lama/output/chunk_12.big
Created: /Users/roshin/Documents/Slides/templates/lama/output/chunk_13.big
File split successfully into 13 ch

In [2]:
import os

def merge_chunks(output_file_path, chunks_dir):
    """
    Merges numbered chunk files back into a single file.

    Chunk files are the entries of ``chunks_dir`` whose name starts with
    ``chunk_`` and ends in a number, with or without a file extension (for
    example ``chunk_3`` or ``chunk_3.big``). They are concatenated in ascending
    numeric order. Entries whose suffix is not a number are ignored. Errors are
    reported with ``print`` instead of being raised.

    Args:
        output_file_path (str): Path where the merged file will be saved.
        chunks_dir (str): Directory containing the chunk files to merge.

    Returns:
        None
    """
    copy_block_size = 16 * 1024 * 1024  # bytes copied per read while streaming a chunk

    try:
        numbered_chunks = []
        for name in os.listdir(chunks_dir):
            if not name.startswith("chunk_"):
                continue
            # Drop an optional extension so "chunk_3.big" is ordered as chunk 3.
            stem = os.path.splitext(name)[0]
            try:
                number = int(stem.split('_')[-1])
            except ValueError:
                # Not a numbered chunk (e.g. "chunk_list.txt"); leave it out.
                continue
            numbered_chunks.append((number, name))

        # Fail before opening the output so an empty or wrong directory does
        # not leave a zero-byte "merged" file behind.
        if not numbered_chunks:
            raise FileNotFoundError(f"no chunk files found in {chunks_dir}")

        numbered_chunks.sort()  # numeric order; the name breaks ties deterministically

        # Merge all chunks into the output file
        with open(output_file_path, 'wb') as output_file:
            for _, chunk_file in numbered_chunks:
                chunk_file_path = os.path.join(chunks_dir, chunk_file)
                with open(chunk_file_path, 'rb') as cf:
                    # Stream in bounded blocks instead of loading a whole chunk.
                    for block in iter(lambda: cf.read(copy_block_size), b""):
                        output_file.write(block)
                print(f"Merged: {chunk_file_path}")

        print(f"File merged successfully into: {output_file_path}")
    except Exception as e:
        print(f"An error occurred during file merging: {e}")

if __name__ == "__main__":
    # Directory holding the chunk files and the path of the file to rebuild.
    # NOTE(review): both are absolute, machine-specific paths; adjust them
    # before running this cell on another machine.
    chunks_directory = "/Users/roshin/Documents/Slides/templates/mlx/output"
    merged_file_path = "/Users/roshin/Documents/Slides/templates/mlx/model_merged new.safetensors"
    
    # Merge the chunks; merge_chunks prints its own progress and error messages.
    merge_chunks(merged_file_path, chunks_directory)


Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_1
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_2
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_3
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_4
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_5
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_6
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_7
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_8
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_9
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_10
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_11
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_12
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_13
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_14
Merged: /Users/roshin/Documents/Slides/temp

In [None]:
from mlx import MLXModel, MLXTokenizer

def load_mlx_llm_model(model_path):
    """
    Load a local MLX LLM model together with its tokenizer.

    Both objects are built with ``from_pretrained(model_path)``. A failure in
    either step is printed and signalled by returning ``(None, None)`` instead
    of raising.

    NOTE(review): this relies on ``MLXModel`` and ``MLXTokenizer`` being
    importable from ``mlx`` -- confirm that the installed package provides them.

    Args:
        model_path (str): Path to the directory containing the MLX model files.

    Returns:
        tuple: ``(model, tokenizer)`` on success, ``(None, None)`` on failure.
    """
    loaded = (None, None)
    try:
        llm = MLXModel.from_pretrained(model_path)
        llm_tokenizer = MLXTokenizer.from_pretrained(model_path)
        print("MLX LLM model and tokenizer loaded successfully!")
        loaded = (llm, llm_tokenizer)
    except Exception as load_error:
        print(f"Error loading MLX LLM model: {load_error}")
    return loaded

def generate_response(prompt, model, tokenizer, max_length=100):
    """
    Produce a sampled completion for ``prompt``.

    The prompt is encoded with ``tokenizer.encode``, passed to
    ``model.generate`` with fixed sampling settings, and the first returned
    sequence is decoded with special tokens skipped. Any exception is printed
    and turned into a ``None`` result.

    Args:
        prompt (str): The input prompt text.
        model: Object exposing ``generate(inputs, **options)``.
        tokenizer: Object exposing ``encode`` and ``decode``.
        max_length (int): Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        str: The decoded response text, or None if any step failed.
    """
    # Sampling configuration forwarded unchanged to model.generate.
    generation_options = {
        "max_length": max_length,
        "num_return_sequences": 1,
        "no_repeat_ngram_size": 2,
        "temperature": 0.7,
        "top_k": 50,
        "top_p": 0.95,
        "do_sample": True,
    }

    try:
        token_ids = tokenizer.encode(prompt, return_tensors="pt")
        generated = model.generate(token_ids, **generation_options)
        # Only one sequence is requested, so the first entry is the response.
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as generation_error:
        print(f"Error generating response: {generation_error}")
        return None

if __name__ == "__main__":
    # Path to the local MLX model directory
    model_directory = "./mlx_llm_model"  # Replace with the actual path to your model
    
    # Load the MLX LLM model and tokenizer; both are None if loading failed.
    mlx_model, mlx_tokenizer = load_mlx_llm_model(model_directory)
    
    # Only attempt generation when both objects were loaded.
    if mlx_model and mlx_tokenizer:
        # Example prompt
        example_prompt = "What are the applications of large language models?"
        
        # Generate a response (uses the default max_length of generate_response)
        response = generate_response(example_prompt, mlx_model, mlx_tokenizer)
        
        # generate_response returns None on failure, so nothing is printed then.
        if response:
            print("Generated Response:")
            print(response)


In [None]:
from mlx_lm import load, generate
# Identifier of the model handed to mlx_lm's load().
model_path = "mlx-community/Mistral-7B-Instruct-v0.2-4bit"
# NOTE(review): prompt_builder is not defined in any visible cell -- it is
# presumably provided elsewhere; confirm it is defined before this cell runs.
prompt = prompt_builder("Great content, thank you!")
max_tokens = 140

# Load through model_path so the identifier is defined in exactly one place.
model, tokenizer = load(model_path)
response = generate(model, tokenizer, prompt=prompt, max_tokens=max_tokens, verbose=True)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from urllib.parse import urljoin

# Step 1: Web Crawler to Scrape Links and Sub-links
def crawl_website(base_url, max_depth=2):
    """
    Crawl a website breadth-first and collect the paragraph text of each page.

    Starting at ``base_url`` (depth 0), pages are fetched level by level up to
    ``max_depth`` link hops away. Breadth-first order ensures every page is
    processed at its smallest depth, so its links are still followed when the
    depth limit allows it. A link is followed only when it is an http(s) URL on
    the same host as ``base_url`` and its path starts with the path of
    ``base_url``. URL fragments (``#...``) are dropped before de-duplication so
    the same document is not fetched twice.

    Args:
        base_url (str): URL where crawling starts; also defines the crawl scope.
        max_depth (int): Maximum number of link hops from ``base_url``.

    Returns:
        list[dict]: One ``{"url": ..., "content": ...}`` entry for every
        fetched page (status 200) that contained non-empty ``<p>`` text.
        Pages that fail to load or parse are reported with ``print`` and skipped.
    """
    from collections import deque
    from urllib.parse import urldefrag, urlparse

    visited_urls = set()
    content_data = []

    base_parts = urlparse(base_url)
    base_host = base_parts.netloc.lower()
    base_path = base_parts.path.rstrip('/')

    def is_in_scope(candidate_url):
        """Return True when candidate_url is on the base host and under the base path."""
        parts = urlparse(candidate_url)
        # Comparing parsed components (not a substring test) rejects off-site
        # URLs that merely embed base_url, e.g. in a query string.
        return (
            parts.scheme in ("http", "https")
            and parts.netloc.lower() == base_host
            and parts.path.startswith(base_path)
        )

    def scrape_page(url):
        """Fetch one page, record its paragraph text, and return its in-scope links."""
        print(f"Scraping: {url}")
        try:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                return []

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract text content
            page_text = " ".join(p.get_text() for p in soup.find_all('p'))
            if page_text.strip():
                content_data.append({"url": url, "content": page_text.strip()})

            # Resolve every link against the current page and drop its fragment.
            in_scope_links = []
            for link in soup.find_all('a', href=True):
                absolute_link = urldefrag(urljoin(url, link['href'])).url
                if is_in_scope(absolute_link):
                    in_scope_links.append(absolute_link)
            return in_scope_links
        except Exception as e:
            print(f"Failed to scrape {url}: {e}")
            return []

    if max_depth < 0:
        # Even the start page (depth 0) is beyond the limit.
        return content_data

    start_url = urldefrag(base_url).url
    visited_urls.add(start_url)
    pending = deque([(start_url, 0)])

    while pending:
        url, depth = pending.popleft()
        links = scrape_page(url)
        if depth >= max_depth:
            # Pages at the depth limit are scraped but their links are not followed.
            continue
        for link in links:
            if link not in visited_urls:
                # Mark at enqueue time so a URL is never queued twice.
                visited_urls.add(link)
                pending.append((link, depth + 1))

    return content_data

# Step 2: Save Data in LLM Training-Friendly Format
def save_data_to_llm_format(data, output_file="website_data.jsonl"):
    """
    Write ``data`` to ``output_file`` as JSON Lines (UTF-8 encoded).

    Every entry of ``data`` becomes one line holding one JSON object, e.g.:
    {
        "url": "page_url",
        "content": "text_content"
    }
    An existing file at ``output_file`` is overwritten.
    """
    import json

    with open(output_file, 'w', encoding='utf-8') as sink:
        # One serialized record per line, each terminated by a newline.
        sink.writelines(json.dumps(record) + '\n' for record in data)

# Example usage
if __name__ == "__main__":
    base_url = "https://example.com"  # Replace with the website you want to scrape
    max_depth = 2  # Define the depth of crawling

    # Crawl the website; the printed count is the number of entries returned.
    crawled_data = crawl_website(base_url, max_depth=max_depth)
    print(f"Scraped {len(crawled_data)} pages.")

    # Save data in JSONL format (the file name is repeated in the message below).
    save_data_to_llm_format(crawled_data, output_file="training_data.jsonl")
    print("Data saved in LLM training-friendly format: training_data.jsonl")


In [None]:
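To implement embeddings and semantic search locally, you can follow the steps below. Note that although mlx-lm appears in the install command, the script itself does not use it: it uses sentence-transformers (the all-MiniLM-L6-v2 model) to create embeddings for Java, shell script, and SQL source files, stores them in an in-memory FAISS index, and allows semantic search over them.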

Prerequisites

	1.	Install the required libraries:

pip install mlx-lm sentence-transformers faiss-cpu


	2.	Ensure your local environment has access to the source code files.

Script: Embedding and Semantic Search

import os
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Initialize the model for embeddings.
# This is created when the script/cell is executed, before any function below is called.
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight model for embedding

# Define directories and file types; both are passed to extract_files in the main block.
source_code_dir = './source_code'  # Directory containing Java, shell, SQL files
file_types = ['.java', '.sh', '.sql']  # File extensions to process

# Step 1: Extract content from source code files
def extract_files(directory, file_extensions):
    """
    Recursively collect the text of all files with a matching extension.

    Files are read as UTF-8. A file that cannot be opened or decoded is
    reported with ``print`` and skipped, so one bad file does not abort the
    whole scan.

    Args:
        directory (str): Root directory to walk.
        file_extensions (list[str]): File name suffixes to include, e.g. ``['.java']``.

    Returns:
        tuple[list[str], list[str]]: ``(file_contents, file_paths)``; the two
        lists have the same length and ``file_contents[i]`` is the text of
        ``file_paths[i]``.
    """
    file_contents = []
    file_paths = []
    wanted_suffixes = tuple(file_extensions)  # str.endswith accepts a tuple of suffixes

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(wanted_suffixes):
                continue
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except (OSError, UnicodeDecodeError) as e:
                print(f"Error reading {file_path}: {e}")
                continue
            # Append both only after a successful read so the lists stay aligned.
            file_contents.append(content)
            file_paths.append(file_path)

    return file_contents, file_paths

# Step 2: Generate embeddings for source code
def generate_embeddings(contents, model):
    """Encode ``contents`` with ``model.encode``, requesting NumPy output, and return the result."""
    embeddings = model.encode(contents, convert_to_numpy=True)
    return embeddings

# Step 3: Create FAISS index for embeddings
def create_faiss_index(embeddings):
    """
    Build a ``faiss.IndexFlatL2`` over ``embeddings`` and return it.

    The index dimension is taken from ``embeddings.shape[1]``, so the input
    needs at least two axes. All rows are added to the index before returning.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index

# Step 4: Perform semantic search
def search(query, model, index, contents, paths, top_k=5):
    """
    Return the indexed files closest to ``query``.

    The query is embedded with ``model.encode`` and looked up with
    ``index.search``. Result slots that the index could not fill are dropped,
    so fewer than ``top_k`` results are returned when the index holds fewer
    than ``top_k`` vectors.

    Args:
        query (str): Free-text search query.
        model: Embedding model exposing ``encode(texts, convert_to_numpy=True)``.
        index: Index exposing ``search(vectors, k)`` that returns ``(distances, indices)``.
        contents (list[str]): File contents, aligned with the index rows.
        paths (list[str]): File paths, aligned with the index rows.
        top_k (int): Maximum number of results to return.

    Returns:
        list[dict]: Entries with ``file_path``, ``content`` and ``distance``,
        in the order returned by the index.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for distance, position in zip(distances[0], indices[0]):
        # FAISS marks unfilled result slots with label -1; using it as a list
        # index would silently return the last file instead of "no result".
        if position < 0:
            continue
        results.append({
            "file_path": paths[position],
            "content": contents[position],
            "distance": distance
        })
    return results

# Main script execution
if __name__ == "__main__":
    print("Extracting source code...")
    file_contents, file_paths = extract_files(source_code_dir, file_types)

    if not file_contents:
        # With nothing to embed, create_faiss_index would fail on
        # embeddings.shape[1]; stop here with an actionable message instead.
        print("No files found for processing. Please check the source directory.")
    else:
        print("Generating embeddings...")
        embeddings = generate_embeddings(file_contents, model)

        print("Creating FAISS index...")
        index = create_faiss_index(embeddings)

        # Interactive query loop; typing 'exit' (any letter case) ends it.
        print("Ready for semantic search. Enter your query:")
        while True:
            query = input("Query (or type 'exit' to quit): ")
            if query.lower() == 'exit':
                break

            results = search(query, model, index, file_contents, file_paths)
            print("\nTop results:")
            for res in results:
                print(f"File: {res['file_path']}")
                print(f"Distance: {res['distance']}")
                print("Snippet:")
                # Show only the first 500 characters of each matching file.
                print(res['content'][:500], "...\n")

How It Works:

	1.	Extract Files:
	•	Scans the specified directory for .java, .sh, and .sql files.
	•	Reads and stores their contents.
	2.	Generate Embeddings:
	•	Converts each file’s content into a vector using SentenceTransformer.
	3.	Create FAISS Index:
	•	Embeddings are stored in a FAISS index for fast similarity searches.
	4.	Semantic Search:
	•	Users can input queries, and the script retrieves the top k most similar files/content snippets.

Usage

	1.	Place your source code files in the ./source_code directory.
	2.	Run the script.
	3.	Enter a query like:
	•	"Find the retry logic implementation"
	•	"How is error handling performed?"
	4.	The script will return the most relevant files and snippets.

This approach provides an efficient way to work with and analyze large codebases locally!

In [None]:
import os
import re
from mlx_lm import LMModel, Trainer, DataProcessor, Tokenizer

# Paths
SOURCE_CODE_DIR = './legacy_code'  # Root directory scanned by extract_files
MODEL_DIR = './model_output'  # Trainer output directory; the model is loaded back from here
# Name passed to both Tokenizer(...) and LMModel(model_name=...).
# NOTE(review): 'all-MiniLM-L6-v2' looks like a sentence-embedding model name --
# confirm it is a valid choice for this trainer.
MODEL_NAME = 'all-MiniLM-L6-v2'

# File types to process
FILE_TYPES = ['.java', '.sh', '.sql']


# Step 1: Extract Source Code
def extract_files(directory, file_types):
    """
    Walk ``directory`` and read every file whose name ends with one of ``file_types``.

    Files are read as UTF-8. A file that cannot be read is reported with
    ``print`` and left out. Returns ``(file_contents, file_paths)``, two lists
    of equal length where ``file_contents[i]`` is the text of ``file_paths[i]``.
    """
    wanted_suffixes = tuple(file_types)
    collected = []  # (path, text) pairs, in discovery order

    for current_dir, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            if not filename.endswith(wanted_suffixes):
                continue
            candidate = os.path.join(current_dir, filename)
            try:
                with open(candidate, 'r', encoding='utf-8') as handle:
                    text = handle.read()
            except Exception as e:
                print(f"Error reading {candidate}: {e}")
                continue
            collected.append((candidate, text))

    file_contents = [text for _, text in collected]
    file_paths = [path for path, _ in collected]
    return file_contents, file_paths


# Step 2: Preprocess Code
def preprocess_code(code_snippets):
    """
    Strip C-style comments from source snippets and normalize whitespace.

    ``// line`` and ``/* block */`` comments are each replaced by a single
    space so the tokens on either side are not fused together. Quoted string
    and character literals are matched first and kept intact, so a ``//``
    inside a literal such as ``"http://host"`` is not mistaken for a comment.
    Finally every run of whitespace is collapsed to one space and the snippet
    is trimmed.

    NOTE(review): only ``//`` and ``/* */`` are recognized; ``#`` (shell) and
    ``--`` (SQL) comments are left in place, as before.

    Args:
        code_snippets (list[str]): Raw source code strings.

    Returns:
        list[str]: The cleaned snippets, in the same order.
    """
    # Alternation order matters: literals come first so comment markers inside
    # them are consumed as part of the literal.
    string_literal = r'"(?:\\.|[^"\\])*"' + r"|'(?:\\.|[^'\\])*'"
    comment = r'//[^\n]*|/\*.*?\*/'
    token_pattern = re.compile(string_literal + '|' + comment, flags=re.DOTALL)

    def _blank_out_comment(match):
        """Replace a comment with one space; return string literals unchanged."""
        text = match.group(0)
        if text.startswith(('//', '/*')):
            return ' '
        return text

    processed_snippets = []
    for snippet in code_snippets:
        without_comments = token_pattern.sub(_blank_out_comment, snippet)
        processed_snippets.append(re.sub(r'\s+', ' ', without_comments).strip())
    return processed_snippets


# Step 3: Train the Model
def train_model(train_texts, eval_texts):
    """
    Train a model on ``train_texts``, evaluating on ``eval_texts``, then save it.

    A tokenizer and model are created from ``MODEL_NAME``, both text
    collections are wrapped in ``DataProcessor``, and a ``Trainer`` writing to
    ``MODEL_DIR`` runs ``train()`` followed by ``save_model()``. Nothing is
    returned.

    NOTE(review): ``Tokenizer``, ``DataProcessor``, ``LMModel`` and ``Trainer``
    are imported from ``mlx_lm`` at the top of this cell -- confirm that the
    installed version provides them.
    """
    tokenizer = Tokenizer(MODEL_NAME)

    # Wrap the training split first, then the evaluation split.
    train_data, eval_data = (
        DataProcessor(texts, tokenizer) for texts in (train_texts, eval_texts)
    )

    model = LMModel(model_name=MODEL_NAME)

    # Trainer configuration, kept in one place for readability.
    trainer_options = {
        "model": model,
        "tokenizer": tokenizer,
        "train_data": train_data,
        "eval_data": eval_data,
        "output_dir": MODEL_DIR,
        "epochs": 3,
        "batch_size": 8,
        "learning_rate": 5e-5,
        "save_steps": 100,
    }
    trainer = Trainer(**trainer_options)

    trainer.train()
    trainer.save_model()


# Step 4: Generate Suggestions
def generate_suggestions(input_code, model, tokenizer):
    """
    Run ``input_code`` through tokenize -> predict -> detokenize.

    The snippet is wrapped in a one-element list for ``tokenizer.tokenize``;
    the result goes to ``model.predict``, and the predictions are turned back
    into output by ``tokenizer.detokenize``, whose return value is returned.
    """
    return tokenizer.detokenize(model.predict(tokenizer.tokenize([input_code])))


# Main Script
if __name__ == "__main__":
    print("Extracting source code...")
    code_snippets, file_paths = extract_files(SOURCE_CODE_DIR, FILE_TYPES)

    # Stop early when nothing was collected; the steps below need at least one snippet.
    if not code_snippets:
        print("No files found for processing. Please check the source directory.")
        exit()

    print("Preprocessing code...")
    preprocessed_snippets = preprocess_code(code_snippets)

    # Split data for training and evaluation: the first 80% (rounded down) of
    # the snippets, in file-discovery order, are used for training and the
    # rest for evaluation. No shuffling is done, and with a single snippet
    # train_size is 0, so the training split is empty.
    train_size = int(0.8 * len(preprocessed_snippets))
    train_texts = preprocessed_snippets[:train_size]
    eval_texts = preprocessed_snippets[train_size:]

    print("Training the model...")
    train_model(train_texts, eval_texts)

    # train_model returns nothing, so the tokenizer is re-created from
    # MODEL_NAME and the model is loaded back from MODEL_DIR.
    print("Loading model and tokenizer for inference...")
    tokenizer = Tokenizer(MODEL_NAME)
    model = LMModel.load(MODEL_DIR)

    print("Generating suggestions...")
    test_snippet = "public void legacyMethod() { int x = 10; System.out.println(x); }"
    suggestions = generate_suggestions(test_snippet, model, tokenizer)
    print(f"Suggestions for modernization:\n{suggestions}")


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import os
import time
import json
from urllib.parse import urljoin, urlparse

# Initialize Safari WebDriver.
# This is created when the cell runs and is shared by the functions below.
driver = webdriver.Safari()

# Base URL to scrape; visit_and_extract also uses it as the prefix a link must
# start with in order to be followed.
BASE_URL = "https://example.com"  # Replace with the URL you want to scrape
OUTPUT_FILE = "llm_training_dataset.json"  # Destination passed to save_dataset

# Track visited URLs to avoid duplication
visited_urls = set()

# Initialize dataset: a list of {"url": ..., "content": ...} entries that
# visit_and_extract appends to.
dataset = []

def extract_content(driver):
    """
    Return the ``.text`` of the current page's ``body`` element.

    If the element lookup or the text access fails, the error is printed and
    an empty string is returned instead.
    """
    page_text = ""
    try:
        body_element = driver.find_element(By.TAG_NAME, "body")
        page_text = body_element.text
    except Exception as lookup_error:
        print(f"Error extracting content: {lookup_error}")
    return page_text

def save_dataset(dataset, output_file):
    """
    Serialize ``dataset`` to ``output_file`` as indented JSON.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are written as-is instead of being escaped. An existing file is
    overwritten.
    """
    serialized = json.dumps(dataset, ensure_ascii=False, indent=4)
    with open(output_file, "w", encoding="utf-8") as sink:
        sink.write(serialized)

def visit_and_extract(url):
    """
    Visits a URL, extracts its content, and explores its same-site links.

    Pages are visited depth-first using an explicit stack instead of
    recursion, so a long chain of pages cannot exhaust the Python call stack.
    For each page, the link targets are copied out as plain strings before the
    driver navigates anywhere else: element handles obtained from a page stop
    being usable once the driver has loaded another page.

    Uses the module-level ``driver``, ``visited_urls``, ``dataset`` and
    ``BASE_URL``. Only links whose href starts with ``BASE_URL`` are followed.
    Errors on a page are printed and that page is skipped.

    Args:
        url (str): URL to start from. Nothing happens if it was already visited.

    Returns:
        None
    """
    pending = [url]  # stack of URLs still to visit; the top is visited next

    while pending:
        current_url = pending.pop()
        if current_url in visited_urls:
            continue
        print(f"Visiting: {current_url}")
        visited_urls.add(current_url)

        try:
            driver.get(current_url)
            time.sleep(2)  # Allow time for the page to load

            # Extract content
            content = extract_content(driver)
            if content.strip():
                dataset.append({"url": current_url, "content": content})

            # Snapshot the hrefs now, while the elements still belong to the loaded page.
            hrefs = []
            for link in driver.find_elements(By.TAG_NAME, "a"):
                href = link.get_attribute("href")
                if href and href.startswith(BASE_URL):  # Ensure links are within the same domain
                    hrefs.append(href)
        except Exception as e:
            print(f"Error visiting {current_url}: {e}")
            continue

        # Push in reverse so the first link on the page is visited first,
        # matching depth-first, document-order traversal.
        pending.extend(reversed(hrefs))

# Start scraping from the base URL
try:
    visit_and_extract(BASE_URL)
finally:
    try:
        # Close the WebDriver
        driver.quit()
    finally:
        # Save in a nested finally so that a failure while closing the browser
        # cannot discard the data collected so far.
        save_dataset(dataset, OUTPUT_FILE)
        print(f"Dataset saved to {OUTPUT_FILE}")
