In [None]:
!pip install -U langchain-ollama
!pip install faiss-cpu
!pip install -U langchain-community

In [7]:
# Install the notebook's dependencies listed in requirements.txt
%pip install -r requirements.txt

Collecting python-dotenv==1.0.1 (from -r requirements.txt (line 2))
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import requests
import subprocess
from dotenv import load_dotenv

# Root of the GitHub REST API, used when listing forks
GITHUB_API_URL = "https://api.github.com"
# Local directory that receives every cloned repository
OUTPUT_DIR = "repos"

# Pull settings from a local .env file (if present) into the environment
load_dotenv()
# Repository to analyse, in "owner/name" form (e.g. kgabis/parson); None when unset
BASE_REPO = os.getenv("BASE_REPO")

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test
os.makedirs(OUTPUT_DIR, exist_ok=True)


def clone_repo(repo):
    """Clone a single GitHub repository into OUTPUT_DIR.

    The repository is cloned from ``https://github.com/<repo>`` into a
    sub-directory of OUTPUT_DIR named after ``repo`` with ``/`` replaced by
    ``_``. If that directory already exists, nothing is cloned.

    Args:
        repo: Repository identifier in ``owner/name`` form.

    Raises:
        ValueError: If ``repo`` is empty or None.
        subprocess.CalledProcessError: If ``git clone`` exits with a
            non-zero status.
    """
    # Reject a missing identifier up front instead of failing later with an
    # AttributeError (None) or an attempt to clone "https://github.com/" ("").
    if not repo:
        raise ValueError("repo must be a non-empty 'owner/name' string")

    dir_name = repo.replace("/", "_")
    clone_path = f"{OUTPUT_DIR}/{dir_name}"

    # An existing folder is treated as an earlier successful clone
    if os.path.exists(clone_path):
        print(f"Already cloned: {clone_path}")
        return

    print(f"Cloning: https://github.com/{repo}")
    # git runs inside OUTPUT_DIR, so the target is given relative to it
    subprocess.run(
        ["git", "clone", f"https://github.com/{repo}", dir_name],
        cwd=OUTPUT_DIR,
        check=True,
    )

def clone_repos(repos):
    """Clone every repository named in ``repos``, in iteration order.

    Each entry is handed to ``clone_repo`` unchanged; an exception raised
    for one entry propagates and stops the remaining clones.
    """
    for full_name in repos:
        clone_repo(full_name)

def get_forks(repo, count):
    """Return the ``owner/name`` identifiers of up to ``count`` forks of ``repo``.

    Forks are fetched from the GitHub REST API page by page until ``count``
    entries have been collected or the API has no more forks to return.

    Args:
        repo: Repository identifier in ``owner/name`` form.
        count: Maximum number of forks to return. Values <= 0 yield ``[]``
            without contacting the API.

    Returns:
        A list of fork ``full_name`` strings (e.g. ``kgabis/parson``). If a
        request returns a non-200 status, the failure is printed and the
        forks gathered so far are returned (``[]`` if the first page failed).
    """
    if count <= 0:
        return []

    url = f"{GITHUB_API_URL}/repos/{repo}/forks"
    # The page size is capped at 100 and must stay constant across pages so
    # that page boundaries line up.
    per_page = min(count, 100)
    fork_names = []
    page = 1

    while len(fork_names) < count:
        # A timeout keeps an unresponsive API from hanging the notebook
        r = requests.get(url, params={"per_page": per_page, "page": page}, timeout=30)
        if r.status_code != 200:
            print(f"Failed getting forks: {r.status_code}")
            break

        forks = r.json()
        if not forks:
            break

        # full_name = e.g. kgabis/parson
        fork_names.extend(fork["full_name"] for fork in forks)

        # A short page means this was the last one
        if len(forks) < per_page:
            break
        page += 1

    return fork_names[:count]

# Fail early with an actionable message when the environment is not configured
if not BASE_REPO:
    raise RuntimeError(
        "BASE_REPO is not set; define it (e.g. BASE_REPO=owner/name) "
        "in the environment or in a .env file"
    )

# Clone the base repository
clone_repo(BASE_REPO)

# Get base repo forks
forks = get_forks(BASE_REPO, 10)

# Clone forks sources
clone_repos(forks)



Cloning: https://github.com/kgabis/parson
Cloning: https://github.com/hibiscus-desgin/parson
Cloning: https://github.com/AKJUS/parson
Cloning: https://github.com/sayo9394/parson
Cloning: https://github.com/soft9000/JSONRead4C
Cloning: https://github.com/gmh5225/parson
Cloning: https://github.com/zhuqi77hub/parson
Cloning: https://github.com/lizhuo-eric/parson
Cloning: https://github.com/oss-evaluation-repository/kgabis-parson
Cloning: https://github.com/aiworkspace/parson
Cloning: https://github.com/DavidKorczynski/parson


## LLM analysis

In [None]:
from langchain_ollama.llms import OllamaLLM
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document

# Ollama model tag; used below both for the embeddings and for the LLM itself
model_name = "marco-o1"

def load_code_files(dir):
    """Load the C source files found directly inside ``dir`` as Documents.

    Only regular files whose name ends in ``.c`` are read; sub-directories
    are not descended into.

    Args:
        dir: Path of the directory to scan.

    Returns:
        A list of ``Document`` objects, one per ``.c`` file, ordered by file
        name. ``page_content`` holds the file text and ``metadata["source"]``
        the file path. The list is empty when no ``.c`` file is present.

    Raises:
        OSError: If ``dir`` cannot be listed or a file cannot be opened.
    """
    source_code_files = []

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting document order reproducible between runs.
    for f in sorted(os.listdir(dir)):
        path = os.path.join(dir, f)
        # Read only C files (for now); isfile() skips a directory that
        # merely happens to be named "*.c", which open() could not read.
        if not f.endswith(".c") or not os.path.isfile(path):
            continue
        # errors='replace' keeps one non-UTF-8 byte (e.g. in a comment)
        # from aborting the whole load.
        with open(path, 'r', encoding='utf-8', errors='replace') as codefile:
            # Wrap source code into Document objects, remembering the origin
            source_code_files.append(
                Document(page_content=codefile.read(), metadata={"source": path})
            )

    return source_code_files

# Clone directories are named "owner_name" (see clone_repo). Derive the base
# directory from BASE_REPO so the analysis follows whichever repository was
# actually cloned instead of a hard-coded one.
base_dir_name = BASE_REPO.replace("/", "_")
BASE_SOURCE_CODE = load_code_files(f"{OUTPUT_DIR}/{base_dir_name}")
# Testing with random fork
COMPARE_SOURCE_CODE = load_code_files(f"{OUTPUT_DIR}/soft9000_JSONRead4C")

# Fail early with a clear message rather than deep inside index construction
if not BASE_SOURCE_CODE or not COMPARE_SOURCE_CODE:
    raise RuntimeError("No .c files were found in the base and/or fork directory")

# Load embedding model
embeddings = OllamaEmbeddings(model=model_name)

# Create FAISS vector stores
# NOTE(review): each file is embedded as a single Document; large files may
# need to be split into chunks first — confirm against the model's limits.
base_vectorstore = FAISS.from_documents(BASE_SOURCE_CODE, embeddings)
compare_vectorstore = FAISS.from_documents(COMPARE_SOURCE_CODE, embeddings)

# Save and reload the vector stores
# allow_dangerous_deserialization=True lets load_local unpickle the stored
# index data. That is acceptable only because these files were written by
# this notebook a moment ago; never enable it for an index from an untrusted
# source.
base_vectorstore.save_local("faiss_index_base")
base_persisted_vectorstore = FAISS.load_local("faiss_index_base", embeddings, allow_dangerous_deserialization=True)

compare_vectorstore.save_local("faiss_index_compare")
compare_persisted_vectorstore = FAISS.load_local("faiss_index_compare", embeddings, allow_dangerous_deserialization=True)

# Create the retrievers
base_retriever = base_persisted_vectorstore.as_retriever()
compare_retriever = compare_persisted_vectorstore.as_retriever()

# Load the LLM model
model = OllamaLLM(model=model_name, temperature=0.1)

def compare_code(query):
    """Ask the LLM to compare base and fork code retrieved for ``query``.

    ``query`` is sent to both retrievers; the page contents of the returned
    documents are joined with blank lines and embedded into a fixed prompt,
    which is then passed to the module-level ``model``.

    Args:
        query: Text used to retrieve relevant documents from both stores.

    Returns:
        Whatever ``model.invoke`` returns for the assembled prompt.
    """
    # Retrievers are Runnables: invoke() replaces the deprecated
    # get_relevant_documents() call.
    base_results = base_retriever.invoke(query)
    compare_results = compare_retriever.invoke(query)

    base_snippets = "\n\n".join(doc.page_content for doc in base_results)
    compare_snippets = "\n\n".join(doc.page_content for doc in compare_results)

    prompt = f"""
    ### **Instructions:**
    You are given source code from a base repository and a fork of that repository. 

    **Base Version:**
    {base_snippets}

    **Forked Version:**
    {compare_snippets}

    
    Assess the degree of similarity between the two repositories:
    - **Perform semantic comparisons between code segments**
    - **Focus on identifying refactoring patterns and significant alterations**

    ### **Additional Instructions:**
    - You must never hallucinate
    - You have to always answer in English
    - Make your response clear and structured
    """

    analysis = model.invoke(prompt)
    return analysis

# Run the comparison with a generic structural query
query = "Analyze the structure and patterns of this code."
analysis_result = compare_code(query)

# Show the answer in the notebook and keep a copy on disk
print("LLM Analysis:\n", analysis_result)
with open("results.txt", mode="w", encoding="utf-8") as results_file:
    results_file.write(analysis_result)
