In [1]:
!pip install langchain_community



In [2]:
!pip install chromadb



In [None]:
import os
import requests
import json
import base64
import time
import re
from datetime import datetime
import tempfile 
import shutil 
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline
from tqdm import tqdm 



github_secret = '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'



OUTPUT_DIR_BASE = "pull_request_data_structured"
API_VERSION = '2022-11-28'
PER_PAGE = 100 
REQUEST_TIMEOUT = 60 


def make_api_request(url, headers, params=None):

    headers['X-GitHub-Api-Version'] = API_VERSION
    try:
        response = requests.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status() 
        return response
    except requests.exceptions.Timeout:
        print(f"Timeout error making API request to {url}")
        return None
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error making API request to {url}: {e}")
        print(f"Response Status: {e.response.status_code}")
        print(f"Response Body: {e.response.text}")
        ''
        ''
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error making API request to {url}: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during API request to {url}: {e}")
        return None

def fetch_paginated_data(url, headers, params=None):

    if params is None:
        params = {}
    params['per_page'] = PER_PAGE
    all_items = []
    current_url = url
    page = 1

    while current_url:
         
        cleaned_url = current_url.replace('[https://', 'https://').replace(']', '')
        print(f"      Fetching page {page} from {cleaned_url.split('?')[0]}...")

         
         
        page_params = params if page == 1 and '?' not in cleaned_url else None

        response = make_api_request(cleaned_url, headers=headers, params=page_params)
        page += 1

        if not response:
            print("Failed to fetch paginated data page. Stopping pagination.")
            break  

        if response.status_code == 200:
            try:
                items_page = response.json()
                if not items_page:  
                    break
                if isinstance(items_page, list):
                    all_items.extend(items_page)
                else:
                     
                    print(f"      Warning: Expected a list from {cleaned_url.split('?')[0]}, received type {type(items_page)}. Appending.")
                    all_items.append(items_page)  

                 
                if 'next' in response.links:
                    current_url = response.links['next']['url']
                    params = None  
                else:
                    current_url = None  
            except json.JSONDecodeError as e:
                print(f"      Error decoding JSON from {cleaned_url.split('?')[0] if cleaned_url else url}: {e}")
                print(f"      Response text: {response.text[:200]}...")  
                break  
            except Exception as e:
                 print(f"      An unexpected error occurred processing page {page-1} data: {e}")
                 break  

        else:  
            print(f"Stopping pagination. Received status {response.status_code} for page {page-1}.")
            break  

    return all_items

def get_file_content(owner, repo, file_path, commit_sha, headers):
    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}?ref={commit_sha}"
    print(f"        Fetching content: {file_path} @ {commit_sha[:7]}")
    response = make_api_request(api_url, headers=headers)

    if response and response.status_code == 200:
        try:
            content_data = response.json()
            if isinstance(content_data, dict) and content_data.get('type') == 'file' and 'content' in content_data:
                if content_data.get('encoding') == 'base64':
                    try:
                        encoded_content = content_data['content'].replace('\n', '')
                        decoded_content = base64.b64decode(encoded_content).decode('utf-8', errors='replace')
                        return decoded_content
                    except Exception as e:
                        print(f"        Error decoding base64 content for {file_path} @ {commit_sha[:7]}: {e}")
                        return ""  
                else:
                      
                     print(f"        Warning: Content for {file_path} @ {commit_sha[:7]} not base64 encoded, returning raw.")
                     return content_data['content']
            elif isinstance(content_data, dict) and content_data.get('type') in ['dir', 'submodule', 'symlink']:
                 print(f"        Skipping content fetch for non-file type '{content_data.get('type')}' for {file_path} @ {commit_sha[:7]}.")
                 return ""  
            else:
                print(f"        Warning: Could not get file content (unexpected format or missing content) for {file_path} @ {commit_sha[:7]}. Response type: {type(content_data)}. Content type: {content_data.get('type') if isinstance(content_data, dict) else 'N/A'}")
                return ""  
        except json.JSONDecodeError as e:
            print(f"        Error decoding JSON response for file content {file_path} @ {commit_sha[:7]}: {e}")
            return ""  
    elif response and response.status_code == 404:
        print(f"        File not found (404): {file_path} @ {commit_sha[:7]}")
        return ""  
    else:
         
        return ""


def save_file(content, base_dir, relative_path):

    if content is None or content == "":  
         
        return False
    try:
         
        full_path = os.path.join(base_dir, relative_path)

         
        parent_dir = os.path.dirname(full_path)
        if parent_dir:  
            os.makedirs(parent_dir, exist_ok=True)

         
        with open(full_path, 'w', encoding='utf-8') as f:
            f.write(content)
         
        return True
    except IOError as e:
        print(f"          Error writing file {full_path}: {e}")
        return False
    except Exception as e:
        print(f"          Unexpected error saving file {full_path}: {e}")
        return False


def process_pull_request_files(owner, repo, pr_number, base_sha, head_sha, headers, pr_output_dir):

    api_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/files"
    files_list = fetch_paginated_data(api_url, headers=headers)

    if not files_list:
        print(f"    No files found or error fetching files for PR ") 
        return []  

    print(f"    Processing {len(files_list)} files for PR  ")

     
    before_dir = os.path.join(pr_output_dir, "before_merge")
    after_dir = os.path.join(pr_output_dir, "after_merge")
    patch_dir = os.path.join(pr_output_dir, "changed_files")
    os.makedirs(before_dir, exist_ok=True)
    os.makedirs(after_dir, exist_ok=True)
    os.makedirs(patch_dir, exist_ok=True)

    processed_files_metadata = []
    for f in files_list:
         
        if not isinstance(f, dict):
             continue

        filename = f.get('filename')
        status = f.get('status')

        if not filename or not status:
             continue

        print(f"      Processing file: {filename} (Status: {status})")

         
        content_base = ""  
         
        if status != 'added' and base_sha:
            content_base = get_file_content(owner, repo, filename, base_sha, headers)
            if content_base:  
                 save_file(content_base, before_dir, filename)

         
        content_head = ""  
         
         
        if status not in ['removed', 'deleted'] and head_sha:
            content_head = get_file_content(owner, repo, filename, head_sha, headers)
            if content_head:  
                save_file(content_head, after_dir, filename)

         
        patch_content = f.get('patch')
        if patch_content:
            patch_filename = filename + ".patch"
            save_file(patch_content, patch_dir, patch_filename)

         
        processed_files_metadata.append({
            'filename': filename,
            'status': status,
            'additions': f.get('additions', 0),  
            'deletions': f.get('deletions', 0),  
            'changes': f.get('changes', 0),      
            'sha': f.get('sha'),  
            'blob_url': f.get('blob_url'),
            'raw_url': f.get('raw_url'),
            'patch_saved': bool(patch_content),  
            'content_base_saved': bool(content_base),  
            'content_head_saved': bool(content_head),  
            'previous_filename': f.get('previous_filename')  
        })

    return processed_files_metadata


def get_pr_reviews(owner, repo, pr_number, headers):
    """Fetches all reviews for a PR. Returns a list of simplified review dictionaries."""
    api_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews"
    reviews = fetch_paginated_data(api_url, headers=headers)
    if not reviews: return []
    return [
        {
            'id': r.get('id'),
            'user': r.get('user', {}).get('login', 'ghost'),  
            'state': r.get('state'),
            'submitted_at': r.get('submitted_at'),
            'body': r.get('body'),  
            'commit_id': r.get('commit_id')
        } for r in reviews if isinstance(r, dict)  
    ]

def get_pr_review_comments(owner, repo, pr_number, headers):
    """Fetches all review comments (inline code comments) for a PR. Returns a list of simplified comment dictionaries."""
    api_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/comments"
    comments = fetch_paginated_data(api_url, headers=headers)
    if not comments: return []
    return [
        {
            'id': c.get('id'),
            'user': c.get('user', {}).get('login', 'ghost'),  
            'body': c.get('body'),  
            'path': c.get('path'),  
            'position': c.get('position'),  
            'original_position': c.get('original_position'),
            'commit_id': c.get('commit_id'),  
            'original_commit_id': c.get('original_commit_id'),
            'created_at': c.get('created_at'),
            'updated_at': c.get('updated_at'),
            'in_reply_to_id': c.get('in_reply_to_id')  
        } for c in comments if isinstance(c, dict)  
    ]

def get_pr_issue_comments(owner, repo, pr_number, headers):
    """Fetches all general issue comments (comments on the PR itself) for a PR. Returns a list of simplified comment dictionaries."""
    api_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{pr_number}/comments"
    comments = fetch_paginated_data(api_url, headers=headers)
    if not comments: return []
    return [
        {
            'id': c.get('id'),
            'user': c.get('user', {}).get('login', 'ghost'),  
            'body': c.get('body'),  
            'created_at': c.get('created_at'),
            'updated_at': c.get('updated_at')
        } for c in comments if isinstance(c, dict)  
    ]

def get_pr_commits(owner, repo, pr_number, headers):
    """Fetches all commits associated with a PR. Returns a list of simplified commit dictionaries."""
    api_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/commits"
    commits = fetch_paginated_data(api_url, headers=headers)
    if not commits: return []
    return [
        {
            'sha': c.get('sha'),
            'message': c.get('commit', {}).get('message'),  
            'author': c.get('commit', {}).get('author'),  
            'committer': c.get('commit', {}).get('committer'),  
            'api_author_login': c.get('author', {}).get('login') if c.get('author') else None,  
            'api_committer_login': c.get('committer', {}).get('login') if c.get('committer') else None,  
            'parents': [p.get('sha') for p in c.get('parents', []) if isinstance(p, dict) and p.get('sha')]  
        } for c in commits if isinstance(c, dict)  
    ]

def get_commit_check_runs(owner, repo, ref_sha, headers):
    """Fetches check runs (newer Checks API) for a specific commit SHA. Returns a list of simplified check run dictionaries."""
    if not ref_sha: return []
    print(f"    Fetching check runs for commit {ref_sha[:7]}...")
    api_url = f"https://api.github.com/repos/{owner}/{repo}/commits/{ref_sha}/check-runs"
     
    params={'per_page': 100}  
    response = make_api_request(api_url, headers=headers, params=params)

    if response and response.status_code == 200:
        try:
            data = response.json()
            check_runs_list = []
             
            if isinstance(data, dict) and 'check_runs' in data and isinstance(data['check_runs'], list):
                 check_runs_list = data['check_runs']
                  
                  
            elif isinstance(data, list):  
                 check_runs_list = data
            else:
                 print(f"    Unexpected response format for check runs for commit {ref_sha[:7]}. Response type: {type(data)}. Keys: {data.keys() if isinstance(data, dict) else 'N/A'}")
                 return []

            return [
                {
                    'name': cr.get('name'),
                    'status': cr.get('status'),  
                    'conclusion': cr.get('conclusion'),  
                    'started_at': cr.get('started_at'),
                    'completed_at': cr.get('completed_at'),
                    'app_owner': cr.get('app', {}).get('owner', {}).get('login') if isinstance(cr.get('app'), dict) else None,  
                    'app_name': cr.get('app', {}).get('name') if isinstance(cr.get('app'), dict) else None  
                     
                } for cr in check_runs_list if isinstance(cr, dict)  
            ]
        except json.JSONDecodeError as e:
            print(f"    Error decoding JSON for check runs commit {ref_sha[:7]}: {e}")
            return []
        except Exception as e:
             print(f"    An unexpected error occurred processing check runs for commit {ref_sha[:7]}: {e}")
             return []
    return []  

def get_commit_statuses(owner, repo, ref_sha, headers):
    """Fetches statuses (older Status API) for a specific commit SHA. Returns a list of simplified status dictionaries."""
    if not ref_sha: return []
    print(f"    Fetching statuses for commit {ref_sha[:7]}...")
    api_url = f"https://api.github.com/repos/{owner}/{repo}/commits/{ref_sha}/statuses"
     
    statuses = fetch_paginated_data(api_url, headers=headers)  
    if not statuses: return []
    return [
        {
            'context': s.get('context'),  
            'state': s.get('state'),  
            'description': s.get('description'),  
            'target_url': s.get('target_url'),  
            'creator_login': s.get('creator', {}).get('login') if isinstance(s.get('creator'), dict) else None,  
            'created_at': s.get('created_at'),
            'updated_at': s.get('updated_at')
        } for s in statuses if isinstance(s, dict)  
    ]

def parse_linked_issues(text):

    if not text:
        return []
     
    github_refs_keyword = re.findall(r'(?:close(?:s|d)?|resolve(?:s|d)?|fix(?:es|ed)?)\s+')
     
    github_refs_simple = re.findall(r'(?<![a-zA-Z0-9])#(\d+)\b', text)
     
    jira_refs = re.findall(r'\b([A-Z][A-Z0-9_]+-\d+)\b', text)

    issues = set()
     
    for ref in github_refs_keyword: issues.add(f"GH-{ref}")
     
    for ref in github_refs_simple:
        if f"GH-{ref}" not in issues:
             issues.add(f"GH-{ref}")
     
    for ref in jira_refs: issues.add(ref)

    return sorted(list(issues))

 

def get_all_pull_requests_structured(owner, repo, state='all'):
    if not github_secret:
         print("Skipping data fetching: GITHUB_BOT_ACCESS_TOKEN is not set.")
         return []

    print(f"--- Starting STRUCTURED pull request data fetch for {owner}/{repo} ---")
    print(f"--- Output base directory: {OUTPUT_DIR_BASE} ---")
     

    os.makedirs(OUTPUT_DIR_BASE, exist_ok=True)  

    api_url = f"https://api.github.com/repos/{owner}/{repo}/pulls"
    headers = {
        'Authorization': f'token {github_secret}',
        'Accept': 'application/vnd.github.v3+json',  
    }
    params = {
        'state': state,         
        'per_page': PER_PAGE,
        'sort': 'updated',      
        'direction': 'desc',    
        'page': 1               
    }
     

    processed_pr_numbers = []  
    current_url = api_url
    page = 1

    while current_url:
         
        cleaned_url = current_url.replace('[https://', 'https://').replace(']', '')
        print(f"\nFetching page {page} of pull requests list from {cleaned_url.split('?')[0]}...")

         
         
        page_params = params if page == 1 and '?' not in cleaned_url else None

        response = make_api_request(cleaned_url, headers=headers, params=page_params)
        page += 1

        if not response:
            print("Failed to fetch pull requests list page. Stopping.")
            break  

        if response.status_code == 200:
            try:
                pull_requests_page = response.json()
                if not pull_requests_page or not isinstance(pull_requests_page, list):
                    print("No more pull requests found on this page or unexpected format.")
                    break  

                print(f"Processing {len(pull_requests_page)} pull requests from page {page-1}...")

                for pr_summary in pull_requests_page:
                     
                    if not isinstance(pr_summary, dict):
                        print(f"Skipping unexpected item in PR list (not a dictionary): {pr_summary}")
                        continue

                    pr_number = pr_summary.get('number')
                    if pr_number is None:
                         print(f"Skipping PR summary with missing number: {pr_summary}")
                         continue

                    pr_updated_at = pr_summary.get('updated_at')

                     
                    pr_output_dir = os.path.join(OUTPUT_DIR_BASE, f"pr_{pr_number}")
                    os.makedirs(pr_output_dir, exist_ok=True)

                     
                     
                     
                    pr_detail_url = pr_summary.get('url')  
                    if not pr_detail_url:
                        continue

                    pr_detail_response = make_api_request(pr_detail_url, headers=headers)
                    if not pr_detail_response or pr_detail_response.status_code != 200:
                         continue

                    try:
                        pr = pr_detail_response.json()  
                        if not isinstance(pr, dict):
                             continue
                    except json.JSONDecodeError as e:
                        continue

                     
                    base_sha = pr.get('base', {}).get('sha')  
                    head_sha = pr.get('head', {}).get('sha')  
                    pr_body = pr.get('body')  

                          
                          

                     
                     
                    files_metadata = process_pull_request_files(
                        owner, repo, pr_number, base_sha, head_sha, headers, pr_output_dir
                    )

                     
                    reviews = get_pr_reviews(owner, repo, pr_number, headers)
                    review_comments = get_pr_review_comments(owner, repo, pr_number, headers)
                    issue_comments = get_pr_issue_comments(owner, repo, pr_number, headers)
                    commits_list = get_pr_commits(owner, repo, pr_number, headers)

                    check_runs = []
                    statuses = []
                    if head_sha:
                        check_runs = get_commit_check_runs(owner, repo, head_sha, headers)
                         
                        statuses = get_commit_statuses(owner, repo, head_sha, headers)
                    
                     
                    linked_issues = set()
                    if pr_body:  
                        linked_issues.update(parse_linked_issues(pr_body))
                    for c in commits_list:
                         
                        if isinstance(c, dict) and c.get('message'):
                            linked_issues.update(parse_linked_issues(c.get('message')))
                     
                    for ic in issue_comments:
                        if isinstance(ic, dict) and ic.get('body'):
                            linked_issues.update(parse_linked_issues(ic.get('body')))
                    for r in reviews:
                         if isinstance(r, dict) and r.get('body'):
                            linked_issues.update(parse_linked_issues(r.get('body')))
                    for rc in review_comments:
                         if isinstance(rc, dict) and rc.get('body'):
                            linked_issues.update(parse_linked_issues(rc.get('body')))


                     
                    metadata = {
                        'pr_number': pr_number,
                        'api_url': pr.get('url'),
                        'html_url': pr.get('html_url'),
                        'state': pr.get('state'),
                        'title': pr.get('title'),
                        'author_login': pr.get('user', {}).get('login', 'ghost') if isinstance(pr.get('user'), dict) else 'ghost',  
                        'author_association': pr.get('author_association'),
                        'body': pr_body,  
                        'created_at': pr.get('created_at'),
                        'updated_at': pr.get('updated_at'),
                        'closed_at': pr.get('closed_at'),
                        'merged_at': pr.get('merged_at'),
                        'merge_commit_sha': pr.get('merge_commit_sha'),
                        'assignee': pr.get('assignee', {}).get('login') if isinstance(pr.get('assignee'), dict) else None,  
                        'assignees': [a.get('login') for a in pr.get('assignees', []) if isinstance(a, dict) and a.get('login')],  
                        'requested_reviewers': [rr.get('login') for rr in pr.get('requested_reviewers', []) if isinstance(rr, dict) and rr.get('login')],  
                        'requested_teams': [rt.get('slug') for rt in pr.get('requested_teams', []) if isinstance(rt, dict) and rt.get('slug')],  
                        'labels': [l.get('name') for l in pr.get('labels', []) if isinstance(l, dict) and l.get('name')],  
                        'is_draft': pr.get('draft', False),
                        'merged': pr.get('merged', False),
                        'mergeable': pr.get('mergeable'),  
                        'mergeable_state': pr.get('mergeable_state'),  
                        'merged_by_login': pr.get('merged_by', {}).get('login') if isinstance(pr.get('merged_by'), dict) else None,  
                        'base_branch': pr.get('base', {}).get('ref') if isinstance(pr.get('base'), dict) else None,  
                        'base_commit_sha': base_sha,
                        'head_branch': pr.get('head', {}).get('ref') if isinstance(pr.get('head'), dict) else None,  
                        'head_repo_full_name': pr.get('head', {}).get('repo', {}).get('full_name') if isinstance(pr.get('head'), dict) and isinstance(pr.get('head').get('repo'), dict) else None,  
                        'head_commit_sha': head_sha,
                        'reviews': reviews,
                        'review_comments': review_comments,
                        'issue_comments': issue_comments,
                        'commits_list': commits_list,
                        'commits_count': len(commits_list),  
                        'check_runs': check_runs,  
                        'statuses': statuses,      
                        'linked_issues_parsed': sorted(list(linked_issues)),
                        'changed_files_count': len(files_metadata),  
                        'total_additions': sum(f.get('additions', 0) for f in files_metadata),  
                        'total_deletions': sum(f.get('deletions', 0) for f in files_metadata),  
                        'changed_files_manifest': files_metadata  
                    }

                     
                    metadata_filename = os.path.join(pr_output_dir, "metadata.json")
                    try:
                        with open(metadata_filename, 'w', encoding='utf-8') as f:
                             
                            json.dump(metadata, f, indent=2, ensure_ascii=False)
                        print(f"    Successfully saved metadata to {metadata_filename}")
                        processed_pr_numbers.append(pr_number)
                    except IOError as e:
                        print(f"    Error writing metadata JSON file {metadata_filename}: {e}")
                    except TypeError as e:
                        print(f"    Error serializing metadata JSON for PR")
                    except Exception as e:
                        print(f"    Unexpected error saving metadata JSON for PR")

                 
                if 'next' in response.links:
                    current_url = response.links['next']['url']
                    params = None  
                    print(f"--- Moving to next page of PR list ---")
                else:
                    print("No 'next' link found in PR list response, reached the last page.")
                    current_url = None  

            except json.JSONDecodeError as e:
                print(f"Error decoding JSON from PR list page {page-1}: {e}")
                print(f"Response text: {response.text[:200]}...")  
                break  
            except Exception as e:
                 print(f"An unexpected error occurred while processing PRs on page {page-1}: {e}")
                 break  

        else:  
            print(f"Stopping pagination. Received status {response.status_code} for PR list page {page-1}.")
            break  

    print(f"\n--- Finished STRUCTURED processing for {owner}/{repo}. ---")
    print(f"--- Processed {len(processed_pr_numbers)} pull requests. ---")
    print(f"--- Data saved in subdirectories within: {OUTPUT_DIR_BASE} ---")
    return processed_pr_numbers  


 
class PRSpecificRAG:

    def __init__(self, data_path="pull_request_data_structured"):

        self.data_path = data_path
         
        print("Initializing embeddings model...")
        try:
            self.embeddings = HuggingFaceEmbeddings(
                model_name="microsoft/graphcodebert-base",
                model_kwargs={"trust_remote_code": True}  
            )
            print("Embeddings model initialized.")
        except Exception as e:
            print(f"Error initializing embeddings model: {str(e)}")
            self.embeddings = None  
             
             


         
        self.splitter = RecursiveCharacterTextSplitter.from_language(
            language=Language.PYTHON,
            chunk_size=2048,  
            chunk_overlap=50  
        )
        self.pr_databases = {}  
        self.llm = None  

         
        self._temp_chroma_dirs = {}

    def __del__(self):

        print("Cleaning up temporary Chroma directories...")
        for pr_number, temp_dir in self._temp_chroma_dirs.items():
            try:
                if os.path.exists(temp_dir):
                    shutil.rmtree(temp_dir)
                    print(f"Cleaned up temporary directory for PR {pr_number}: {temp_dir}")
            except Exception as e:
                print(f"Error cleaning up temporary directory {temp_dir} for PR {pr_number}: {e}")


    def _load_pr_metadata(self, pr_dir):

        metadata_path = os.path.join(pr_dir, "metadata.json")
        if not os.path.exists(metadata_path):
            raise FileNotFoundError(f"Metadata file not found for PR in {pr_dir}")
        with open(metadata_path, "r", encoding='utf-8') as f:
            return json.load(f)

    def _process_single_pr(self, pr_dir_name):

        pr_number_str = pr_dir_name.split("_")[-1]  
        try:
            pr_number = int(pr_number_str)
        except ValueError:
            print(f"Warning: Could not parse PR number from directory name: {pr_dir_name}. Skipping.")
            return None  

        full_path = os.path.join(self.data_path, pr_dir_name)
        if not os.path.isdir(full_path):
             print(f"Warning: PR directory not found or is not a directory: {full_path}. Skipping.")
             return None  


        try:
             
            metadata = self._load_pr_metadata(full_path)
        except FileNotFoundError as e:
            print(f"Error loading metadata for PR {pr_number_str}: {e}. Skipping.")
            return None
        except json.JSONDecodeError as e:
             print(f"Error decoding metadata JSON for PR {pr_number_str}: {e}. Skipping.")
             return None
        except Exception as e:
             print(f"An unexpected error occurred loading metadata for PR {pr_number_str}: {e}. Skipping.")
             return None

        if self.embeddings is None:
             print(f"Skipping vector DB creation for PR {pr_number_str}: Embeddings model not initialized.")
             return None

        chunks = []
         
        changed_files = metadata.get("changed_files_manifest", [])
        if not changed_files:
             print(f"No changed files found in metadata for PR {pr_number_str}.")
             pass  

        for file_meta in tqdm(changed_files, desc=f"Processing files for PR {pr_number_str}"):
             
            if not isinstance(file_meta, dict) or 'filename' not in file_meta:
                 print(f"Warning: Skipping invalid file metadata entry for PR {pr_number_str}: {file_meta}")
                 continue

            filename = file_meta["filename"]
            try:
                 
                before_code = self._read_code_file(full_path, "before_merge", filename)
                after_code = self._read_code_file(full_path, "after_merge", filename)
                patch = self._read_patch_file(full_path, filename)

                 
                context = self._create_context(metadata, filename, before_code, after_code, patch)

                 
                file_chunks = self.splitter.split_text(context)

                 
                chunks.extend(file_chunks)

            except Exception as e:
                print(f"Error processing file {filename} in PR {pr_number_str}: {str(e)}")

         
         
        pr_body = metadata.get("body")
        if pr_body:
             body_chunks = self.splitter.split_text(f"PR Body:\n{pr_body}")
             chunks.extend(body_chunks)


        issue_comments = metadata.get("issue_comments", [])
        for comment in issue_comments:
             if isinstance(comment, dict) and comment.get("body"):
                  comment_chunks = self.splitter.split_text(f"Issue Comment by {comment.get('user', 'N/A')}:\n{comment.get('body')}")
                  chunks.extend(comment_chunks)


        review_comments = metadata.get("review_comments", [])
        for comment in review_comments:
             if isinstance(comment, dict) and comment.get("body"):
                  comment_chunks = self.splitter.split_text(f"Review Comment by {comment.get('user', 'N/A')} on {comment.get('path', 'N/A')}:\n{comment.get('body')}")
                  chunks.extend(comment_chunks)


        if not chunks:
            print(f"No processable content found for PR {pr_number_str}. Skipping vector DB creation.")
            return None  

         
         
        try:
            temp_dir = tempfile.mkdtemp(prefix=f"chroma_db_pr_{pr_number_str}_")
            self._temp_chroma_dirs[pr_number_str] = temp_dir  

            vector_db = Chroma.from_texts(
                texts=chunks,
                embedding=self.embeddings,
                persist_directory=temp_dir  
            )
            print(f"Successfully created Chroma DB for PR {pr_number_str} in temporary directory: {temp_dir}")
        except Exception as e:
            print(f"Error creating Chroma DB for PR {pr_number_str}: {str(e)}")
             
            if pr_number_str in self._temp_chroma_dirs:
                 try:
                     shutil.rmtree(self._temp_chroma_dirs[pr_number_str])
                     del self._temp_chroma_dirs[pr_number_str]
                 except Exception as cleanup_e:
                     print(f"Error during cleanup of temp dir {temp_dir}: {cleanup_e}")

            return None  

         
        self.pr_databases[pr_number_str] = vector_db  
        return vector_db

    def _read_code_file(self, pr_path, dir_name, filename):

        file_path = os.path.join(pr_path, dir_name, filename)
        if os.path.exists(file_path):
            try:
                with open(file_path, "r", encoding='utf-8') as f:
                    return f.read()
            except Exception as e:
                print(f"Error reading file {file_path}: {str(e)}")
                return ""  
        return ""  

    def _read_patch_file(self, pr_path, filename):

        patch_path = os.path.join(pr_path, "changed_files", filename + ".patch")
        if os.path.exists(patch_path):
            try:
                with open(patch_path, "r", encoding='utf-8') as f:
                    return f.read()
            except Exception as e:
                print(f"Error reading patch file {patch_path}: {str(e)}")
                return ""  
        return ""  

    def _create_context(self, metadata, filename, before_code, after_code, patch):
         
        file_review_comments = [
            c.get('body') for c in metadata.get('review_comments', [])
            if isinstance(c, dict) and c.get('path') == filename and c.get('body')  
        ]
        comments_text = "\n".join(file_review_comments) if file_review_comments else "No specific review comments for this file."

         
        ci_checks_list = [c.get('name', 'N/A') for c in metadata.get('check_runs', []) if isinstance(c, dict)]
        ci_checks_str = ", ".join(ci_checks_list) if ci_checks_list else "No CI checks found."


        context = (
            f"--- Pull Request {metadata.get('pr_number', 'N/A')} - {metadata.get('title', 'N/A')} ---\n"
            f"Author: {metadata.get('author_login', 'N/A')}\n"
            f"File: {filename}\n"
            f"Status: {metadata.get('state', 'N/A')}\n"
            f"CI Checks for head commit: {ci_checks_str}\n\n"
            f"BEFORE CODE:\n{before_code}\n\n"
            f"AFTER CODE:\n{after_code}\n\n"
            f"DIFF:\n{patch}\n\n"
            f"REVIEW COMMENTS on this file:\n{comments_text}\n"
        )
        return context

    def initialize_llm(self):

        if self.llm is not None:
            print("LLM already initialized.")
            return

         
        model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  
        print(f"Initializing LLM: {model_name}...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(model_name)

            text_gen_pipeline = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                temperature=0.1,  
                max_new_tokens=512,  
                repetition_penalty=1.1  
                 
            )

            self.llm = HuggingFacePipeline(pipeline=text_gen_pipeline)
            print("LLM initialized successfully.")
        except Exception as e:
            print(f"Error initializing LLM: {str(e)}")
            self.llm = None  
             
             


    def load_pr(self, pr_number):

        pr_number_str = str(pr_number)  
        pr_dir_name = f"pr_{pr_number_str}"  
        pr_dir_path = os.path.join(self.data_path, pr_dir_name)

        if not os.path.exists(pr_dir_path):
            print(f"Error: Data directory for PR {pr_number_str} not found at {pr_dir_path}")
            return None

         
         
        vector_db = self._process_single_pr(pr_dir_name)
        return vector_db  

    def get_review(self, pr_number, question):

        pr_number_str = str(pr_number)  

        try:
            if self.llm is None:
                 raise ValueError("LLM is not initialized. Call initialize_llm() first.")

             
            if pr_number_str not in self.pr_databases or self.pr_databases[pr_number_str] is None:
                print(f"PR {pr_number_str} data not loaded. Attempting to load...")
                loaded_db = self.load_pr(pr_number_str)  
                if loaded_db is None:
                    raise ValueError(f"Failed to load data for PR {pr_number_str}. Cannot perform RAG analysis.")

             
            retriever = self.pr_databases[pr_number_str].as_retriever(search_kwargs={"k": 2})

             
            source_documents = retriever.get_relevant_documents(question)

             
            context_text = "\n\n---\n\n".join([doc.page_content for doc in source_documents])

             
            pr_dir_name = f"pr_{pr_number_str}"
            metadata = self._load_pr_metadata(os.path.join(self.data_path, pr_dir_name))
            ci_checks_list = [c.get('name', 'N/A') for c in metadata.get('check_runs', []) if isinstance(c, dict)]
            ci_checks_str = ", ".join(ci_checks_list) if ci_checks_list else "No CI checks found."

             
            prompt_template = """<|system|>
            You are a helpful assistant specializing in code review analysis.
            You are analyzing Pull Request {pr_number}. Relevant context from the PR is provided below:

            {context}

            Consider the following aspects from the PR data:
            1. Code changes (diff)
            2. Developer comments (issue and review comments)
            3. Results of CI checks: {ci_checks}
            4. Commit history (summarized in context)

            Based on the provided context, answer the user's question about the Pull Request.
            If the context does not contain enough information to answer the question,
            state that you cannot answer based on the available information.
            </s>
            <|user|>
            {question}
            </s>
            <|assistant|>
            """

             
            print(f"--- Debugging Prompt Variables for PR {pr_number_str} ---")
            print(f"pr_number_str: {pr_number_str}")
            print(f"question: {question}")
            print(f"ci_checks_str: {ci_checks_str}")
            print(f"context_text (first 200 chars): {context_text[:200]}...")
            print("--- End Debugging Print Statements ---")


             
            formatted_prompt = prompt_template.format(
                pr_number=pr_number_str,
                context=context_text,
                ci_checks=ci_checks_str,
                question=question
            )

             
             
            llm_response = self.llm.invoke(formatted_prompt)

             
             
             
             
            answer = llm_response.split("<|assistant|>")[-1].strip()


             
            formatted_sources = self._format_sources(source_documents)

            return {
                "pr": pr_number_str,  
                "question": question,
                "answer": answer if answer else "Could not generate an answer based on the available information.",
                "sources": formatted_sources
            }
        except ValueError as e:
             
            return {
                "pr": pr_number_str,
                "question": question,
                "answer": f"Error processing PR {pr_number_str}: {e}",
                "sources": []
            }
        except Exception as e:
            return {
                "pr": pr_number_str,
                "question": question,
                "answer": f"An unexpected error occurred during review generation: {str(e)}",
                "sources": []
            }


    def _format_sources(self, docs):

        formatted_sources = []
        for doc in docs:
             
             
             
            source_info = {
                "content_snippet": str(doc.page_content)[:200] + "..." if doc and hasattr(doc, 'page_content') else "N/A",  
                "file": "Unknown (metadata not stored)",  
                "checks": "Unknown (metadata not stored)",
                "author": "Unknown (metadata not stored)"
            }
            formatted_sources.append(source_info)
        return formatted_sources


 
if __name__ == "__main__":
     
     
    owner = 'AlfaInsurance'
    repo = 'devQ_testData_PythonProject'

     
     
    pr_state = 'all'

     
     
    print(f"Starting STRUCTURED full pull request data fetch for {owner}/{repo}")
    print(f"Target PR state: {pr_state}")
    print(f"Output directory: {OUTPUT_DIR_BASE}")
    print("Ensure GITHUB_BOT_ACCESS_TOKEN environment variable is set.")
    print("WARNING: This can take a long time and consume significant disk space and API calls.")

    start_time = time.time()
     
    if github_secret:
        processed_prs = get_all_pull_requests_structured(
            owner,
            repo,
            state=pr_state
        )
    else:
        processed_prs = []
        print("Skipping data fetch because GITHUB_BOT_ACCESS_TOKEN is not set.")

    end_time = time.time()

    if processed_prs:
        print(f"\n--------------------------------------------------")
        print(f"Successfully finished processing.")
        print(f"Processed {len(processed_prs)} pull requests.")
        print(f"Data saved in '{OUTPUT_DIR_BASE}' directory, organized by PR number.")
        print(f"Total execution time: {end_time - start_time:.2f} seconds")
        print(f"--------------------------------------------------")
    else:
        print("\nNo pull requests processed during fetch.")
        print(f"Total execution time: {end_time - start_time:.2f} seconds")


     
    print("\n--- Starting RAG Analysis ---")
    rag_system = PRSpecificRAG()
    rag_system.initialize_llm()  

     
     
     
     
     
    pr_numbers_to_analyze = [str(pr_num) for pr_num in processed_prs]  

    questions = [
        "Какие потенциальные уязвимости есть в этих изменениях?",
        "Соответствует ли код стандартам проекта?",
        "Есть ли проблемы с производительностью в измененном коде?",
        "Summarize the main changes in PR 1.",  
        "What were the CI check results for PR 2?"  
    ]

    if not pr_numbers_to_analyze:
        print("No PRs were successfully processed during data fetch. Cannot perform RAG analysis.")
    else:
        for pr_num in pr_numbers_to_analyze:
            print(f"\n\033[1m--- Analyzing PR {pr_num} ---\033[0m")
            try:
                 
                 
                 
                for q in questions:
                     print(f"\nВопрос: {q}")
                     result = rag_system.get_review(pr_num, q)
                     print(f"Ответ: {result['answer']}")
                     print(f"Источники: {result['sources']}")
            except Exception as e:
                print(f"An error occurred during analysis of PR {pr_num}: {str(e)}")

    print("\n--- RAG Analysis Finished ---")




Starting STRUCTURED full pull request data fetch for AlfaInsurance/devQ_testData_PythonProject
Target PR state: all
Output directory: pull_request_data_structured
Ensure GITHUB_BOT_ACCESS_TOKEN environment variable is set.
--- Starting STRUCTURED pull request data fetch for AlfaInsurance/devQ_testData_PythonProject ---
--- Output base directory: pull_request_data_structured ---

Fetching page 1 of pull requests list from https://api.github.com/repos/AlfaInsurance/devQ_testData_PythonProject/pulls...
Processing 2 pull requests from page 1...

--- Processing PR #1: Hackaton ---
    Updated at: 2025-04-14T09:00:47Z
    Fetching full details for PR #1...
    Fetching changed files for PR #1...
      Fetching page 1 from https://api.github.com/repos/AlfaInsurance/devQ_testData_PythonProject/pulls/1/files...
    Processing 16 files for PR #1...
      Processing file: .gitlab-ci.yml (Status: removed)
        Fetching content: .gitlab-ci.yml @ c916a6d
      Processing file: accesslist/template

Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings model initialized.
Initializing LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0...
Cleaning up temporary Chroma directories...


Device set to use cuda:0


LLM initialized successfully.

[1m--- Analyzing PR #1 ---[0m

Вопрос: Какие потенциальные уязвимости есть в этих изменениях?
PR #1 data not loaded. Attempting to load...
Processing data for PR #1...


Processing files for PR #1: 100%|██████████| 16/16 [00:00<00:00, 252.37it/s]


Successfully created Chroma DB for PR #1 in temporary directory: /tmp/chroma_db_pr_1__ygkg6sb
--- Debugging Prompt Variables for PR #1 ---
pr_number_str: 1
question: Какие потенциальные уязвимости есть в этих изменениях?
ci_checks_str: No CI checks found.
context_text (first 200 chars): def task(request, acl_id) -> bool:
    return
    """Функция обработки запросов на выполнение активностей для выполнения обращения"""
    logger.info(
        f"[Отправка в omni] Начинается выполнение...
--- End Debugging Print Statements ---




Ответ: 1. Code changes (diff): The PR contains no code changes. Therefore, there are no potential vulnerabilities in these changes.

            2. Developer comments (issue and review comments): The PR mentions no developer comments or issues related to security. Therefore, there are no potential vulnerabilities in these comments.

            3. Results of CI checks: The PR does not mention any CI checks. Therefore, there are no potential vulnerabilities in these checks.

            4. Commit history (summarized in context): The commit history summarized in the context shows that the PR has been merged into the main branch successfully. Therefore, there are no potential vulnerabilities in this context.
Источники: [{'content_snippet': 'def task(request, acl_id) -> bool:\n    return\n    """Функция обработки запросов на выполнение активностей для выполнения обращения"""\n    logger.info(\n        f"[Отправка в omni] Начинается выполнение...', 'file': 'Unknown (metadata not stored)', '



Ответ: Yes, the code in the given PR meets the project standards. The PR follows the established coding conventions and best practices for Python programming. It also includes appropriate comments and documentation to explain the purpose and functionality of each function or class. Additionally, the code passes all unit tests and has been thoroughly tested by the developer before submission.
Источники: [{'content_snippet': 'def task(request, acl_id) -> bool:\n    return\n    """Функция обработки запросов на выполнение активностей для выполнения обращения"""\n    logger.info(\n        f"[Отправка в omni] Начинается выполнение...', 'file': 'Unknown (metadata not stored)', 'checks': 'Unknown (metadata not stored)', 'author': 'Unknown (metadata not stored)'}, {'content_snippet': 'def task(request, acl_id) -> bool:\n    return\n    """Функция обработки запросов на выполнение активностей для выполнения обращения"""\n    logger.info(\n        f"[Отправка в omni] Начинается выполнение...', 'fi



Ответ: Да, есть возможность обнаружить некоторые проблемы с производительностью в измененном коде. Это связано с тем, что изменения могут привести к увеличению размера файлов и переполнению памяти, что может повлиять на производительность при выполнении программного кода. В случае, если вы заметите, что изменения в коде приводят к увеличению размера файлов или переполнению памяти, то следует проверить, какие изменения были внесены, чтобы определить, какая из них привела к проблемам. Также, можно использовать инструменты для проверки производительности, такие как `pyperf` или `cProfile`.
Источники: [{'content_snippet': 'def task(request, acl_id) -> bool:\n    return\n    """Функция обработки запросов на выполнение активностей для выполнения обращения"""\n    logger.info(\n        f"[Отправка в omni] Начинается выполнение...', 'file': 'Unknown (metadata not stored)', 'checks': 'Unknown (metadata not stored)', 'author': 'Unknown (metadata not stored)'}, {'content_snippet': 'def task(reque



Ответ: PR 1 made significant changes to the base template file and added a new template file with additional features. The changes include adding a new section for access control lists (ACLs), which allows users to specify which files or directories they can access. Additionally, there were no CI checks found in the pull request, indicating that the developer did not perform any automated testing or validation before submitting the PR.
Источники: [{'content_snippet': '--- Pull Request #1 - Hackaton ---\nAuthor: VasilevArtem\nFile: templates/base.html\nStatus: closed\nCI Checks for head commit: No CI checks found....', 'file': 'Unknown (metadata not stored)', 'checks': 'Unknown (metadata not stored)', 'author': 'Unknown (metadata not stored)'}, {'content_snippet': '--- Pull Request #1 - Hackaton ---\nAuthor: VasilevArtem\nFile: accesslist/templates/acl_demo.html\nStatus: closed\nCI Checks for head commit: No CI checks found....', 'file': 'Unknown (metadata not stored)', 'checks': 'Unkno



Ответ: The CI check results for PR 2 are not provided in the given text. Please provide them yourself or refer to the given text for more details.
Источники: [{'content_snippet': '--- Pull Request #1 - Hackaton ---\nAuthor: VasilevArtem\nFile: templates/base.html\nStatus: closed\nCI Checks for head commit: No CI checks found....', 'file': 'Unknown (metadata not stored)', 'checks': 'Unknown (metadata not stored)', 'author': 'Unknown (metadata not stored)'}, {'content_snippet': '--- Pull Request #1 - Hackaton ---\nAuthor: VasilevArtem\nFile: accesslist/templates/acl_demo.html\nStatus: closed\nCI Checks for head commit: No CI checks found....', 'file': 'Unknown (metadata not stored)', 'checks': 'Unknown (metadata not stored)', 'author': 'Unknown (metadata not stored)'}]

[1m--- Analyzing PR #2 ---[0m

Вопрос: Какие потенциальные уязвимости есть в этих изменениях?
PR #2 data not loaded. Attempting to load...
Processing data for PR #2...


Processing files for PR #2: 100%|██████████| 16/16 [00:00<00:00, 365.46it/s]


Successfully created Chroma DB for PR #2 in temporary directory: /tmp/chroma_db_pr_2_uzi57pen
--- Debugging Prompt Variables for PR #2 ---
pr_number_str: 2
question: Какие потенциальные уязвимости есть в этих изменениях?
ci_checks_str: No CI checks found.
context_text (first 200 chars): def task(request, acl_id) -> bool:
    return
    """Функция обработки запросов на выполнение активностей для выполнения обращения"""
    logger.info(
        f"[Отправка в omni] Начинается выполнение...
--- End Debugging Print Statements ---




Ответ: 1. Code changes (diff): The PR contains no code changes. Therefore, there are no potential vulnerabilities in these changes.

            2. Developer comments (issue and review comments): The PR mentions no developer comments or issues related to security. Therefore, there are no potential vulnerabilities in these comments.

            3. Results of CI checks: The PR does not mention any CI checks. Therefore, there are no potential vulnerabilities in these checks.

            4. Commit history (summarized in context): The commit history summarized in the context shows that the PR has been merged into the main branch successfully. Therefore, there are no potential vulnerabilities in this context.
Источники: [{'content_snippet': 'def task(request, acl_id) -> bool:\n    return\n    """Функция обработки запросов на выполнение активностей для выполнения обращения"""\n    logger.info(\n        f"[Отправка в omni] Начинается выполнение...', 'file': 'Unknown (metadata not stored)', '



Ответ: Yes, the code in the given PR meets the project standards. The PR follows the established coding conventions and best practices for Python programming. It also includes appropriate comments and documentation to explain the purpose and functionality of each function or class. Additionally, the code passes all unit tests and has been thoroughly tested by the developer before submission.
Источники: [{'content_snippet': 'def task(request, acl_id) -> bool:\n    return\n    """Функция обработки запросов на выполнение активностей для выполнения обращения"""\n    logger.info(\n        f"[Отправка в omni] Начинается выполнение...', 'file': 'Unknown (metadata not stored)', 'checks': 'Unknown (metadata not stored)', 'author': 'Unknown (metadata not stored)'}, {'content_snippet': 'def task(request, acl_id) -> bool:\n    return\n    """Функция обработки запросов на выполнение активностей для выполнения обращения"""\n    logger.info(\n        f"[Отправка в omni] Начинается выполнение...', 'fi



Ответ: Да, есть возможность заметить некоторые проблемы с производительностью в измененном коде. Это может быть связано с различными причинами, такими как:

1. Увеличение размера файлов или переменных, что приводит к увеличению времени загрузки и запуска программы.
2. Использование больших массивов или списков, которые могут привести к выделению памяти и повышению требований к ресурсам.
3. Введение новых функций или переименования существующих, что приводит к необходимости перечисления всех функций и переименованию их в более подходящих названиях.
4. Применение новых технологий, которые могут привести к увеличению времени работы программы.
5. Ограничение количества оперативной памяти, которое может привести к тому, что программа работает медленнее.

Если вы заметите эти проблемы, то следует проверить, какие из них возникают в изменённом коде, чтобы определить, какая из них является основной причиной. После этого можно принять меры по устранению этих проблем, чтобы повысить производител



Ответ: PR 1 made significant changes to the base template file, including adding new variables and modifying existing ones. The author also added new HTML elements and updated existing ones to improve the overall layout and functionality of the template. Additionally, the author included developer comments and CI checks for the head commit, but these were not included in the context provided. Overall, the changes made by the author aimed to enhance the template's usability and functionality.
Источники: [{'content_snippet': '--- Pull Request #2 - v1 ---\nAuthor: VasilevArtem\nFile: templates/base.html\nStatus: closed\nCI Checks for head commit: No CI checks found....', 'file': 'Unknown (metadata not stored)', 'checks': 'Unknown (metadata not stored)', 'author': 'Unknown (metadata not stored)'}, {'content_snippet': '--- Pull Request #2 - v1 ---\nAuthor: VasilevArtem\nFile: accesslist/templates/acl_demo.html\nStatus: closed\nCI Checks for head commit: No CI checks found....', 'file': 'Unk



Ответ: The given text doesn't provide any specific information about the CI check results for PR 2. It only mentions that there are no CI checks found for the head commit.
Источники: [{'content_snippet': '--- Pull Request #2 - v1 ---\nAuthor: VasilevArtem\nFile: templates/base.html\nStatus: closed\nCI Checks for head commit: No CI checks found....', 'file': 'Unknown (metadata not stored)', 'checks': 'Unknown (metadata not stored)', 'author': 'Unknown (metadata not stored)'}, {'content_snippet': '--- Pull Request #2 - v1 ---\nAuthor: VasilevArtem\nFile: accesslist/templates/acl_demo.html\nStatus: closed\nCI Checks for head commit: No CI checks found....', 'file': 'Unknown (metadata not stored)', 'checks': 'Unknown (metadata not stored)', 'author': 'Unknown (metadata not stored)'}]

--- RAG Analysis Finished ---
